Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Replace torchaudio with pydub #381

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions bolna/helpers/analytics_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import os
from datetime import datetime, timezone
from dotenv import load_dotenv
from dateutil import parser
import copy
from .utils import format_messages
from .logger_config import configure_logger
Expand Down Expand Up @@ -80,7 +79,7 @@ def update_execution_details(current_high_level_assistant_analytics_data, run_de

def update_historical_values(arr, current_run_val, last_updated_at, should_increment, multiplier = 0, interval_minutes=1440):
now = datetime.now(timezone.utc)
last_updated_datetime = parser.isoparse(last_updated_at)
last_updated_datetime = datetime.fromisoformat(last_updated_at)
difference_in_minutes = (now - last_updated_datetime).total_seconds() / 60

if not arr or len(arr) == 0:
Expand Down
113 changes: 50 additions & 63 deletions bolna/helpers/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import datetime
import json
import asyncio
import math
import re
import copy
import hashlib
Expand All @@ -11,9 +10,6 @@
import wave
import numpy as np
import aiofiles
import torch
import torchaudio
from scipy.io import wavfile
from botocore.exceptions import BotoCoreError, ClientError
from aiobotocore.session import AioSession
from contextlib import AsyncExitStack
Expand Down Expand Up @@ -90,12 +86,9 @@ def float32_to_int16(float_audio):

def wav_bytes_to_pcm(wav_bytes):
wav_buffer = io.BytesIO(wav_bytes)
rate, data = wavfile.read(wav_buffer)
if data.dtype == np.int16:
return data.tobytes()
if data.dtype == np.float32:
data = float32_to_int16(data)
return data.tobytes()
audio_segment = AudioSegment.from_file(wav_buffer, format="wav")
pcm_data = audio_segment.raw_data
return pcm_data


# def wav_bytes_to_pcm(wav_bytes):
Expand Down Expand Up @@ -337,15 +330,18 @@ def yield_chunks_from_memory(audio_bytes, chunk_size=512):
yield audio_bytes[i:i + chunk_size]


def pcm_to_wav_bytes(pcm_data, sample_rate = 16000, num_channels = 1, sample_width = 2):
buffer = io.BytesIO()
bit_depth = 16
if len(pcm_data)%2 == 1:
def pcm_to_wav_bytes(pcm_data, sample_rate=16000, num_channels=1, sample_width=2):
if len(pcm_data) % 2 == 1:
pcm_data += b'\x00'
tensor_pcm = torch.frombuffer(pcm_data, dtype=torch.int16)
tensor_pcm = tensor_pcm.float() / (2**(bit_depth - 1))
tensor_pcm = tensor_pcm.unsqueeze(0)
torchaudio.save(buffer, tensor_pcm, sample_rate, format='wav')
audio_segment = AudioSegment(
data=pcm_data,
sample_width=sample_width,
frame_rate=sample_rate,
channels=num_channels
)
buffer = io.BytesIO()
audio_segment.export(buffer, format="wav")

return buffer.getvalue()


Expand All @@ -359,16 +355,16 @@ def convert_audio_to_wav(audio_bytes, source_format = 'flac'):
return buffer.getvalue()


def resample(audio_bytes, target_sample_rate, format = "mp3"):
def resample(audio_bytes, target_sample_rate, format="mp3"):
audio_buffer = io.BytesIO(audio_bytes)
waveform, orig_sample_rate = torchaudio.load(audio_buffer, format = format)
audio_segment = AudioSegment.from_file(audio_buffer, format=format)
orig_sample_rate = audio_segment.frame_rate
if orig_sample_rate == target_sample_rate:
return audio_bytes
resampler = torchaudio.transforms.Resample(orig_sample_rate, target_sample_rate)
audio_waveform = resampler(waveform)
audio_buffer = io.BytesIO()
logger.info(f"Resampling from {orig_sample_rate} to {target_sample_rate}")
torchaudio.save(audio_buffer, audio_waveform, target_sample_rate, format="wav")
resampled_audio = audio_segment.set_frame_rate(target_sample_rate)
audio_buffer = io.BytesIO()
resampled_audio.export(audio_buffer, format="wav")
return audio_buffer.getvalue()


Expand Down Expand Up @@ -450,61 +446,52 @@ async def write_request_logs(message, run_id):
else:
await log_file.write(log_string)

async def save_audio_file_to_s3(conversation_recording, sampling_rate = 24000, assistant_id = None, run_id = None):
async def save_audio_file_to_s3(conversation_recording, sampling_rate=24000, assistant_id=None, run_id=None):
last_frame_end_time = conversation_recording['output'][0]['start_time']
logger.info(f"LENGTH OF OUTPUT AUDIO {len(conversation_recording['output'])}")
initial_gap = (last_frame_end_time - conversation_recording["metadata"]["started"] ) *1000
initial_gap = (last_frame_end_time - conversation_recording["metadata"]["started"]) * 1000
logger.info(f"Initial gap {initial_gap}")

combined_audio = AudioSegment.silent(duration=initial_gap, frame_rate=sampling_rate)

for i, frame in enumerate(conversation_recording['output']):
frame_start_time = frame['start_time']
logger.info(f"Processing frame {i}, fram start time = {last_frame_end_time}, frame start time= {frame_start_time}")
frame_start_time = frame['start_time']
logger.info(f"Processing frame {i}, frame start time = {last_frame_end_time}, frame start time = {frame_start_time}")

if last_frame_end_time < frame_start_time:
gap_duration_samples = frame_start_time - last_frame_end_time
silence = AudioSegment.silent(duration=gap_duration_samples*1000, frame_rate=sampling_rate)
silence = AudioSegment.silent(duration=gap_duration_samples * 1000, frame_rate=sampling_rate)
combined_audio += silence

last_frame_end_time = frame_start_time + frame['duration']
frame_as = AudioSegment.from_file(io.BytesIO(frame['data']), format = "wav")
combined_audio +=frame_as

webm_segment = AudioSegment.from_file(io.BytesIO(conversation_recording['input']["data"]))
wav_bytes = io.BytesIO()
webm_segment.export(wav_bytes, format="wav")
wav_bytes.seek(0) # Reset the pointer to the start
waveform, sample_rate = torchaudio.load(wav_bytes)
resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=sampling_rate)
downsampled_waveform = resampler(waveform)
torchaudio_wavio = io.BytesIO()
torchaudio.save(torchaudio_wavio, downsampled_waveform, sampling_rate, format= "wav")
frame_as = AudioSegment.from_file(io.BytesIO(frame['data']), format="wav")
combined_audio += frame_as

webm_segment = AudioSegment.from_file(io.BytesIO(conversation_recording['input']["data"]), format="webm")
webm_segment = webm_segment.set_frame_rate(sampling_rate)

audio_segment_bytes = io.BytesIO()
combined_audio = combined_audio.set_frame_rate(sampling_rate)
combined_audio.export(audio_segment_bytes, format="wav")
audio_segment_bytes.seek(0)
waveform_audio_segment, sample_rate = torchaudio.load(audio_segment_bytes)

if waveform_audio_segment.shape[0] > 1:
waveform_audio_segment = waveform_audio_segment[:1, :]

# Adjust shapes to be [1, N] if not already
downsampled_waveform = downsampled_waveform.unsqueeze(0) if downsampled_waveform.dim() == 1 else downsampled_waveform
waveform_audio_segment = waveform_audio_segment.unsqueeze(0) if waveform_audio_segment.dim() == 1 else waveform_audio_segment

# Ensure both waveforms have the same length
max_length = max(downsampled_waveform.size(1), waveform_audio_segment.size(1))
downsampled_waveform_padded = torch.nn.functional.pad(downsampled_waveform, (0, max_length - downsampled_waveform.size(1)))
waveform_audio_segment_padded = torch.nn.functional.pad(waveform_audio_segment, (0, max_length - waveform_audio_segment.size(1)))
stereo_waveform = torch.cat((downsampled_waveform_padded, waveform_audio_segment_padded), 0)

# Verify the stereo waveform shape is [2, M]
assert stereo_waveform.shape[0] == 2, "Stereo waveform should have 2 channels."
key = f'{assistant_id + run_id.split("#")[1]}.wav'


combined_audio_segment = AudioSegment.from_file(audio_segment_bytes, format="wav")
combined_audio_segment = combined_audio_segment.set_channels(1)

if len(webm_segment) > len(combined_audio_segment):
combined_audio_segment = combined_audio_segment + AudioSegment.silent(duration=len(webm_segment) - len(combined_audio_segment))
elif len(webm_segment) < len(combined_audio_segment):
webm_segment = webm_segment + AudioSegment.silent(duration=len(combined_audio_segment) - len(webm_segment))
webm_segment = webm_segment.set_channels(1)
combined_audio_segment = combined_audio_segment.set_channels(1)
stereo_audio_segment = webm_segment.overlay(combined_audio_segment)
audio_buffer = io.BytesIO()
torchaudio.save(audio_buffer, stereo_waveform, 24000, format="wav")
stereo_audio_segment.export(audio_buffer, format="wav")
audio_buffer.seek(0)

key = f'{assistant_id + run_id.split("#")[1]}.wav'
logger.info(f"Storing in {RECORDING_BUCKET_URL}{key}")
await store_file(bucket_name=RECORDING_BUCKET_NAME, file_key=key, file_data=audio_buffer, content_type="wav")

await store_file(bucket_name=RECORDING_BUCKET_NAME, file_key=key, file_data=audio_buffer, content_type="audio/wav")
return f'{RECORDING_BUCKET_URL}{key}'

def list_number_of_wav_files_in_directory(directory):
Expand Down
1 change: 0 additions & 1 deletion bolna/helpers/vad.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import subprocess
import requests
import torch
import numpy as np
Expand Down
11 changes: 3 additions & 8 deletions bolna/memory/cache/vector_cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@

from bolna.helpers.logger_config import configure_logger
from bolna.memory.cache.base_cache import BaseCache
from typing import List
import numpy as np
from fastembed import TextEmbedding
from sentence_transformers import util
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

logger = configure_logger(__name__)

Expand All @@ -23,11 +20,9 @@ def set(self, documents):
)

def __get_top_cosine_similarity_doc(self, query_embedding):
#util.pytorch_cos_sim(self.embeddings, query_embedding)
# scores = np.dot(self.embeddings, query_embedding)
# sorted_scores = np.argsort(scores)[::-1]

similarities = cosine_similarity([query_embedding], self.embeddings)[0]
query_norm = query_embedding / np.linalg.norm(query_embedding)
embeddings_norm = self.embeddings / np.linalg.norm(self.embeddings, axis=1)[:, np.newaxis]
similarities = np.dot(embeddings_norm, query_norm)
most_similar_index = np.argmax(similarities)
return self.documents[most_similar_index]

Expand Down
2 changes: 1 addition & 1 deletion bolna/synthesizer/azure_synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from dotenv import load_dotenv
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample, wav_bytes_to_pcm
from bolna.helpers.utils import create_ws_data_packet, wav_bytes_to_pcm
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer
import azure.cognitiveservices.speech as speechsdk
Expand Down
11 changes: 5 additions & 6 deletions bolna/synthesizer/base_synthesizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import io
import torchaudio
from bolna.helpers.logger_config import configure_logger
import asyncio
from pydub import AudioSegment

logger = configure_logger(__name__)

Expand Down Expand Up @@ -29,12 +29,11 @@ def get_synthesized_characters(self):
return 0

def resample(self, audio_bytes):
audio_buffer = io.BytesIO(audio_bytes)
waveform, orig_sample_rate = torchaudio.load(audio_buffer)
resampler = torchaudio.transforms.Resample(orig_sample_rate, 8000)
audio_waveform = resampler(waveform)
audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))
audio_segment = audio_segment.set_frame_rate(8000)
audio_segment = audio_segment.set_channels(1)
audio_buffer = io.BytesIO()
torchaudio.save(audio_buffer, audio_waveform, 8000, format="wav")
audio_segment.export(audio_buffer, format="wav")
audio_buffer.seek(0)
audio_data = audio_buffer.read()
return audio_data
Expand Down
2 changes: 1 addition & 1 deletion bolna/synthesizer/elevenlabs_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, resample


logger = configure_logger(__name__)
Expand Down
2 changes: 1 addition & 1 deletion bolna/synthesizer/melo_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
from dotenv import load_dotenv
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet, resample, wav_bytes_to_pcm
from bolna.helpers.utils import create_ws_data_packet, wav_bytes_to_pcm
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer
import json
Expand Down
2 changes: 1 addition & 1 deletion bolna/synthesizer/openai_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
from dotenv import load_dotenv
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, resample
from .base_synthesizer import BaseSynthesizer
from openai import AsyncOpenAI
import io
Expand Down
4 changes: 2 additions & 2 deletions bolna/synthesizer/polly_synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from aiobotocore.session import AioSession
from contextlib import AsyncExitStack
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet, pcm_to_wav_bytes, resample
from bolna.helpers.utils import convert_audio_to_wav, create_ws_data_packet
from bolna.memory.cache.inmemory_scalar_cache import InmemoryScalarCache
from .base_synthesizer import BaseSynthesizer

Expand All @@ -14,7 +14,7 @@

class PollySynthesizer(BaseSynthesizer):
def __init__(self, voice, language, audio_format="pcm", sampling_rate=8000, stream=False, engine="neural",
buffer_size=400, speaking_rate = "100%", volume = "0dB", caching= True, **kwargs):
buffer_size=400, speaking_rate="100%", volume="0dB", caching=True, **kwargs):
super().__init__(stream, buffer_size)
self.engine = engine
self.format = self.get_format(audio_format.lower())
Expand Down
4 changes: 0 additions & 4 deletions bolna/transcriber/bodhi_transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from audioop import ulaw2lin
import traceback
import uuid
import numpy as np
import torch
import websockets
import os
import json
Expand All @@ -14,9 +12,7 @@
from .base_transcriber import BaseTranscriber
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet
import ssl

torch.set_num_threads(1)

logger = configure_logger(__name__)
load_dotenv()
Expand Down
3 changes: 0 additions & 3 deletions bolna/transcriber/deepgram_transcriber.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import asyncio
import traceback
import numpy as np
import torch
import websockets
import os
import json
Expand All @@ -13,8 +12,6 @@
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet

torch.set_num_threads(1)

logger = configure_logger(__name__)
load_dotenv()

Expand Down
9 changes: 1 addition & 8 deletions bolna/transcriber/whisper_transcriber.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,19 @@
import asyncio
# from asyncio.base_tasks import tasks
import traceback
import numpy as np
import torch
import websockets
import os
import json
import time
from .base_transcriber import BaseTranscriber
from bolna.helpers.logger_config import configure_logger
from bolna.helpers.utils import create_ws_data_packet, int2float
from bolna.helpers.vad import VAD
from bolna.helpers.utils import create_ws_data_packet
from audioop import ulaw2lin, ratecv
import json
import os
import time
from queue import Queue
from websockets.exceptions import *

import uvloop
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
torch.set_num_threads(1)

logger = configure_logger(__name__)

Expand Down
Loading