From c8f5bce9700e58b11501b8fb9b7990a7dd88e414 Mon Sep 17 00:00:00 2001
From: younesselrag
Date: Mon, 9 Sep 2024 03:16:36 +0100
Subject: [PATCH] Dockerize the server side

---
 DockerFile              | 13 +++++++++
 DockerFile.fastapi.cuda | 21 +++++++++++++
 README.md               | 65 ++++-------------------------------
 3 files changed, 39 insertions(+), 60 deletions(-)
 create mode 100644 DockerFile.fastapi.cuda

diff --git a/DockerFile b/DockerFile
index e69de29..7ddb4de 100644
--- a/DockerFile
+++ b/DockerFile
@@ -0,0 +1,13 @@
+FROM pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel
+
+ENV PYTHONUNBUFFERED 1
+
+WORKDIR /usr/src/app
+
+# Install packages
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
\ No newline at end of file
diff --git a/DockerFile.fastapi.cuda b/DockerFile.fastapi.cuda
new file mode 100644
index 0000000..1a2a6ed
--- /dev/null
+++ b/DockerFile.fastapi.cuda
@@ -0,0 +1,21 @@
+# Copyright (C) 2024 Youness Elbrag
+
+FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+
+RUN apt-get update && apt-get install -y software-properties-common
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y python3.10 python3.10-venv python3-pip ffmpeg
+
+WORKDIR /Server-fastapi
+COPY . .
+
+RUN python3.10 -m venv venv
+RUN /bin/bash -c "source venv/bin/activate && pip install --upgrade pip"
+RUN /bin/bash -c "source venv/bin/activate && pip install -e ."
+RUN /bin/bash -c "source venv/bin/activate && pip install -r fastapi/requirements-fastapi-cuda.txt"
+
+WORKDIR /Server-fastapi/backend
+
+EXPOSE 8000
+
+CMD ["../venv/bin/uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
\ No newline at end of file
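The two images above target different bases: a PyTorch/CUDA 12.1 devel image for the pipeline, and a lighter CUDA 11.8 runtime image that serves the FastAPI backend. A minimal build-and-run sketch follows; the image tags and the repo root as build context are illustrative assumptions, and `--gpus all` presumes the NVIDIA Container Toolkit setup covered in the README changes below.

```bash
# Build the pipeline image (the file is named "DockerFile", so -f is required).
docker build -f DockerFile -t speech-ora:latest .

# Build the FastAPI server image; its CMD runs uvicorn on port 8000 (see EXPOSE above).
docker build -f DockerFile.fastapi.cuda -t speech-ora-fastapi:cuda .

# Run the server with GPU access, publishing port 8000 on the host.
docker run --rm --gpus all -p 8000:8000 speech-ora-fastapi:cuda
```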
diff --git a/README.md b/README.md
index 2bb715c..ace185a 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,20 @@
 # 🎤 Real-Time Speech-to-Speech (S2S) Conversion using Hugging Face Models
-**Speech-Ora** is an open-source web app for real-time speech-to-speech conversion, leveraging state-of-the-art models from the Hugging Face ecosystem to create interactions similar to GPT-like conversations.
-
-## 📖 Table of Contents
-* [Overview](#overview)
-  - [System Structure](#system-structure)
-  - [Modularity](#modularity)
-* [Setup Instructions](#setup-instructions)
-* [Usage](#usage)
-  - [Server/Client Approach](#serverclient-approach)
-  - [Local Approach](#local-approach)
-* [Command-Line Usage](#command-line-usage)
-  - [Model Parameters](#model-parameters)
-  - [Generation Parameters](#generation-parameters)
-  - [Key Parameters](#key-parameters)
-* [Docker Setup](#docker-setup)
+**Speech-OraAgent** is an open-source, agent-driven backend for real-time speech-to-speech conversion, leveraging state-of-the-art models from the Hugging Face ecosystem to create interactions similar to GPT-like conversations.
+
 ## Overview
 ### System Structure
 This project implements a speech-to-speech server-side/client-side backend, exposed through APIs and built from the following open-source components:
 1. **Voice Activity Detection (VAD)**: Powered by [Silero VAD v5](https://github.com/snakers4/silero-vad).
 2. **Speech to Text (STT)**: Uses Whisper models from the Hugging Face hub.
 3. **Language Model (LM)**: Any Hugging Face instruct model can be used.
 4. **Text to Speech (TTS)**: Uses [Parler-TTS](https://github.com/huggingface/parler-tts) for speech synthesis.
+**Note**: you can load a different open-source model, or use provider-hosted models via their API keys.
+
 ### Modularity
 The pipeline is modular and flexible, allowing customization at each stage:
 - **VAD**: Integrates the [Silero VAD](https://github.com/snakers4/silero-vad).
@@ -48,48 +37,6 @@ All components are implemented as independent classes for easy customization.
 pip install -r requirements.txt
 ```
-## Usage
-
-The pipeline supports two primary modes of operation:
-1. **Server/Client Approach**: Models run on a server, and audio input/output are streamed between client and server.
-2. **Local Approach**: Uses a client-server architecture on the same machine using the loopback address.
-
-### Server/Client Approach
-
-1. Start the pipeline on the server:
-
-
-2. Run the client to send audio input and receive the generated audio:
-
-
-### Local Approach
-
-Run the pipeline locally by using the loopback address:
-
-
-## Command-Line Usage
-
-The pipeline includes customizable parameters for running the models.
-
-### Model Parameters
-
-- VAD Model: The voice activity detection model.
-- STT Model: Whisper model checkpoint from Hugging Face.
-- LM Model: Any Hugging Face instruct model.
-
-### Generation Parameters
-
-- Sampling Rate: Adjustable sampling rate for audio.
-- Chunk Size: Chunk size for audio processing.
-
-
-
-## Docker Setup
-
-To run the pipeline in a Docker container with GPU access, follow these steps:
-
-
-
 ### Setup the Environment
 
 Set up GPU access for Docker; the steps below configure the container runtime so the full project image can run inside a container.
 
 ```bash
 sudo apt-get install -y nvidia-container-toolkit
 ```
 ```bash
 sudo nvidia-ctk runtime configure --runtime=docker
 ```
-Now you're ready to run the project in a container with GPU support.
-
-### Build and Run the Docker Container
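After `nvidia-ctk runtime configure`, the Docker daemon usually needs a restart before `--gpus` takes effect; a quick sanity check is to run `nvidia-smi` in a stock CUDA container. A sketch, assuming the same CUDA 11.8 base family as the FastAPI image:

```bash
# Restart Docker so the NVIDIA runtime configuration is picked up.
sudo systemctl restart docker

# Confirm that containers can see the GPU.
docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu20.04 nvidia-smi
```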