Merge pull request #442 from roedoejet/dev.ap/demo-gradio
Dev.ap/demo gradio
roedoejet authored May 29, 2024
2 parents de413a7 + 05b5dab commit 6b4f601
Showing 7 changed files with 167 additions and 7 deletions.
49 changes: 49 additions & 0 deletions everyvoice/cli.py
@@ -241,6 +241,55 @@ def test(suite: TestSuites = typer.Argument(TestSuites.dev)):
SCHEMAS_TO_OUTPUT: dict[str, Any] = {} # dict[str, type[BaseModel]]


@app.command()
def demo(
    text_to_spec_model: Path = typer.Argument(
        ...,
        file_okay=True,
        exists=True,
        dir_okay=False,
        help="The path to a trained text-to-spec EveryVoice model.",
        autocompletion=complete_path,
    ),
    spec_to_wav_model: Path = typer.Argument(
        ...,
        help="The path to a trained vocoder.",
        dir_okay=False,
        file_okay=True,
        autocompletion=complete_path,
    ),
    language: str = typer.Argument(
        ...,
        help="Specify which language to use in a multilingual system. TODO: let this be selectable in app",
    ),
    speaker: str = typer.Argument(
        ...,
        help="Specify which speaker to use in a multispeaker system. TODO: let this be selectable in app",
    ),
    output_dir: Path = typer.Option(
        "synthesis_output",
        "--output-dir",
        "-o",
        file_okay=False,
        dir_okay=True,
        help="The directory where your synthesized audio should be written",
        autocompletion=complete_path,
    ),
    accelerator: str = typer.Option("auto", "--accelerator", "-a"),
):
    from everyvoice.demo.app import create_demo_app

    demo = create_demo_app(
        text_to_spec_model_path=text_to_spec_model,
        spec_to_wav_model_path=spec_to_wav_model,
        language=language,
        speaker=speaker,
        output_dir=output_dir,
        accelerator=accelerator,
    )
    demo.launch()


@app.command(hidden=True)
def update_schemas(
    out_dir: Path = typer.Option(
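
The new demo subcommand above simply forwards its arguments to create_demo_app and starts the Gradio server. A hedged usage sketch, not part of this commit: it assumes the Typer application object app is importable from everyvoice/cli.py and that the installed console script is named everyvoice; the checkpoint paths, language, and speaker IDs are placeholders.

# Sketch only: inspect and invoke the new `demo` subcommand.
from typer.testing import CliRunner

from everyvoice.cli import app  # assumption: `app` is the Typer application in everyvoice/cli.py

runner = CliRunner()

# Safe to run anywhere: print the generated help for the new subcommand.
print(runner.invoke(app, ["demo", "--help"]).output)

# Equivalent shell invocation once real checkpoints exist (blocks while the
# Gradio server is running); all four positional values are placeholders:
#   everyvoice demo <text-to-spec.ckpt> <vocoder.ckpt> <language> <speaker> -o synthesis_output -a auto
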
11 changes: 6 additions & 5 deletions everyvoice/dataloader/__init__.py
@@ -1,4 +1,5 @@
import os
from pathlib import Path
from typing import Callable, Optional, Union

import pytorch_lightning as pl
@@ -18,11 +19,16 @@ def __init__(
        config: Union[
            AlignerConfig, VocoderConfig, FeaturePredictionConfig, EveryVoiceConfig
        ],
        inference_output_dir: Optional[Path] = None,
    ):
        super().__init__()
        self.collate_fn: Union[Callable, None] = None
        self.config = config
        self.use_weighted_sampler = False
        self.inference_output_dir = inference_output_dir
        if self.inference_output_dir is not None:
            self.inference_output_dir.mkdir(exist_ok=True, parents=True)
            self.predict_path = self.inference_output_dir / "latest_predict_data.pth"
        self.train_path = os.path.join(
            self.config.training.logger.save_dir,
            self.config.training.logger.name,
@@ -33,11 +39,6 @@ def __init__(
            self.config.training.logger.name,
            "val_data.pth",
        )
        self.predict_path = os.path.join(
            self.config.training.logger.save_dir,
            self.config.training.logger.name,
            "latest_predict_data.pth",
        )

    def setup(self, stage: Optional[str] = None):
        # load it back here
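
The net effect of this change: when a data module is given an inference_output_dir, the predictions file moves out of the training logger's directory and into that caller-chosen location (the demo app passes its --output-dir). A standalone sketch of the relocation, using hypothetical directory names, not EveryVoice code:

# Standalone illustration of the path change above; "logs" and "MyExperiment"
# stand in for the logger's save_dir and name.
import os
from pathlib import Path

# Before: predictions were always written next to the training artifacts.
old_predict_path = os.path.join("logs", "MyExperiment", "latest_predict_data.pth")

# After: callers such as the demo app supply an inference output directory.
inference_output_dir = Path("synthesis_output")
inference_output_dir.mkdir(exist_ok=True, parents=True)
new_predict_path = inference_output_dir / "latest_predict_data.pth"

print(old_predict_path, "->", new_predict_path)
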
Empty file added everyvoice/demo/__init__.py
Empty file.
109 changes: 109 additions & 0 deletions everyvoice/demo/app.py
@@ -0,0 +1,109 @@
import os
from functools import partial

import gradio as gr
import torch

from everyvoice.config.type_definitions import TargetTrainingTextRepresentationLevel
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.cli.synthesize import (
    synthesize_helper,
)
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.model import (
    FastSpeech2,
)
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.prediction_writing_callback import (
    PredictionWritingWavCallback,
)
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.type_definitions import (
    SynthesizeOutputFormats,
)
from everyvoice.model.vocoder.HiFiGAN_iSTFT_lightning.hfgl.utils import (
    load_hifigan_from_checkpoint,
)
from everyvoice.utils.heavy import get_device_from_accelerator

os.environ["no_proxy"] = "localhost,127.0.0.1,::1"


def synthesize_audio(
    text,
    duration_control,
    text_to_spec_model,
    vocoder_model,
    vocoder_config,
    accelerator,
    device,
    language=None,
    speaker=None,
    output_dir=None,
):
    config, device, predictions = synthesize_helper(
        model=text_to_spec_model,
        vocoder_model=vocoder_model,
        vocoder_config=vocoder_config,
        texts=[text],
        language=language,
        accelerator=accelerator,
        devices="1",
        device=device,
        global_step=1,
        output_type=[],
        text_representation=TargetTrainingTextRepresentationLevel.characters,
        output_dir=output_dir,
        speaker=speaker,
        duration_control=duration_control,
        filelist=None,
        teacher_forcing_directory=None,
        batch_size=1,
        num_workers=1,
    )
    output_key = "postnet_output" if text_to_spec_model.config.model.use_postnet else "output"
    wav_writer = PredictionWritingWavCallback(
        output_dir=output_dir,
        config=config,
        output_key=output_key,
        device=device,
        global_step=1,
        vocoder_model=vocoder_model,
        vocoder_config=vocoder_config,
    )
    # move to device because lightning accumulates predictions on cpu
    predictions[0][output_key] = predictions[0][output_key].to(device)
    wav, sr = wav_writer.synthesize_audio(predictions[0])
    return sr, wav[0]


def create_demo_app(
    text_to_spec_model_path,
    spec_to_wav_model_path,
    language,
    speaker,
    output_dir,
    accelerator,
) -> gr.Interface:
    device = get_device_from_accelerator(accelerator)
    vocoder_ckpt = torch.load(spec_to_wav_model_path, map_location=device)
    vocoder_model, vocoder_config = load_hifigan_from_checkpoint(vocoder_ckpt, device)
    model: FastSpeech2 = FastSpeech2.load_from_checkpoint(text_to_spec_model_path).to(
        device
    )
    model.eval()
    return gr.Interface(
        partial(
            synthesize_audio,
            text_to_spec_model=model,
            vocoder_model=vocoder_model,
            vocoder_config=vocoder_config,
            language=language,
            speaker=speaker,
            output_dir=output_dir,
            accelerator=accelerator,
            device=device,
        ),
        [
            "textbox",
            gr.Slider(0.75, 1.75, 1.0, step=0.25),
        ],
        gr.Audio(format="mp3"),
        title="EveryVoice Demo",
    )
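
For completeness, a sketch of launching the demo programmatically, mirroring what the demo CLI command does; the checkpoint paths, language, and speaker IDs below are hypothetical placeholders and must match what the checkpoints were actually trained with.

# Sketch only: drive create_demo_app directly instead of via the CLI.
from pathlib import Path

from everyvoice.demo.app import create_demo_app

demo = create_demo_app(
    text_to_spec_model_path=Path("logs/feature_prediction/last.ckpt"),  # placeholder path
    spec_to_wav_model_path=Path("logs/vocoder/last.ckpt"),  # placeholder path
    language="eng",  # placeholder: a language ID the model was trained on
    speaker="default",  # placeholder: a speaker ID the model was trained on
    output_dir=Path("synthesis_output"),
    accelerator="auto",
)
demo.launch()  # serves the Gradio UI; e.g. demo.launch(server_port=7860) to pin the port
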
2 changes: 1 addition & 1 deletion everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,6 +3,7 @@ deepdiff>=6.5.0
anytree>=2.8.0
einops==0.5.0
g2p~=2.0.0
gradio>=4.12.0
grapheme>=0.6.0
ipatok>=0.4.1
librosa==0.9.2
