diff --git a/everyvoice/cli.py b/everyvoice/cli.py
index 2317f1c5..cd04ecfc 100644
--- a/everyvoice/cli.py
+++ b/everyvoice/cli.py
@@ -241,6 +241,55 @@ def test(suite: TestSuites = typer.Argument(TestSuites.dev)):
 SCHEMAS_TO_OUTPUT: dict[str, Any] = {}  # dict[str, type[BaseModel]]
 
 
+@app.command()
+def demo(
+    text_to_spec_model: Path = typer.Argument(
+        ...,
+        file_okay=True,
+        exists=True,
+        dir_okay=False,
+        help="The path to a trained text-to-spec EveryVoice model.",
+        autocompletion=complete_path,
+    ),
+    spec_to_wav_model: Path = typer.Argument(
+        ...,
+        help="The path to a trained vocoder.",
+        dir_okay=False,
+        file_okay=True,
+        autocompletion=complete_path,
+    ),
+    language: str = typer.Argument(
+        ...,
+        help="Specify which language to use in a multilingual system. TODO: let this be selectable in app",
+    ),
+    speaker: str = typer.Argument(
+        ...,
+        help="Specify which speaker to use in a multispeaker system. TODO: let this be selectable in app",
+    ),
+    output_dir: Path = typer.Option(
+        "synthesis_output",
+        "--output-dir",
+        "-o",
+        file_okay=False,
+        dir_okay=True,
+        help="The directory where your synthesized audio should be written",
+        autocompletion=complete_path,
+    ),
+    accelerator: str = typer.Option("auto", "--accelerator", "-a"),
+):
+    from everyvoice.demo.app import create_demo_app
+
+    demo = create_demo_app(
+        text_to_spec_model_path=text_to_spec_model,
+        spec_to_wav_model_path=spec_to_wav_model,
+        language=language,
+        speaker=speaker,
+        output_dir=output_dir,
+        accelerator=accelerator,
+    )
+    demo.launch()
+
+
 @app.command(hidden=True)
 def update_schemas(
     out_dir: Path = typer.Option(
diff --git a/everyvoice/dataloader/__init__.py b/everyvoice/dataloader/__init__.py
index 4b114b24..606a446e 100644
--- a/everyvoice/dataloader/__init__.py
+++ b/everyvoice/dataloader/__init__.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 from typing import Callable, Optional, Union
 
 import pytorch_lightning as pl
@@ -18,11 +19,16 @@ def __init__(
         config: Union[
             AlignerConfig, VocoderConfig, FeaturePredictionConfig, EveryVoiceConfig
         ],
+        inference_output_dir: Optional[Path] = None,
     ):
         super().__init__()
         self.collate_fn: Union[Callable, None] = None
         self.config = config
         self.use_weighted_sampler = False
+        self.inference_output_dir = inference_output_dir
+        if self.inference_output_dir is not None:
+            self.inference_output_dir.mkdir(exist_ok=True, parents=True)
+            self.predict_path = self.inference_output_dir / "latest_predict_data.pth"
         self.train_path = os.path.join(
             self.config.training.logger.save_dir,
             self.config.training.logger.name,
@@ -33,11 +39,6 @@ def __init__(
             self.config.training.logger.name,
             "val_data.pth",
         )
-        self.predict_path = os.path.join(
-            self.config.training.logger.save_dir,
-            self.config.training.logger.name,
-            "latest_predict_data.pth",
-        )
 
     def setup(self, stage: Optional[str] = None):
         # load it back here
diff --git a/everyvoice/demo/__init__.py b/everyvoice/demo/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/everyvoice/demo/app.py b/everyvoice/demo/app.py
new file mode 100644
index 00000000..52768e54
--- /dev/null
+++ b/everyvoice/demo/app.py
@@ -0,0 +1,109 @@
+import os
+from functools import partial
+
+import gradio as gr
+import torch
+
+from everyvoice.config.type_definitions import TargetTrainingTextRepresentationLevel
+from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.cli.synthesize import (
+    synthesize_helper,
+)
+from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.model import (
+    FastSpeech2,
+)
+from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.prediction_writing_callback import (
+    PredictionWritingWavCallback,
+)
+from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.type_definitions import (
+    SynthesizeOutputFormats,
+)
+from everyvoice.model.vocoder.HiFiGAN_iSTFT_lightning.hfgl.utils import (
+    load_hifigan_from_checkpoint,
+)
+from everyvoice.utils.heavy import get_device_from_accelerator
+
+os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
+
+
+def synthesize_audio(
+    text,
+    duration_control,
+    text_to_spec_model,
+    vocoder_model,
+    vocoder_config,
+    accelerator,
+    device,
+    language=None,
+    speaker=None,
+    output_dir=None,
+):
+    config, device, predictions = synthesize_helper(
+        model=text_to_spec_model,
+        vocoder_model=vocoder_model,
+        vocoder_config=vocoder_config,
+        texts=[text],
+        language=language,
+        accelerator=accelerator,
+        devices="1",
+        device=device,
+        global_step=1,
+        output_type=[],
+        text_representation=TargetTrainingTextRepresentationLevel.characters,
+        output_dir=output_dir,
+        speaker=speaker,
+        duration_control=duration_control,
+        filelist=None,
+        teacher_forcing_directory=None,
+        batch_size=1,
+        num_workers=1,
+    )
+    output_key = "postnet_output" if text_to_spec_model.config.model.use_postnet else "output"
+    wav_writer = PredictionWritingWavCallback(
+        output_dir=output_dir,
+        config=config,
+        output_key=output_key,
+        device=device,
+        global_step=1,
+        vocoder_model=vocoder_model,
+        vocoder_config=vocoder_config,
+    )
+    # move to device because lightning accumulates predictions on cpu
+    predictions[0][output_key] = predictions[0][output_key].to(device)
+    wav, sr = wav_writer.synthesize_audio(predictions[0])
+    return sr, wav[0]
+
+
+def create_demo_app(
+    text_to_spec_model_path,
+    spec_to_wav_model_path,
+    language,
+    speaker,
+    output_dir,
+    accelerator,
+) -> gr.Interface:
+    device = get_device_from_accelerator(accelerator)
+    vocoder_ckpt = torch.load(spec_to_wav_model_path, map_location=device)
+    vocoder_model, vocoder_config = load_hifigan_from_checkpoint(vocoder_ckpt, device)
+    model: FastSpeech2 = FastSpeech2.load_from_checkpoint(text_to_spec_model_path).to(
+        device
+    )
+    model.eval()
+    return gr.Interface(
+        partial(
+            synthesize_audio,
+            text_to_spec_model=model,
+            vocoder_model=vocoder_model,
+            vocoder_config=vocoder_config,
+            language=language,
+            speaker=speaker,
+            output_dir=output_dir,
+            accelerator=accelerator,
+            device=device,
+        ),
+        [
+            "textbox",
+            gr.Slider(0.75, 1.75, 1.0, step=0.25),
+        ],
+        gr.Audio(format="mp3"),
+        title="EveryVoice Demo",
+    )
diff --git a/everyvoice/model/feature_prediction/FastSpeech2_lightning b/everyvoice/model/feature_prediction/FastSpeech2_lightning
index 87c3c9d6..0500b84b 160000
--- a/everyvoice/model/feature_prediction/FastSpeech2_lightning
+++ b/everyvoice/model/feature_prediction/FastSpeech2_lightning
@@ -1 +1 @@
-Subproject commit 87c3c9d69559cd7ea982d0ab4fe4d92ea9b97fcd
+Subproject commit 0500b84b2c89f9fa2147a8a5f6ea2a217588312c
diff --git a/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning b/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning
index 01d31213..709f2e92 160000
--- a/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning
+++ b/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning
@@ -1 +1 @@
-Subproject commit 01d31213037fd23167076838b905e447eb53e568
+Subproject commit 709f2e9250d7dba27b3e10a2ec61c0c8124e563f
diff --git a/requirements.txt b/requirements.txt
index 4a1b08d4..af98dd74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ deepdiff>=6.5.0
 anytree>=2.8.0
 einops==0.5.0
 g2p~=2.0.0
+gradio>=4.12.0
 grapheme>=0.6.0
 ipatok>=0.4.1
 librosa==0.9.2
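
Usage sketch: the new `everyvoice demo` CLI command added above is a thin wrapper around `create_demo_app`, so an equivalent Python call looks roughly like the following. The checkpoint paths, language code, and speaker name here are hypothetical placeholders, not values taken from this diff.

    from pathlib import Path

    from everyvoice.demo.app import create_demo_app

    # Hypothetical checkpoint paths and IDs; substitute the ones from your own
    # trained EveryVoice feature-prediction and vocoder models.
    demo = create_demo_app(
        text_to_spec_model_path=Path("logs/fs2/checkpoints/last.ckpt"),
        spec_to_wav_model_path=Path("logs/vocoder/checkpoints/last.ckpt"),
        language="eng",
        speaker="default",
        output_dir=Path("synthesis_output"),
        accelerator="auto",
    )
    demo.launch()  # serves the Gradio interface locally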