Merge pull request #442 from roedoejet/dev.ap/demo-gradio
Dev.ap/demo gradio
roedoejet authored May 29, 2024
2 parents de413a7 + 05b5dab commit 6b4f601
Showing 7 changed files with 167 additions and 7 deletions.
49 changes: 49 additions & 0 deletions everyvoice/cli.py
@@ -241,6 +241,55 @@ def test(suite: TestSuites = typer.Argument(TestSuites.dev)):
SCHEMAS_TO_OUTPUT: dict[str, Any] = {} # dict[str, type[BaseModel]]


@app.command()
def demo(
    text_to_spec_model: Path = typer.Argument(
        ...,
        file_okay=True,
        exists=True,
        dir_okay=False,
        help="The path to a trained text-to-spec EveryVoice model.",
        autocompletion=complete_path,
    ),
    spec_to_wav_model: Path = typer.Argument(
        ...,
        help="The path to a trained vocoder.",
        dir_okay=False,
        file_okay=True,
        autocompletion=complete_path,
    ),
    language: str = typer.Argument(
        ...,
        help="Specify which language to use in a multilingual system. TODO: let this be selectable in app",
    ),
    speaker: str = typer.Argument(
        ...,
        help="Specify which speaker to use in a multispeaker system. TODO: let this be selectable in app",
    ),
    output_dir: Path = typer.Option(
        "synthesis_output",
        "--output-dir",
        "-o",
        file_okay=False,
        dir_okay=True,
        help="The directory where your synthesized audio should be written",
        autocompletion=complete_path,
    ),
    accelerator: str = typer.Option("auto", "--accelerator", "-a"),
):
    from everyvoice.demo.app import create_demo_app

    demo = create_demo_app(
        text_to_spec_model_path=text_to_spec_model,
        spec_to_wav_model_path=spec_to_wav_model,
        language=language,
        speaker=speaker,
        output_dir=output_dir,
        accelerator=accelerator,
    )
    demo.launch()


@app.command(hidden=True)
def update_schemas(
    out_dir: Path = typer.Option(
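
The new demo subcommand above simply forwards its arguments to create_demo_app and starts the Gradio server. A hedged usage sketch, not part of this commit: it assumes the Typer application object app is importable from everyvoice/cli.py and that the installed console script is named everyvoice; the checkpoint paths, language, and speaker IDs are placeholders.

# Sketch only: inspect and invoke the new `demo` subcommand.
from typer.testing import CliRunner

from everyvoice.cli import app  # assumption: `app` is the Typer application in everyvoice/cli.py

runner = CliRunner()

# Safe to run anywhere: print the generated help for the new subcommand.
print(runner.invoke(app, ["demo", "--help"]).output)

# Equivalent shell invocation once real checkpoints exist (blocks while the
# Gradio server is running); all four positional values are placeholders:
#   everyvoice demo <text-to-spec.ckpt> <vocoder.ckpt> <language> <speaker> -o synthesis_output -a auto
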
11 changes: 6 additions & 5 deletions everyvoice/dataloader/__init__.py
@@ -1,4 +1,5 @@
import os
from pathlib import Path
from typing import Callable, Optional, Union

import pytorch_lightning as pl
@@ -18,11 +19,16 @@ def __init__(
        config: Union[
            AlignerConfig, VocoderConfig, FeaturePredictionConfig, EveryVoiceConfig
        ],
        inference_output_dir: Optional[Path] = None,
    ):
        super().__init__()
        self.collate_fn: Union[Callable, None] = None
        self.config = config
        self.use_weighted_sampler = False
        self.inference_output_dir = inference_output_dir
        if self.inference_output_dir is not None:
            self.inference_output_dir.mkdir(exist_ok=True, parents=True)
            self.predict_path = self.inference_output_dir / "latest_predict_data.pth"
        self.train_path = os.path.join(
            self.config.training.logger.save_dir,
            self.config.training.logger.name,
@@ -33,11 +39,6 @@ def __init__(
            self.config.training.logger.name,
            "val_data.pth",
        )
        self.predict_path = os.path.join(
            self.config.training.logger.save_dir,
            self.config.training.logger.name,
            "latest_predict_data.pth",
        )

    def setup(self, stage: Optional[str] = None):
        # load it back here
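
The net effect of this change: when a data module is given an inference_output_dir, the predictions file moves out of the training logger's directory and into that caller-chosen location (the demo app passes its --output-dir). A standalone sketch of the relocation, using hypothetical directory names, not EveryVoice code:

# Standalone illustration of the path change above; "logs" and "MyExperiment"
# stand in for the logger's save_dir and name.
import os
from pathlib import Path

# Before: predictions were always written next to the training artifacts.
old_predict_path = os.path.join("logs", "MyExperiment", "latest_predict_data.pth")

# After: callers such as the demo app supply an inference output directory.
inference_output_dir = Path("synthesis_output")
inference_output_dir.mkdir(exist_ok=True, parents=True)
new_predict_path = inference_output_dir / "latest_predict_data.pth"

print(old_predict_path, "->", new_predict_path)
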
Empty file added everyvoice/demo/__init__.py
Empty file.
109 changes: 109 additions & 0 deletions everyvoice/demo/app.py
@@ -0,0 +1,109 @@
import os
from functools import partial

import gradio as gr
import torch

from everyvoice.config.type_definitions import TargetTrainingTextRepresentationLevel
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.cli.synthesize import (
    synthesize_helper,
)
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.model import (
    FastSpeech2,
)
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.prediction_writing_callback import (
    PredictionWritingWavCallback,
)
from everyvoice.model.feature_prediction.FastSpeech2_lightning.fs2.type_definitions import (
    SynthesizeOutputFormats,
)
from everyvoice.model.vocoder.HiFiGAN_iSTFT_lightning.hfgl.utils import (
    load_hifigan_from_checkpoint,
)
from everyvoice.utils.heavy import get_device_from_accelerator

os.environ["no_proxy"] = "localhost,127.0.0.1,::1"


def synthesize_audio(
    text,
    duration_control,
    text_to_spec_model,
    vocoder_model,
    vocoder_config,
    accelerator,
    device,
    language=None,
    speaker=None,
    output_dir=None,
):
    config, device, predictions = synthesize_helper(
        model=text_to_spec_model,
        vocoder_model=vocoder_model,
        vocoder_config=vocoder_config,
        texts=[text],
        language=language,
        accelerator=accelerator,
        devices="1",
        device=device,
        global_step=1,
        output_type=[],
        text_representation=TargetTrainingTextRepresentationLevel.characters,
        output_dir=output_dir,
        speaker=speaker,
        duration_control=duration_control,
        filelist=None,
        teacher_forcing_directory=None,
        batch_size=1,
        num_workers=1,
    )
    output_key = "postnet_output" if text_to_spec_model.config.model.use_postnet else "output"
    wav_writer = PredictionWritingWavCallback(
        output_dir=output_dir,
        config=config,
        output_key=output_key,
        device=device,
        global_step=1,
        vocoder_model=vocoder_model,
        vocoder_config=vocoder_config,
    )
    # move to device because lightning accumulates predictions on cpu
    predictions[0][output_key] = predictions[0][output_key].to(device)
    wav, sr = wav_writer.synthesize_audio(predictions[0])
    return sr, wav[0]


def create_demo_app(
    text_to_spec_model_path,
    spec_to_wav_model_path,
    language,
    speaker,
    output_dir,
    accelerator,
) -> gr.Interface:
    device = get_device_from_accelerator(accelerator)
    vocoder_ckpt = torch.load(spec_to_wav_model_path, map_location=device)
    vocoder_model, vocoder_config = load_hifigan_from_checkpoint(vocoder_ckpt, device)
    model: FastSpeech2 = FastSpeech2.load_from_checkpoint(text_to_spec_model_path).to(
        device
    )
    model.eval()
    return gr.Interface(
        partial(
            synthesize_audio,
            text_to_spec_model=model,
            vocoder_model=vocoder_model,
            vocoder_config=vocoder_config,
            language=language,
            speaker=speaker,
            output_dir=output_dir,
            accelerator=accelerator,
            device=device,
        ),
        [
            "textbox",
            gr.Slider(0.75, 1.75, 1.0, step=0.25),
        ],
        gr.Audio(format="mp3"),
        title="EveryVoice Demo",
    )
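
For completeness, a sketch of launching the demo programmatically, mirroring what the demo CLI command does; the checkpoint paths, language, and speaker IDs below are hypothetical placeholders and must match what the checkpoints were actually trained with.

# Sketch only: drive create_demo_app directly instead of via the CLI.
from pathlib import Path

from everyvoice.demo.app import create_demo_app

demo = create_demo_app(
    text_to_spec_model_path=Path("logs/feature_prediction/last.ckpt"),  # placeholder path
    spec_to_wav_model_path=Path("logs/vocoder/last.ckpt"),  # placeholder path
    language="eng",  # placeholder: a language ID the model was trained on
    speaker="default",  # placeholder: a speaker ID the model was trained on
    output_dir=Path("synthesis_output"),
    accelerator="auto",
)
demo.launch()  # serves the Gradio UI; e.g. demo.launch(server_port=7860) to pin the port
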
2 changes: 1 addition & 1 deletion everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning
1 change: 1 addition & 0 deletions requirements.txt
@@ -3,6 +3,7 @@ deepdiff>=6.5.0
anytree>=2.8.0
einops==0.5.0
g2p~=2.0.0
gradio>=4.12.0
grapheme>=0.6.0
ipatok>=0.4.1
librosa==0.9.2
