From 5cde4145a896d4a9973ad6dd9deb41428ac230fd Mon Sep 17 00:00:00 2001
From: Aidan Pine <hello@aidanpine.ca>
Date: Mon, 13 May 2024 18:41:10 +0000
Subject: [PATCH] refactor!: simplify codebase by removing original hifigan
 code

fixes: https://github.com/roedoejet/EveryVoice/issues/425
---
 docs/guides/custom.md                         |   4 +-
 .../original_hifigan_helper/__init__.py       | 235 ------------------
 2 files changed, 2 insertions(+), 237 deletions(-)
 delete mode 100644 everyvoice/model/vocoder/original_hifigan_helper/__init__.py

diff --git a/docs/guides/custom.md b/docs/guides/custom.md
index 54be0848..b322581c 100644
--- a/docs/guides/custom.md
+++ b/docs/guides/custom.md
@@ -100,10 +100,10 @@ everyvoice train text-to-spec config/{{ config_filename('text-to-spec') }}
 
 ## Step 8: Synthesize Speech in Your Language!
 
-You can synthesize by pointing the CLI to your trained feature prediction network and passing in the text. You can export to wav, npy, or pt files.
+You can synthesize by pointing the CLI to your trained feature prediction network and passing in the text. You can export the output as wav or spectrogram (pt) files.
 
 ```bash
-everyvoice synthesize from-text logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt -t "මෙදා සැරේ සාකච්ඡාවක් විදියට නෙවෙයි නේද පල කරල තියෙන්නෙ" -a gpu -d 1
+everyvoice synthesize from-text logs_and_checkpoints/FeaturePredictionExperiment/base/checkpoints/last.ckpt -t "මෙදා සැරේ සාකච්ඡාවක් විදියට නෙවෙයි නේද පල කරල තියෙන්නෙ" -a gpu -d 1 --output-type wav
 ```
 
 <!-- % Step 10 (optional): Finetune your vocoder
diff --git a/everyvoice/model/vocoder/original_hifigan_helper/__init__.py b/everyvoice/model/vocoder/original_hifigan_helper/__init__.py
deleted file mode 100644
index 1e94957b..00000000
--- a/everyvoice/model/vocoder/original_hifigan_helper/__init__.py
+++ /dev/null
@@ -1,235 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn import Conv1d, ConvTranspose1d
-from torch.nn.utils import remove_weight_norm, weight_norm
-from torch.serialization import FILE_LIKE
-
-LRELU_SLOPE = 0.1
-
-
-def init_weights(m, mean=0.0, std=0.01):
-    classname = m.__class__.__name__
-    if classname.find("Conv") != -1:
-        m.weight.data.normal_(mean, std)
-
-
-def get_padding(kernel_size, dilation=1):
-    return int((kernel_size * dilation - dilation) / 2)
-
-
-class ResBlock(torch.nn.Module):
-    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
-        super(ResBlock, self).__init__()
-        self.h = h
-        self.convs1 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[0],
-                        padding=get_padding(kernel_size, dilation[0]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[1],
-                        padding=get_padding(kernel_size, dilation[1]),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=dilation[2],
-                        padding=get_padding(kernel_size, dilation[2]),
-                    )
-                ),
-            ]
-        )
-        self.convs1.apply(init_weights)
-
-        self.convs2 = nn.ModuleList(
-            [
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-                weight_norm(
-                    Conv1d(
-                        channels,
-                        channels,
-                        kernel_size,
-                        1,
-                        dilation=1,
-                        padding=get_padding(kernel_size, 1),
-                    )
-                ),
-            ]
-        )
-        self.convs2.apply(init_weights)
-
-    def forward(self, x):
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, LRELU_SLOPE)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, LRELU_SLOPE)
-            xt = c2(xt)
-            x = xt + x
-        return x
-
-    def remove_weight_norm(self):
-        for layer in self.convs1:
-            remove_weight_norm(layer)
-        for layer in self.convs2:
-            remove_weight_norm(layer)
-
-
-class Generator(torch.nn.Module):
-    def __init__(self, h):
-        super(Generator, self).__init__()
-        self.h = h
-        self.num_kernels = len(h.resblock_kernel_sizes)
-        self.num_upsamples = len(h.upsample_rates)
-        self.conv_pre = weight_norm(
-            Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3)
-        )
-        resblock = ResBlock
-
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
-            self.ups.append(
-                weight_norm(
-                    ConvTranspose1d(
-                        h.upsample_initial_channel // (2**i),
-                        h.upsample_initial_channel // (2 ** (i + 1)),
-                        k,
-                        u,
-                        padding=(k - u) // 2,
-                    )
-                )
-            )
-
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = h.upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(
-                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
-            ):
-                self.resblocks.append(resblock(h, ch, k, d))
-
-        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-        self.ups.apply(init_weights)
-        self.conv_post.apply(init_weights)
-
-    def forward(self, x):
-        x = self.conv_pre(x)
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, LRELU_SLOPE)
-            x = self.ups[i](x)
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        x = torch.tanh(x)
-
-        return x
-
-    def remove_weight_norm(self):
-        print("Removing weight norm...")
-        for layer in self.ups:
-            remove_weight_norm(layer)
-        for layer in self.resblocks:
-            layer.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
-
-
-class AttrDict(dict):
-    def __init__(self, *args, **kwargs):
-        super(AttrDict, self).__init__(*args, **kwargs)
-        self.__dict__ = self
-
-
-UNIVERSAL_CONFIG = {
-    "resblock": "1",
-    "num_gpus": 0,
-    "batch_size": 16,
-    "learning_rate": 0.0002,
-    "adam_b1": 0.8,
-    "adam_b2": 0.99,
-    "lr_decay": 0.999,
-    "seed": 1234,
-    "upsample_rates": [8, 8, 2, 2],
-    "upsample_kernel_sizes": [16, 16, 4, 4],
-    "upsample_initial_channel": 512,
-    "resblock_kernel_sizes": [3, 7, 11],
-    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-    "segment_size": 8192,
-    "num_mels": 80,
-    "num_freq": 1025,
-    "n_fft": 1024,
-    "hop_size": 256,
-    "win_size": 1024,
-    "sampling_rate": 22050,
-    "fmin": 0,
-    "fmax": 8000,
-    "fmax_for_loss": None,
-    "num_workers": 4,
-    "dist_config": {
-        "dist_backend": "nccl",
-        "dist_url": "tcp://localhost:54321",
-        "world_size": 1,
-    },
-}
-
-
-def get_vocoder(path: FILE_LIKE, device: torch.device) -> Generator:
-    vocoder = Generator(AttrDict(UNIVERSAL_CONFIG))
-    ckpt = torch.load(path, map_location=device)
-    vocoder.load_state_dict(ckpt["generator"])
-    vocoder.eval()
-    vocoder.remove_weight_norm()
-    vocoder.to(device)
-
-    return vocoder
-
-
-def vocoder_infer(mels: torch.Tensor, vocoder: Generator) -> np.ndarray:
-    # mels (1, 80, 111) normal
-    # mels small (1, 80, 5)
-    with torch.no_grad():
-        wavs = vocoder(mels.transpose(1, 2)).squeeze(1)
-    wavs = wavs.cpu().numpy()  # B, T
-    return wavs