perf: speed up g2p processing by caching the results (#464)
The cache is implemented on a per-token basis to keep the results
identical to uncached processing while maximizing reuse potential.

Fixes: #446
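
The approach, as a minimal standalone sketch (the names here are illustrative, not the commit's API): split on whitespace with a capturing pattern so the separators survive, and memoise the g2p result per token.

import re

# Minimal sketch of per-token caching, assuming `g2p` is any callable that
# maps one token to a list of symbols.
def make_cached(g2p):
    cache: dict[str, list[str]] = {}

    def phonemize(text: str) -> list[str]:
        out: list[str] = []
        # A capturing split keeps the whitespace tokens, so the concatenated
        # output is identical to phonemizing the string token by token.
        for tok in re.split(r"(\s+)", text):
            if tok not in cache:
                cache[tok] = g2p(tok)
            out.extend(cache[tok])
        return out

    return phonemize

Because identical words recur across utterances, per-token memoisation reuses far more work than caching whole input strings would.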
joanise authored Jun 13, 2024
1 parent 1beb279 commit 689f324
Showing 1 changed file with 46 additions and 28 deletions.
everyvoice/text/phonemizer.py
@@ -1,6 +1,8 @@
""" EveryVoice performs grapheme-to-phoneme conversion based on language IDs
All g2p engines must return tokenized characters.
"""

import re
from typing import Callable
from unicodedata import normalize

@@ -20,6 +22,49 @@
# IMPORTANT: Your g2p engine must return a list of tokenized symbols, and all of the returned symbols must be defined in your everyvoice-shared-text-config.yaml file.


class CachingG2PEngine:
"""caching tokenizing g2p engine"""

def __init__(self, lang_id):
self._cache = {}
self.phonemizer = make_g2p(lang_id, f"{lang_id}-ipa")

def process_one_token(self, input_token: str) -> list[str]:
"""Process one input token, dumbly split on whitespace.
The output can be multiple tokens, since a proper tokenizer is used."""
# ipatok strips some important characters, so as a hack,
# we convert them to the private use area first
PUA_CHARS = ["_", " ", ".", "ˈ", "ˌ"]
PUA_START_NUMBER = 983040 # U+F0000
text = self.phonemizer(input_token).output_string
for i, char in enumerate(PUA_CHARS):
text = text.replace(char, chr(PUA_START_NUMBER + i))
tokens = tokenise(text, replace=False, tones=True, strict=False, unknown=True)
# normalize the output since ipatok applies NFD
unicode_normalization_form = self.phonemizer.transducers[-1].norm_form.value
if unicode_normalization_form != "none":
tokens = [normalize(unicode_normalization_form, token) for token in tokens]
# convert the pua tokens back to their originals
for i, token in enumerate(tokens):
# PUA tokens have length 1
if len(token) == 1:
token_ord = ord(token)
if token_ord >= PUA_START_NUMBER:
tokens[i] = PUA_CHARS[token_ord - PUA_START_NUMBER]
return tokens

def __call__(self, normalized_input_text: str) -> list[str]:
input_tokens = re.split(r"(\s+)", normalized_input_text)
output_tokens = []
for token in input_tokens:
cached = self._cache.get(token, None)
if cached is None:
cached = self.process_one_token(token)
self._cache[token] = cached
output_tokens += cached
return output_tokens
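
The private-use-area mapping above relies on each placeholder surviving tokenisation as a single-character token. A standalone sketch of the round trip, with hardcoded inputs (illustrative, not part of the commit):

PUA_CHARS = ["_", " ", ".", "ˈ", "ˌ"]
PUA_START_NUMBER = 0xF0000  # first codepoint of Supplementary Private Use Area-A

def mask(text: str) -> str:
    # Replace characters that ipatok would strip with private-use placeholders.
    for i, char in enumerate(PUA_CHARS):
        text = text.replace(char, chr(PUA_START_NUMBER + i))
    return text

def unmask(tokens: list[str]) -> list[str]:
    # Placeholders come back as length-1 tokens; map them to their originals.
    return [
        PUA_CHARS[ord(token) - PUA_START_NUMBER]
        if len(token) == 1 and ord(token) >= PUA_START_NUMBER
        else token
        for token in tokens
    ]

assert unmask(list(mask("ˈa.b"))) == ["ˈ", "a", ".", "b"]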


def get_g2p_engine(lang_id: str):

if lang_id not in AVAILABLE_G2P_ENGINES:
@@ -29,34 +74,7 @@ def get_g2p_engine(lang_id: str):
)

if AVAILABLE_G2P_ENGINES[lang_id] == "DEFAULT_G2P":
phonemizer = make_g2p(lang_id, f"{lang_id}-ipa")

def g2p_engine(normalized_input_text: str) -> list[str]:
# ipatok strips some important characters, so as a hack, we convert them to the private use area first
PUA_CHARS = ["_", " ", ".", "ˈ", "ˌ"]
PUA_START_NUMBER = 983040 # U+F0000
text = phonemizer(normalized_input_text).output_string
for i, char in enumerate(PUA_CHARS):
text = text.replace(char, chr(PUA_START_NUMBER + i))
tokens = tokenise(
text, replace=False, tones=True, strict=False, unknown=True
)
# normalize the output since ipatok applies NFD
unicode_normalization_form = phonemizer.transducers[-1].norm_form.value
if unicode_normalization_form != "none":
tokens = [
normalize(unicode_normalization_form, token) for token in tokens
]
# convert the pua tokens back to their originals
for i, token in enumerate(tokens):
# PUA tokens have length 1
if len(token) == 1:
token_ord = ord(token)
if token_ord >= PUA_START_NUMBER:
tokens[i] = PUA_CHARS[token_ord - PUA_START_NUMBER]
return tokens

# Register the engine so we don't have to build it next time
AVAILABLE_G2P_ENGINES[lang_id] = g2p_engine
AVAILABLE_G2P_ENGINES[lang_id] = CachingG2PEngine(lang_id)

return AVAILABLE_G2P_ENGINES[lang_id]
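
Hypothetical usage after this change ("fra" is an assumed language ID; any entry in AVAILABLE_G2P_ENGINES mapped to DEFAULT_G2P behaves the same):

from everyvoice.text.phonemizer import get_g2p_engine

engine = get_g2p_engine("fra")           # built and registered on first call
first = engine("bonjour tout le monde")  # every token phonemized, then cached
second = engine("bonjour encore")        # "bonjour" now comes from the cache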
