-
Notifications
You must be signed in to change notification settings - Fork 9
/
recognize.py
executable file
·59 lines (51 loc) · 1.9 KB
/
recognize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import argparse
import os
import sys
import time
import wave

import numpy as np
from espnet2.bin.asr_inference import Speech2Text
def get_args():
    """Parse the command-line arguments of the recognition script.

    The raw command line is echoed to stdout before parsing so that a run's
    log shows exactly how the script was invoked.

    Returns:
        argparse.Namespace: the parsed arguments. ``wav_path`` holds the value
        of the required ``--wav_path`` option.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--wav_path", help="path to wav audio", required=True)
    print(' '.join(sys.argv))
    # Return the namespace: the caller reads ``args.wav_path`` from it, and a
    # bare ``return`` would hand back None instead.
    return parser.parse_args()
def recognize(wavfile):
    """Transcribe a 16-bit PCM WAV file with the module-level ``speech2text``.

    The audio is read with the standard ``wave`` module, converted to
    float32 samples scaled by 1/32767, down-mixed to mono when the file has
    several channels, and passed to ``speech2text``. The elapsed time and the
    recognised result are printed to stdout.

    Args:
        wavfile: path of the WAV file to decode (anything ``wave.open``
            accepts).

    Returns:
        ``results[0][0]`` of the value returned by ``speech2text``, i.e. the
        first field of the first hypothesis.

    Raises:
        ValueError: if the file does not contain 16-bit samples.
    """
    timer = time.perf_counter()
    # Use a distinct name for the reader so the ``wavfile`` argument is not
    # rebound to a (later closed) Wave_read object.
    with wave.open(wavfile, 'rb') as wav_reader:
        channels = wav_reader.getnchannels()
        sample_width = wav_reader.getsampwidth()
        buf = wav_reader.readframes(wav_reader.getnframes())

    # The buffer is interpreted as int16 below; any other sample width would
    # silently decode to noise, so reject it explicitly.
    if sample_width != 2:
        raise ValueError(
            "expected 16-bit PCM audio, got {}-bit samples".format(8 * sample_width)
        )

    # WAV PCM data is little-endian regardless of the host byte order.
    samples = np.frombuffer(buf, dtype='<i2')
    # 32767 is the largest int16 value and is used to normalise int to float.
    # float32 represents every int16 exactly; float16 (11-bit significand)
    # would round all samples beyond +/-2048 and degrade the audio.
    speech = samples.astype(np.float32) / 32767.0
    if channels > 1:
        # Frames are interleaved per channel; average them into one mono
        # signal instead of feeding the interleaved stream to the recogniser.
        speech = speech.reshape(-1, channels).mean(axis=1)

    results = speech2text(speech)
    print('time passed:', time.perf_counter()-timer)
    print("RECOGNIZED", results[0][0])
    return results[0][0]
if __name__ == "__main__":
    # Script entry point: read --wav_path, load the ASR and LM models from
    # the experiment directories below, then transcribe the given file.
    args = get_args()
    wav_file = args.wav_path
    if not os.path.exists(wav_file):
        # Fail loudly (non-zero exit status, message on stderr) instead of
        # silently doing nothing when the input path is wrong.
        raise SystemExit("wav file not found: {}".format(wav_file))

    asr_model_path = "exp/asr_train_ksc2_raw_ksc2_char_sp"
    lm_model_path = "exp/lm_train_lm_ksc2_char"
    train_config = asr_model_path + "/config.yaml"
    model_file = asr_model_path + "/valid.acc.ave.pth"
    lm_config = lm_model_path + "/config.yaml"
    lm_file = lm_model_path + "/valid.loss.ave.pth"

    # ``speech2text`` must stay a module-level name: recognize() looks it up
    # as a global.
    speech2text = Speech2Text(
        asr_train_config=train_config,
        asr_model_file=model_file,
        lm_train_config=lm_config,
        lm_file=lm_file,
        token_type=None,
        bpemodel=None,
        maxlenratio=0.0,
        minlenratio=0.0,
        beam_size=10,
        ctc_weight=0.5,
        lm_weight=0.3,
        penalty=0.0,
        nbest=1,
        device="cpu",
    )
    recognize(wav_file)