diff --git a/.vscode/launch.json b/.vscode/launch.json index 7506f56..5464db1 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -55,10 +55,12 @@ "module": "subplz", "args": [ "sync", - "--audio ", - "'/mnt/v/test/single/1.mkv'", + "--audio", + "/mnt/v/test/single/1.mkv", "--text", - "'/mnt/v/test/single/1.srt'", + "/mnt/v/test/single/1.srt", + "--output-format", + "vtt" // "--audio", // "/mnt/d/sync/変な家/変な家.m4b", // "--text", diff --git a/anki_importer/anki-importer.py b/anki_importer/anki-importer.py index 67d08c3..f306efd 100644 --- a/anki_importer/anki-importer.py +++ b/anki_importer/anki-importer.py @@ -8,7 +8,7 @@ import multiprocessing from tqdm import tqdm from tqdm.contrib.concurrent import process_map -from utils import grab_files, get_mapping +from subplz.utils import grab_files ANKI_CONNECT_URL = "" @@ -101,6 +101,11 @@ def validate_args(args): print("[E] --col is only supported with --no-anki-connect") exit(1) +def get_mapping(mapping_path): + with open(mapping_path) as f: + mapping = json.load(f) + print(f"Reading mapping: {mapping}") + return mapping def parse_ac_response(response): if len(response) != 2: diff --git a/deprecated/align-v1.py b/deprecated/align-v1.py new file mode 100644 index 0000000..773e039 --- /dev/null +++ b/deprecated/align-v1.py @@ -0,0 +1,373 @@ +from fuzzywuzzy import fuzz +import argparse +import sys +import re + +parser = argparse.ArgumentParser(description="Align a script to vtt subs") +parser.add_argument("--mode", dest="mode", type=int, default=2, help="matching mode") +parser.add_argument( + "--max-merge", + dest="max_merge", + type=int, + default=6, + help="max subs to merge into one line", +) + +parser.add_argument( + "script", type=argparse.FileType("r", encoding="UTF-8"), help="script file path" +) +parser.add_argument( + "subs", + type=argparse.FileType("r", encoding="UTF-8"), + help=".vtt subtitle file path", +) +parser.add_argument( + "out", + type=argparse.FileType("w", encoding="UTF-8"), + 
help="aligned output file path", +) + +args = parser.parse_args(sys.argv[1:]) + +MAX_MERGE_COUNT = ( + args.max_merge +) # Larger gives better results, but takes longer to process. +MAX_SEARCH_CONTEXT = MAX_MERGE_COUNT * 2 + + +class ScriptLine: + def __init__(self, line): + self.line = line + self.txt = re.sub("「|」|『|』|、|。|・|?|…|―", "", line) + + def __repr__(self): + return "ScriptLine(%s)" % self.line + + +class Subtitle: + def __init__(self, start, end, line): + self.start = start + self.end = end + self.line = line + + +def get_lines(file): + for line in file: + yield line.rstrip("\n") + + +def read_script(file): + for line in file: + line = line.rstrip("\n") + if line == "": + continue + yield line + + +def remove_tags(line): + return re.sub("<[^>]*>", "", line) + + +def read_vtt(file): + lines = get_lines(file) + + subs = [] + header = next(lines) + assert header == "WEBVTT" + # assert next(lines) == "Kind: captions" + # assert next(lines).startswith("Language:") + assert next(lines) == "" + + last_sub = " " + + while True: + # for t in range(0, 10): + line = next(lines, None) + if line == None: # EOF + break + # print(line) + m = re.findall( + r"(\d\d:\d\d:\d\d.\d\d\d) --> (\d\d:\d\d:\d\d.\d\d\d)|(\d\d:\d\d.\d\d\d) --> (\d\d:\d\d.\d\d\d)|(\d\d:\d\d.\d\d\d) --> (\d\d:\d\d:\d\d.\d\d\d)", + line, + ) + if not m: + print( + f'Warning: Line "{line}" did not look like a valid VTT input. 
There could be issues parsing this sub' + ) + continue + + matchPair = [list(filter(None, x)) for x in m][0] + sub_start = matchPair[0] # .replace('.', ',') + sub_end = matchPair[1] + + line = next(lines) + while line: + sub = remove_tags(line) + if last_sub != sub and sub not in [" ", "[音楽]"]: + last_sub = sub + # print("sub:", sub_start, sub_end, sub) + subs.append(Subtitle(sub_start, sub_end, sub)) + elif last_sub == sub and subs: + subs[-1].end = sub_end + # print("Update sub:", subs[-1].start, subs[-1].end, subs[-1].text) + try: + line = next(lines) + except StopIteration: + line = None + + return subs + + +script = [ScriptLine(line.strip()) for line in read_script(args.script)] +subs = read_vtt(args.subs) + +# Trim script for quick testing +# script = script[:500] +# subs = subs[:1000] + +# Use dynamic programming to pick best subs mapping +memo = {} + + +def get_script(script_pos, num_used, sep=""): + end = min(len(script), script_pos + num_used) + return sep.join([sub.line for sub in script[script_pos:end]]) + + +def get_base(sub_pos, num_used, sep=""): + end = min(len(subs), sub_pos + num_used) + return sep.join([sub.line for sub in subs[sub_pos:end]]) + + +def get_best_sub_n( + script_pos, num_used_script, last_script_pos, sub_pos, max_subs, last_sub_to_test +): + t_best_score = 0 + t_best_used_sub = 1 + + line = get_script(script_pos, num_used_script) + + remaining_subs = last_sub_to_test - sub_pos + + for num_used_sub in range(1, min(max_subs, remaining_subs) + 1): + base = get_base(sub_pos, num_used_sub) + curr_score = fuzz.ratio(base, line) / 100.0 * min(len(line), len(base)) + tot_score = curr_score + calc_best_score( + script_pos + num_used_script, + last_script_pos, + sub_pos + num_used_sub, + last_sub_to_test, + ) + if tot_score > t_best_score: + t_best_score = tot_score + t_best_used_sub = num_used_sub + + return (t_best_score, t_best_used_sub) + + +best_script_score_and_sub = {} + + +def calc_best_score(script_pos, last_script_pos, sub_pos, 
last_sub_to_test): + if script_pos >= len(script) or sub_pos >= len(subs): + return 0 + + key = (script_pos, sub_pos) + if key in memo: + return memo[key][0] + + best_score = 0 + best_used_sub = 1 + best_used_script = 1 + + remaining_script = last_script_pos - script_pos + + for num_used_script in range(1, min(MAX_MERGE_COUNT, remaining_script) + 1): + max_subs = MAX_MERGE_COUNT if num_used_script == 1 else 1 + t_best_score, t_best_used_sub = get_best_sub_n( + script_pos, + num_used_script, + last_script_pos, + sub_pos, + max_subs, + last_sub_to_test, + ) + + if t_best_score > best_score: + best_score = t_best_score + best_used_sub = t_best_used_sub + best_used_script = num_used_script + + if best_used_script > 1: + # Do one more fitting + t_best_score, t_best_used_sub = get_best_sub_n( + script_pos, + best_used_script, + last_script_pos, + sub_pos, + MAX_MERGE_COUNT, + last_sub_to_test, + ) + if t_best_score > best_score: + best_score = t_best_score + best_used_sub = t_best_used_sub + + key = (script_pos, sub_pos) + memo[key] = (best_score, best_used_sub, best_used_script) + + # Save best sub pos for this script pos + best_prev_score, best_sub = best_script_score_and_sub.get(script_pos, (0, None)) + if best_score >= best_prev_score: + best_script_score_and_sub[script_pos] = (best_score, key) + + return best_score + + +def get_best_sub_path(script_pos, n, last_script_pos, last_sub_to_test): + _, key = best_script_score_and_sub[script_pos] + ret = [] + sub_pos = key[1] + + i = 0 + while i < n and script_pos < last_script_pos and sub_pos < last_sub_to_test: + ret.append((script_pos, sub_pos)) + decision = memo[(script_pos, sub_pos)] + num_used_sub = decision[1] + num_used_script = decision[2] + sub_pos += num_used_sub + script_pos += num_used_script + i += 1 + return ret + + +def test_sub_pos(script_pos, last_script_pos, first_sub_to_test, last_sub_to_test): + for sub_pos in range(last_sub_to_test - 1, first_sub_to_test - 1, -1): + calc_best_score(script_pos, 
last_script_pos, sub_pos, last_sub_to_test) + + +def recursively_find_match(result, first_script, last_script, first_sub, last_sub): + if first_script == last_script or first_sub == last_sub: + return + + memo.clear() + best_script_score_and_sub.clear() + + mid = (first_script + last_script) // 2 + start = max(first_script, mid - MAX_SEARCH_CONTEXT) + end = min(mid + MAX_SEARCH_CONTEXT, last_script) + + # print('testing first %d last %d mid %d' % (first_script, last_script, mid)) + for script_pos in range(end - 1, start - 1, -1): + test_sub_pos(script_pos, end, first_sub, last_sub) + + best_path = get_best_sub_path(start, end - start, end, last_sub) + if len(best_path) > 0: + for p in best_path: + if p[0] > mid: + break + mid_key = p + + mid_memo = memo[mid_key] + script_pos = mid_key[0] + sub_pos = mid_key[1] + num_used_script = mid_memo[2] + num_used_sub = mid_memo[1] + + # Recurse before + recursively_find_match(result, first_script, script_pos, first_sub, sub_pos) + + scr = get_script(script_pos, num_used_script, " ‖ ") + scr_out = get_script(script_pos, num_used_script, "") + base = get_base(sub_pos, num_used_sub, " ‖ ") + + print((script_pos, num_used_script, sub_pos, num_used_sub), scr, "==", base) + result.append((script_pos, num_used_script, sub_pos, num_used_sub)) + + # Recurse after + recursively_find_match( + result, + script_pos + num_used_script, + last_script, + sub_pos + num_used_sub, + last_sub, + ) + + +new_subs = [] + +if args.mode == 1: + last_script_to_test = len(script) + last_sub_to_test = len(subs) + first_sub_to_test = 0 + for script_pos in range(len(script) - 1, -1, -1): + if script_pos == 0: + first_sub_to_test = 0 + if (script_pos % 10) == 0: + print( + "%d/%d testing %d - %d subs " + % (script_pos, len(script), first_sub_to_test, last_sub_to_test) + ) + + test_sub_pos( + script_pos, last_script_to_test, first_sub_to_test, last_sub_to_test + ) + + # Construct new subs using the memo trace. 
+ script_pos = 0 + sub_pos = 0 + + while script_pos < len(script) and sub_pos < len(subs): + try: + decision = memo[(script_pos, sub_pos)] + except: + print("Missing key?", script_pos, sub_pos) + break + # print(decision, subs[sub_pos].line) + num_used_sub = decision[1] + num_used_script = decision[2] + scr_out = get_script(script_pos, num_used_script, "") + scr = get_script(script_pos, num_used_script, " ‖ ") + + if num_used_sub: + base = get_base(sub_pos, num_used_sub, " ‖ ") + print("Record:", script_pos, scr, "==", base) + new_subs.append( + Subtitle( + subs[sub_pos].start, subs[sub_pos + num_used_sub - 1].end, scr_out + ) + ) + sub_pos += num_used_sub + else: + print("Skip: ", script[script_pos].line) + script_pos += num_used_script +elif args.mode == 2: + result = [] + recursively_find_match(result, 0, len(script), 0, len(subs)) + + for i, (script_pos, num_used_script, sub_pos, num_used_sub) in enumerate(result): + if i == 0: + script_pos = 0 + sub_pos = 0 + + if i + 1 < len(result): + num_used_script = result[i + 1][0] - script_pos + num_used_sub = result[i + 1][2] - sub_pos + else: + num_used_script = len(script) - script_pos + num_used_sub = len(subs) - sub_pos + + scr_out = get_script(script_pos, num_used_script, "") + scr = get_script(script_pos, num_used_script, " ‖ ") + base = get_base(sub_pos, num_used_sub, " ‖ ") + + print("Record:", script_pos, scr, "==", base) + new_subs.append( + Subtitle(subs[sub_pos].start, subs[sub_pos + num_used_sub - 1].end, scr_out) + ) +else: + sys.exit("Unknown mode %d" % args.mode) + +for n, sub in enumerate(new_subs): + args.out.write("%d\n" % (n + 1)) + args.out.write("%s --> %s\n" % (sub.start, sub.end)) + args.out.write("%s\n\n" % (sub.line)) diff --git a/deprecated/align.py b/deprecated/align.py deleted file mode 100644 index bf38680..0000000 --- a/deprecated/align.py +++ /dev/null @@ -1,339 +0,0 @@ -# from fuzzywuzzy import fuzz -# import argparse -# import sys -# import re -# from utils import Subtitle, read_vtt, 
write_sub -# from tqdm import tqdm - - -# MAX_MERGE_COUNT = ( -# 6 -# ) # Larger gives better results, but takes longer to process. -# MAX_SEARCH_CONTEXT = MAX_MERGE_COUNT * 2 - -# # Trim script for quick testing -# # script = script[:500] -# # subs = subs[:1000] - -# # Use dynamic programming to pick best subs mapping -# memo = {} - - -# class ScriptLine: -# def __init__(self, line): -# self.line = line -# self.txt = re.sub("「|」|『|』|、|。|・|?|…|―|─|!|(|)", "", line) - -# def __repr__(self): -# return "ScriptLine(%s)" % self.line - - -# def read_script(file): -# for line in file: -# line = line.rstrip("\n") -# if line == "": -# continue -# yield line - - -# def get_script(script, script_pos, num_used, sep=""): -# end = min(len(script), script_pos + num_used) -# return sep.join([sub.line for sub in script[script_pos:end]]) - - -# def get_base(subs, sub_pos, num_used, sep=""): -# end = min(len(subs), sub_pos + num_used) -# return sep.join([sub.line for sub in subs[sub_pos:end]]) - - -# def get_best_sub_n( -# script, subs, script_pos, num_used_script, last_script_pos, sub_pos, max_subs, last_sub_to_test -# ): -# t_best_score = 0 -# t_best_used_sub = 1 - -# line = get_script(script, script_pos, num_used_script) - -# remaining_subs = last_sub_to_test - sub_pos - -# for num_used_sub in range(1, min(max_subs, remaining_subs) + 1): -# base = get_base(subs, sub_pos, num_used_sub) -# curr_score = fuzz.ratio(base, line) / 100.0 * min(len(line), len(base)) -# tot_score = curr_score + calc_best_score( -# script, -# subs, -# script_pos + num_used_script, -# last_script_pos, -# sub_pos + num_used_sub, -# last_sub_to_test, -# ) -# if tot_score > t_best_score: -# t_best_score = tot_score -# t_best_used_sub = num_used_sub - -# return (t_best_score, t_best_used_sub) - - -# best_script_score_and_sub = {} - - -# def calc_best_score(script, subs, script_pos, last_script_pos, sub_pos, last_sub_to_test): -# if script_pos >= len(script) or sub_pos >= len(subs): -# return 0 - -# key = 
(script_pos, sub_pos) -# if key in memo: -# return memo[key][0] - -# best_score = 0 -# best_used_sub = 1 -# best_used_script = 1 - -# remaining_script = last_script_pos - script_pos - -# for num_used_script in range(1, min(MAX_MERGE_COUNT, remaining_script) + 1): -# max_subs = MAX_MERGE_COUNT if num_used_script == 1 else 1 -# t_best_score, t_best_used_sub = get_best_sub_n( -# script, -# subs, -# script_pos, -# num_used_script, -# last_script_pos, -# sub_pos, -# max_subs, -# last_sub_to_test, -# ) - -# if t_best_score > best_score: -# best_score = t_best_score -# best_used_sub = t_best_used_sub -# best_used_script = num_used_script - -# if best_used_script > 1: -# # Do one more fitting -# t_best_score, t_best_used_sub = get_best_sub_n( -# script, -# subs, -# script_pos, -# best_used_script, -# last_script_pos, -# sub_pos, -# MAX_MERGE_COUNT, -# last_sub_to_test, -# ) -# if t_best_score > best_score: -# best_score = t_best_score -# best_used_sub = t_best_used_sub - -# key = (script_pos, sub_pos) -# memo[key] = (best_score, best_used_sub, best_used_script) - -# # Save best sub pos for this script pos -# best_prev_score, best_sub = best_script_score_and_sub.get(script_pos, (0, None)) -# if best_score >= best_prev_score: -# best_script_score_and_sub[script_pos] = (best_score, key) - -# return best_score - - -# def get_best_sub_path(script_pos, n, last_script_pos, last_sub_to_test): -# _, key = best_script_score_and_sub[script_pos] -# ret = [] -# sub_pos = key[1] - -# i = 0 -# while i < n and script_pos < last_script_pos and sub_pos < last_sub_to_test: -# ret.append((script_pos, sub_pos)) -# decision = memo[(script_pos, sub_pos)] -# num_used_sub = decision[1] -# num_used_script = decision[2] -# sub_pos += num_used_sub -# script_pos += num_used_script -# i += 1 -# return ret - - -# def test_sub_pos(script, subs, script_pos, last_script_pos, first_sub_to_test, last_sub_to_test): -# for sub_pos in range(last_sub_to_test - 1, first_sub_to_test - 1, -1): -# 
calc_best_score(script, subs, script_pos, last_script_pos, sub_pos, last_sub_to_test) - - -# def recursively_find_match(script, subs, result, first_script, last_script, first_sub, last_sub, bar): -# bar.total += 1 -# bar.refresh() -# if first_script == last_script or first_sub == last_sub: -# return - -# memo.clear() -# best_script_score_and_sub.clear() - -# mid = (first_script + last_script) // 2 -# start = max(first_script, mid - MAX_SEARCH_CONTEXT) -# end = min(mid + MAX_SEARCH_CONTEXT, last_script) - -# # print('testing first %d last %d mid %d' % (first_script, last_script, mid)) -# for script_pos in range(end - 1, start - 1, -1): -# test_sub_pos(script, subs, script_pos, end, first_sub, last_sub) - -# best_path = get_best_sub_path(start, end - start, end, last_sub) -# if len(best_path) > 0: -# for p in best_path: -# if p[0] > mid: -# break -# mid_key = p - -# mid_memo = memo[mid_key] -# script_pos = mid_key[0] -# sub_pos = mid_key[1] -# num_used_script = mid_memo[2] -# num_used_sub = mid_memo[1] - -# # Recurse before -# recursively_find_match( -# script, subs, result, first_script, script_pos, first_sub, sub_pos, bar -# ) -# bar.update(1) -# scr = get_script(script, script_pos, num_used_script, " ‖ ") -# scr_out = get_script(script, script_pos, num_used_script, "") -# base = get_base(subs, sub_pos, num_used_sub, " ‖ ") - -# # print((script_pos, num_used_script, sub_pos, num_used_sub), scr, '==', base) -# result.append((script_pos, num_used_script, sub_pos, num_used_sub)) - -# # Recurse after -# recursively_find_match( -# script, -# subs, -# result, -# script_pos + num_used_script, -# last_script, -# sub_pos + num_used_sub, -# last_sub, -# bar, -# ) -# bar.update(1) -# bar.update(1) -# # t.total = new_total -# # t.refresh() - -# def run(split_script, subs_file, out, mode=2): -# with open(split_script, encoding='utf-8') as s: -# script = [ScriptLine(line.strip()) for line in read_script(s)] -# print(subs_file) -# with open(subs_file, encoding='utf-8') as vtt: -# 
subs = read_vtt(vtt) -# new_subs = [] - -# if mode == 1: -# last_script_to_test = len(script) -# last_sub_to_test = len(subs) -# first_sub_to_test = 0 -# for script_pos in range(len(script) - 1, -1, -1): -# if script_pos == 0: -# first_sub_to_test = 0 -# if (script_pos % 10) == 0: -# print( -# "%d/%d testing %d - %d subs " -# % (script_pos, len(script), first_sub_to_test, last_sub_to_test) -# ) - -# test_sub_pos( -# script, subs, script_pos, last_script_to_test, first_sub_to_test, last_sub_to_test -# ) - -# # Construct new subs using the memo trace. -# script_pos = 0 -# sub_pos = 0 - -# while script_pos < len(script) and sub_pos < len(subs): -# try: -# decision = memo[(script_pos, sub_pos)] -# except: -# print("Missing key?", script_pos, sub_pos) -# break -# # print(decision, subs[sub_pos].line) -# num_used_sub = decision[1] -# num_used_script = decision[2] -# scr_out = get_script(script, script_pos, num_used_script, "") -# scr = get_script(script, script_pos, num_used_script, " ‖ ") - -# if num_used_sub: -# base = get_base(subs, sub_pos, num_used_sub, " ‖ ") -# print("Record:", script_pos, scr, "==", base) -# new_subs.append( -# Subtitle( -# subs[sub_pos].start, subs[sub_pos + num_used_sub - 1].end, scr_out -# ) -# ) -# sub_pos += num_used_sub -# else: -# print("Skip: ", script[script_pos].line) -# script_pos += num_used_script -# elif mode == 2: -# result = [] -# print("Matching subs to sentences. 
This can take a while...") -# bar = tqdm(total=0) -# recursively_find_match(script, subs, result, 0, len(script), 0, len(subs), bar) -# bar.close() -# for i, (script_pos, num_used_script, sub_pos, num_used_sub) in enumerate( -# tqdm(result) -# ): -# if i == 0: -# script_pos = 0 -# sub_pos = 0 - -# if i + 1 < len(result): -# num_used_script = result[i + 1][0] - script_pos -# num_used_sub = result[i + 1][2] - sub_pos -# else: -# num_used_script = len(script) - script_pos -# num_used_sub = len(subs) - sub_pos - -# scr_out = get_script(script, script_pos, num_used_script, "") -# scr = get_script(script, script_pos, num_used_script, " ‖ ") -# base = get_base(subs, sub_pos, num_used_sub, " ‖ ") - -# # print('Record:', script_pos, scr, '==', base) -# new_subs.append( -# Subtitle(subs[sub_pos].start, subs[sub_pos + num_used_sub - 1].end, scr_out) -# ) -# else: -# sys.exit("Unknown mode %d" % mode) - -# write_sub(out, new_subs) - -# def get_args(): -# parser = argparse.ArgumentParser(description="Align a script to vtt subs") -# parser.add_argument( -# "--mode", dest="mode", type=int, default=2, help="matching mode" -# ) -# parser.add_argument( -# "--max-merge", -# dest="max_merge", -# type=int, -# default=6, -# help="max subs to merge into one line", -# ) - -# parser.add_argument( -# "script", type=argparse.FileType("r", encoding="UTF-8"), help="script file path" -# ) -# parser.add_argument( -# "subs", -# type=argparse.FileType("r", encoding="UTF-8"), -# help=".vtt subtitle file path", -# ) -# parser.add_argument( -# "out", -# type=argparse.FileType("w", encoding="UTF-8"), -# help="aligned output file path", -# ) - -# args = parser.parse_args(sys.argv[1:]) -# return args - - -# if __name__ == "__main__": -# args = get_args() -# # "$FOLDER/$SCRIPTNAME.split.txt" "$FOLDER/$TIMINGSUBS" "$FOLDER/matched.vtt" --mode 2 -# run(args.script, args.subs, args.out, args.mode) diff --git a/deprecated/readme.md b/deprecated/readme.md new file mode 100644 index 0000000..5ae612a --- 
/dev/null +++ b/deprecated/readme.md @@ -0,0 +1,91 @@ +# v1 Readme + +# Split m4b by chapter +`./split.sh "/mnt/d/Editing/Audiobooks/かがみの孤城/"` + +# Get a subtitle with synced transcript from split files +`subplz sync -d "/mnt/d/Editing/Audiobooks/かがみの孤城/"` + +`subplz sync -d ""` eg `subplz sync -d "$(wslpath -a "D:\Editing\Audiobooks\かがみの孤城\\")"` or `subplz sync -d "/mnt/d/sync/Harry Potter 1/" "/mnt/d/sync/Harry Potter The Sequel/"` + +# Generate subs for a folder of video or audio files +`python gen.py -d "/mnt/u/Videos/J-Shows/MAG Net/"` + +# Merge split files into a single m4b +`./merge.sh "/mnt/d/Editing/Audiobooks/medium霊媒探偵城塚翡翠"` + +# Merge split files into a single m4b for a library + +This assumes you just have mp4's in a folder like `/mnt/d/Editing/Audiobooks/medium霊媒探偵城塚翡翠`. It will run all of the folders with mp4's and do a check on them after to make sure the chapters line up. Requires `docker` command to be available. + +`python ./helpers/merge.py "/mnt/d/Editing/Audiobooks/"` + +# Anki Support + +- Generates subs2srs style deck +- Imports the deck into Anki automatically + +The Anki support currently takes your m4b file in `` named `.m4b`, where `` is the name of the media, and it outputs srs audio and a TSV file that is sent via AnkiConnect to Anki. This is useful for searching across [GoldenDict](https://www.youtube.com/playlist?list=PLV9y64Yrq5i-1ztReLQQ2oyg43uoeyri-) to find sentences that use a word, or to merge automatically with custom scripts (more releases to support this coming hopefully). + + +1. Install ankiconnect add-on to Anki. +2. I recommend using `ANKICONNECT` as an environment variable. Set `export ANKICONNECT=localhost:8765` or `export ANKICONNECT="$(hostname).local:8765"` in your `~/.zshrc` or bashrc & activate it. +3. Make sure you are in the project directory `cd ./AudiobookTextSync` +4. Install `pip install -r ./requirements.txt` (only needs to be done once) +5.
Set `ANKI_MEDIA_DIR` to your anki profile's media path: `/mnt/f/Anki2/KanjiEater/collection.media/` +6. Run the command below + + + +Command: +`./anki.sh ""` + +Example: +`./anki.sh "/mnt/d/sync/kokoro/"` + + + + +# WSL2 + +If you're using WSL2 there are a few networking quirks. + +1. Enable WSL2 to talk to your Windows machine. https://github.com/microsoft/WSL/issues/4585#issuecomment-610061194 +2. Set your `$ANKICONNECT` url to your windows machine url, `export ANKICONNECT="http://$(hostname).local:8765"`. https://github.com/microsoft/WSL/issues/5211 +3. Make sure inside of Anki's addon config `"webBindAddress": "0.0.0.0", "webBindPort": "8765"`. `0.0.0.0` binds to all network interfaces, so WSL2 can connect. + +# Testing connection to Anki from WSL2 + +``` +curl --header "Content-Type: application/json" \ + --request POST \ + --data '{ "action": "guiBrowse", "version": 6, "params": { "query": "flag:3 is:new -is:suspended -tag:重複 tag:重複3" } }' \ + http://172.18.224.1:8765 +``` +# Troubleshooting +You might see various issues while trying this out in the early state. Here are some of the pieces at work in sequence: +## Stages +1. Filter down audio to improve future results - slow & probably not heavy cpu or gpu usage. Heavier on cpu +2. split_run & stable-ts: Starts off heavy on CPU & RAM to identify the audio spectrum +3. stable-ts: GPU heavy & requires lots of vRAM depending on the model. This is the part with the long taskbar, where it tries to transcribe a text from the audio. Currently the default is [tiny](https://github.com/openai/whisper#available-models-and-languages). Ironically, tiny does a better job of keeping the phrases short, at the cost of accuracy of transcription, which since we are matching a script, doesn't matter. Also it runs 32x faster than large. +4. Merge vtt's for split subs +5. Split the script +6.
match the script to the generated transcription to get good timestamps + +# Getting Book Scripts + +UPDATE: Books now have furigana automatically escaped in txt and epub. You can use calibre though to export them in appropriate formats. + +OLD: +This program supports `txt` files. You may need to use an external program like Calibre to convert your kindle formats like `azw3` to a `txt` or `epub` file. + +To convert in Calibre: +1. Right click on the book and convert the individual book (or use the batch option beneath it) +![image](https://user-images.githubusercontent.com/32607317/226463043-f2f89382-a75f-48ea-bb91-00efe0f05893.png) +2. At the top right for output format, select `txt` +![image](https://user-images.githubusercontent.com/32607317/226463797-1c19385d-c6e7-4564-a795-926e04716562.png) +3. Click Find & Replace. If your book has 《》for furigana as some aozora books do (戦場《せんじょう》), then add a regex. If they have rt for furigana use the rt one: `《(.+?)》` or `(.*?)<\/rt>`. When you copy the regex into the regex box, don't forget to click the Add button +![image](https://user-images.githubusercontent.com/32607317/226463912-48bcfd57-4935-48fb-af7e-13d2a024cdee.png) +4. You can add multiple regexes to strip any extra content or furigana as need be. +![image](https://user-images.githubusercontent.com/32607317/226464346-a752970e-0f1c-42db-b64d-a3bc6df6ebdd.png) +5.
Click ok and convert it & you should now be able to find the file wherever Calibre is saving your books \ No newline at end of file diff --git a/deprecated/split_sentences.py b/deprecated/split_sentences.py deleted file mode 100644 index 4c7369e..0000000 --- a/deprecated/split_sentences.py +++ /dev/null @@ -1,45 +0,0 @@ -import pysbd - -# import sys -from tqdm import tqdm - - -# inputs = [sys.argv[1]] -def split_sentences(inputs): - for file_name in inputs: - with open(file_name, "r", encoding="UTF-8") as file: - ilines = file.readlines() - - seg = pysbd.Segmenter(language="en", clean=False) - with open(file_name + ".split.txt", "w", encoding="UTF-8") as fo: - lines = [] - print("Splitting script into sentences") - for i, text in enumerate(tqdm(ilines)): - # if (i % 10) == 0: - # print('%d/%d' % (i, len(ilines))) - text.rstrip("\n") - lines += seg.segment(text) - - # Fix end of quotes - fixed = [] - for i, line in enumerate(lines): - if i > 0 and len(line) > 0 and line[0] in ["」", "’"]: - fixed[-1] += line[0] - line = line[1:] - if len(line): - fixed.append(line) - lines = fixed - - # Merge short lines with quotes - fixed = [] - for i, line in enumerate(lines): - if len(fixed) > 0: - if (fixed[-1][0] in ["」", "’"]) and len(fixed[-1] + line) <= 1: - fixed[-1] += line - continue - fixed.append(line) - lines = fixed - - for line in lines: - if line != "": - fo.write(line + "\n") diff --git a/deprecated/utils.py b/deprecated/utils.py deleted file mode 100644 index 274fbff..0000000 --- a/deprecated/utils.py +++ /dev/null @@ -1,99 +0,0 @@ -import re -from natsort import os_sorted -from glob import glob, escape -from os import path -import json - -audio_formats = ['aac', 'ac3', 'alac', 'ape', 'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'm4b'] -video_formats = ['3g2', '3gp', 'avi', 'flv', 'm4v', 'mkv', 'mov', 'mp4', 'mpeg', 'webm'] -subtitle_formats = ['ass', 'srt', 'vtt'] - - -class Subtitle: - def __init__(self, start, end, line): - self.start = start - self.end = end - 
self.line = line - -def check_workdir_content(workdir, formats): - workdir_stripped = path.basename(path.normpath(workdir)) - - files = [] - for format in formats: - result = glob(f"{workdir.rstrip('/')}/*{workdir_stripped}.{format}") - if len(result) > 0: - files.append(result) - - return len(files) > 0 - -def remove_tags(line): - return re.sub("<[^>]*>", "", line) - -def get_lines(file): - for line in file: - yield line.rstrip("\n") - -def read_vtt(file): - lines = get_lines(file) - - subs = [] - header = next(lines) - assert header == "WEBVTT" - # assert next(lines) == "Kind: captions" - # assert next(lines).startswith("Language:") - assert next(lines) == "" - - last_sub = " " - - while True: - # for t in range(0, 10): - line = next(lines, None) - if line == None: # EOF - break - # print(line) - m = re.findall( - r"(\d\d:\d\d:\d\d.\d\d\d) --> (\d\d:\d\d:\d\d.\d\d\d)|(\d\d:\d\d.\d\d\d) --> (\d\d:\d\d.\d\d\d)|(\d\d:\d\d.\d\d\d) --> (\d\d:\d\d:\d\d.\d\d\d)", - line, - ) - if not m: - print( - f'Warning: Line "{line}" did not look like a valid VTT input. 
There could be issues parsing this sub' - ) - continue - - matchPair = [list(filter(None, x)) for x in m][0] - sub_start = matchPair[0] # .replace('.', ',') - sub_end = matchPair[1] - - line = next(lines) - while line: - sub = remove_tags(line) - if last_sub != sub and sub not in [" ", "[音楽]"]: - last_sub = sub - # print("sub:", sub_start, sub_end, sub) - subs.append(Subtitle(sub_start, sub_end, sub)) - elif last_sub == sub and subs: - subs[-1].end = sub_end - # print("Update sub:", subs[-1].start, subs[-1].end, subs[-1].line) - try: - line = next(lines) - except StopIteration: - line = None - - return subs - -def write_sub(output_file_path, subs): - with open(output_file_path, "w", encoding="utf-8") as outfile: - outfile.write("WEBVTT\n\n") - for n, sub in enumerate(subs): - # outfile.write('%d\n' % (n + 1)) - outfile.write("%s --> %s\n" % (sub.start, sub.end)) - outfile.write("%s\n\n" % (sub.line)) - - - -def get_mapping(mapping_path): - with open(mapping_path) as f: - mapping = json.load(f) - print(f"Reading mapping: {mapping}") - return mapping diff --git a/pyproject.toml b/pyproject.toml index b326d63..e2fce82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ classifiers = [ dependencies = [ "natsort ~= 8.2.0", "tqdm~=4.66.3", + "pysbd~=0.3.4", "ats@git+https://github.com/kanjieater/AudiobookTextSync#egg=master" # "ats @ file:///home/ke/code/AudiobookTextSync" ] diff --git a/readme.md b/readme.md index 210e721..6cfbc8d 100644 --- a/readme.md +++ b/readme.md @@ -6,11 +6,11 @@ https://user-images.githubusercontent.com/32607317/219973521-5a5c2bf2-4df1-422b- Generate accurate subtitles from audio, align existing subs to videos, generate your own Kindle's Immersion Reading like audiobook subs. -This tool allows you to use AI models to generate subtitles from only audio, then match the subtitles to an accurate text, like a book. You can also just generate subtitles for videos with it, without needing any existing subtitles.
Soon, it will support syncronizing existing subs as well. Currently I am only developing this tool for Japanese use. +This tool allows you to use AI models to generate subtitles from only audio, then match the subtitles to an accurate text, like a book. It supports synchronizing existing subs as well. Soon, you can also just generate subtitles for videos with it, without needing any existing subtitles. Currently I am only developing this tool for Japanese use, though rumor has it, the `language` flag can be used for other languages too. -It requires a modern GPU with decent VRAM, CPU, and RAM. There's also a communty built Google Colab notebook available on discord. +It requires a modern GPU with decent VRAM, CPU, and RAM. There's also a community built Google Colab notebook available on discord. -Current State: The transcript will be extremely accurate. The timings will be mostly accurate, but may come late or leave early. Accuracy has improved tremendously with the latest updates to the AI tooling used. Sometimes the first few lines will be off slightly, but will quickly autocorrect. +Current State: The transcript will be extremely accurate. The timings will be mostly accurate, but may come late or leave early. Accuracy has improved tremendously with the latest updates to the AI tooling used. Sometimes the first few lines will be off slightly, but will quickly autocorrect. If it gets off midway, it autocorrects. Sometimes multiple lines get bundled together making large subtitles, but it's not usually an issue. Support for this tool can be found [on KanjiEater's thread](https://discord.com/channels/617136488840429598/1076966013268148314) [on The Moe Way Discord](https://learnjapanese.moe/join/) @@ -118,6 +118,11 @@ Currently supports Docker (preferred), Windows, and unix based OS's like Ubuntu - This can be GPU intense, RAM intense, and CPU intense script part. `subplz sync -d ""` eg `subplz sync -d "/mnt/d/Editing/Audiobooks/かがみの孤城/"`.
This runs each file to get a character level transcript. It then creates a sub format that can be matched to the `script.txt`. Each character level subtitle is merged into a phrase level, and your result should be a `.srt` file. The video or audio file then can be watched with `MPV`, playing audio in time with the subtitle. - Users with a modern CPU with lots of threads won't notice much of a difference between using CUDA/GPU & CPU +# Sort Order +By default, the `-d` parameter will pick up the supported files in the directory(s) given. Ensure that your OS sorts them in an order that you would want them to be patched together in. Sort them by name, and as long as all of the audio files are in order and all of the text files are in the same order, they'll be "zipped" up individually with each other. + +The exception to this is if we find one transcript and multiple audio files. We'll assume that's something like a bunch of `mp3`s or other audio files that you want to sync to a single transcript like an `epub` or `txt`. + # Overwrite By default the tool will overwrite any existing srt named after the audio file's name. If you don't want it to do this you must explicitly tell it not to. @@ -128,96 +133,21 @@ SubPlz writes a file in the same folder to the audio with the `.subpl Alternatively you can use the flag `--rerun` to ignore these files. If you want to prevent them from being created, you can run the tool with `--no-rerun-files`. +# Respect Transcript Grouping +By default, the `sync` command will trust the original transcript and group lines based on it. If you want to allow the tool to break lines up into smaller chunks, you can use this flag. 
`--no-respect-grouping` -# Split m4b by chapter -`./split.sh "/mnt/d/Editing/Audiobooks/かがみの孤城/"` - -# Get a subtitle with synced transcript from split files -`subplz sync -d "/mnt/d/Editing/Audiobooks/かがみの孤城/"` - -`subplz sync -d ""` eg `subplz sync -d "$(wslpath -a "D:\Editing\Audiobooks\かがみの孤城\\")"` or `subplz sync -d "/mnt/d/sync/Harry Potter 1/" "/mnt/d/sync/Harry Potter The Sequel/"` - -# Generate subs for a folder of video or audio file -`python gen.py -d "/mnt/u/Videos/J-Shows/MAG Net/"` - -# Merge split files into a single m4b -`./merge.sh "/mnt/d/Editing/Audiobooks/medium霊媒探偵城塚翡翠"` - -# Merge split files into a single m4b for a library - -This assumes you just have mp4's in a folder like `/mnt/d/Editing/Audiobooks/medium霊媒探偵城塚翡翠`. It will run all of the folder's with mp4's and do a check on them after to make sure the chapters line up. Requires `docker` command to be available. - -`python ./helpers/merge.py "/mnt/d/Editing/Audiobooks/"` - -# Anki Support - -- Generates subs2srs style deck -- Imports the deck into Anki automatically - -The Anki support currently takes your m4b file in `` named `.m4b`, where `` is the name of the media, and it outputs srs audio and a TSV file that can is sent via AnkiConnect to Anki. This is useful for searching across [GoldenDict](https://www.youtube.com/playlist?list=PLV9y64Yrq5i-1ztReLQQ2oyg43uoeyri-) to find sentences that use a word, or to merge automatically with custom scripts (more releases to support this coming hopefully). - +# Tuning Recommendations +For different use cases, different parameters may be optimal. -1. Install ankiconnect add-on to Anki. -2. I recommend using `ANKICONNECT` as an environment variable. Set `export ANKICONNECT=localhost:8755` or `export ANKICONNECT="$(hostname).local:8765"` in your `~/.zshrc` or bashrc & activate it. -3. Make sure you are in the project directory `cd ./AudiobookTextSync` -4. Install `pip install ./requirements.txt` (only needs to be done once) -5. 
Set `ANKI_MEDIA_DIR` to your anki profile's media path: `/mnt/f/Anki2/KanjiEater/collection.media/` -6. Run the command below +## For Audiobooks +- A chapter `m4b` file will allow us to split up the audio and do things in parallel +- There can be slight variations between `epub` and `txt` files, like where full character spaces aren't picked up in `epub` but are in `txt`. A chaptered `epub` may be faster, but you can have more control over what text gets synced from a `txt` file if you need to manually remove things (but `epub` is still probably the easier option, and very reliable) +- If the audio and the text differ greatly - like full sections of the book are read in different order, you will want to use `--no-respect-grouping` to let the algorithm remove content for you +- The default `--model "tiny"` seems to work well, and is much faster than other models. If your transcript is inaccurate, consider using a larger model to compensate - - -Command: -`./anki.sh ""` - -Example: -`./anki.sh "/mnt/d/sync/kokoro/"` - - - - -# WSL2 - -If you're using WSL2 there a few networking quirks. - -1. Enable WSL2 to talk to your Windows machine. https://github.com/microsoft/WSL/issues/4585#issuecomment-610061194 -2. Set your `$ANKICONNECT` url to your windows machine url, `export ANKICONNECT="http://$(hostname).local:8765"`. https://github.com/microsoft/WSL/issues/5211 -3. Make sure inside of Anki's addon config `"webBindAddress": "0.0.0.0", "webBindPort": "8765"`. `0.0.0.0` binds to all network interfaces, so WSL2 can connect. - -# Testing connection to Anki from WSL2 - -``` -curl --header "Content-Type: application/json" \ - --request POST \ - --data '{ "action": "guiBrowse", "version": 6, "params": { "query": "flag:3 is:new -is:suspended -tag:重複 tag:重複3" } }' \ - http://172.18.224.1:8765 -``` -# Troubleshooting -You might see various issues while trying this out in the early state. Here are some of the pieces at work in sequence: -## Stages -1. 
Filter down audio to improve future results - slow & probably not heavy cpu or gpu usage. Heavier on cpu -2. split_run & stable-ts: Starts off heavy on CPU & RAM to identify the audio spectrum -3. stable-ts: GPU heavy & requires lots of vRAM depending on the model. This is the part with the long taskbar, where it tries to transcribe a text from the audio. Currently the default is [tiny](https://github.com/openai/whisper#available-models-and-languages). Ironically tiny, does a better job of keeping the phrases short, at the cost of accuracy of transcription, which since we are matching a script, doesn't matter. Also it runs 32x faster than large. -4. Merge vtt's for split subs -5. Split the script -6. match the script to the generated transcription to get good timestamps - -# Getting Book Scripts - -UPDATE: Books now have furigana automatically escaped in txt and epub. You can use calibre though to export them in appropriate formats. - -OLD: -This program supports `txt` files. You may need to use an external program like Calibre to convert your kindle formats like `azw3` to a `txt` of `epub` file. - -To convert in Calibre: -1. Right click on the book and convert the individual book (or use the batch option beneath it) -![image](https://user-images.githubusercontent.com/32607317/226463043-f2f89382-a75f-48ea-bb91-00efe0f05893.png) -2. At the top right for output format, select `txt` -![image](https://user-images.githubusercontent.com/32607317/226463797-1c19385d-c6e7-4564-a795-926e04716562.png) -3. Click Find & Replace. If your book has 《》for furigana as some aozora books do (戦場《せんじょう》), then add a regex. If they have rt for furigana use the rt one: `《(.+?)》` or `(.*?)<\/rt>`. When you copy the regex into the regex box, don't forget to click the Add button -![image](https://user-images.githubusercontent.com/32607317/226463912-48bcfd57-4935-48fb-af7e-13d2a024cdee.png) -4. You can add multiple regexes to strip any extra content or furigana as need be. 
-![image](https://user-images.githubusercontent.com/32607317/226464346-a752970e-0f1c-42db-b64d-a3bc6df6ebdd.png) -5. Click ok and convert it & you should now be able to find the file wherever Calibre is saving your books +## For Realigning Subtitles +- Highly recommend running with something like `--model "large-v3"` as subtitles often have sound effects or other things that won't be picked up by transcription models. By using a large model, it will take much longer (a 24 min episode can go from 30 seconds to 4 mins for me), but it will be much more accurate. +- Subs can be cut off in strange ways if you have an unreliable transcript, so by default `--respect-grouping` is on. If you find your subs frequently have very long subtitle lines, consider using `--no-respect-grouping` # Thanks diff --git a/subplz/align.py b/subplz/align.py new file mode 100644 index 0000000..c1c6036 --- /dev/null +++ b/subplz/align.py @@ -0,0 +1,307 @@ +from rapidfuzz import fuzz +import re +from datetime import datetime +from tqdm import tqdm +from ats.main import Segment +from subplz.files import get_tmp_path + +MAX_MERGE_COUNT = ( + 25 +) # Larger gives better results, but takes longer to process. 
+MAX_SEARCH_CONTEXT = MAX_MERGE_COUNT * 2 + +# Trim script for quick testing +# script = script[:500] +# subs = subs[:1000] + +# Use dynamic programming to pick best subs mapping +memo = {} + + +class ScriptLine: + def __init__(self, line): + self.text = line + # self.txt = re.sub("「|」|『|』|、|。|・|?|…|―|─|!|(|)", "", line) + + def __repr__(self): + return "ScriptLine(%s)" % self.text + + +def read_script(file): + for line in file: + line = line.rstrip("\n") + if line == "": + continue + yield line + + +def get_script(script, script_pos, num_used, sep=""): + end = min(len(script), script_pos + num_used) + return sep.join([sub.text for sub in script[script_pos:end]]) + + +def get_base(subs, sub_pos, num_used, sep=""): + end = min(len(subs), sub_pos + num_used) + return sep.join([sub.text for sub in subs[sub_pos:end]]) + + +def get_best_sub_n( + script, subs, script_pos, num_used_script, last_script_pos, sub_pos, max_subs, last_sub_to_test +): + t_best_score = 0 + t_best_used_sub = 1 + + line = get_script(script, script_pos, num_used_script) + + remaining_subs = last_sub_to_test - sub_pos + + for num_used_sub in range(1, min(max_subs, remaining_subs) + 1): + base = get_base(subs, sub_pos, num_used_sub) + curr_score = fuzz.ratio(base, line) / 100.0 * min(len(line), len(base)) + tot_score = curr_score + calc_best_score( + script, + subs, + script_pos + num_used_script, + last_script_pos, + sub_pos + num_used_sub, + last_sub_to_test, + ) + if tot_score > t_best_score: + t_best_score = tot_score + t_best_used_sub = num_used_sub + + return (t_best_score, t_best_used_sub) + + +best_script_score_and_sub = {} + + +def calc_best_score(script, subs, script_pos, last_script_pos, sub_pos, last_sub_to_test): + if script_pos >= len(script) or sub_pos >= len(subs): + return 0 + + key = (script_pos, sub_pos) + if key in memo: + return memo[key][0] + + best_score = 0 + best_used_sub = 1 + best_used_script = 1 + + remaining_script = last_script_pos - script_pos + + for num_used_script in 
range(1, min(MAX_MERGE_COUNT, remaining_script) + 1): + max_subs = MAX_MERGE_COUNT if num_used_script == 1 else 1 + t_best_score, t_best_used_sub = get_best_sub_n( + script, + subs, + script_pos, + num_used_script, + last_script_pos, + sub_pos, + max_subs, + last_sub_to_test, + ) + + if t_best_score > best_score: + best_score = t_best_score + best_used_sub = t_best_used_sub + best_used_script = num_used_script + + if best_used_script > 1: + # Do one more fitting + t_best_score, t_best_used_sub = get_best_sub_n( + script, + subs, + script_pos, + best_used_script, + last_script_pos, + sub_pos, + MAX_MERGE_COUNT, + last_sub_to_test, + ) + if t_best_score > best_score: + best_score = t_best_score + best_used_sub = t_best_used_sub + + key = (script_pos, sub_pos) + memo[key] = (best_score, best_used_sub, best_used_script) + + # Save best sub pos for this script pos + best_prev_score, best_sub = best_script_score_and_sub.get(script_pos, (0, None)) + if best_score >= best_prev_score: + best_script_score_and_sub[script_pos] = (best_score, key) + + return best_score + + +def get_best_sub_path(script_pos, n, last_script_pos, last_sub_to_test): + _, key = best_script_score_and_sub[script_pos] + ret = [] + sub_pos = key[1] + + i = 0 + while i < n and script_pos < last_script_pos and sub_pos < last_sub_to_test: + ret.append((script_pos, sub_pos)) + decision = memo[(script_pos, sub_pos)] + num_used_sub = decision[1] + num_used_script = decision[2] + sub_pos += num_used_sub + script_pos += num_used_script + i += 1 + return ret + + +def test_sub_pos(script, subs, script_pos, last_script_pos, first_sub_to_test, last_sub_to_test): + for sub_pos in range(last_sub_to_test - 1, first_sub_to_test - 1, -1): + calc_best_score(script, subs, script_pos, last_script_pos, sub_pos, last_sub_to_test) + + +def recursively_find_match(script, subs, result, first_script, last_script, first_sub, last_sub, bar=None): + if bar is None: + bar = tqdm(total=1, position=0, leave=True) + + if first_script 
== last_script or first_sub == last_sub: + bar.close() + return + + memo.clear() + best_script_score_and_sub.clear() + + mid = (first_script + last_script) // 2 + start = max(first_script, mid - MAX_SEARCH_CONTEXT) + end = min(mid + MAX_SEARCH_CONTEXT, last_script) + + for script_pos in tqdm(range(end - 1, start - 1, -1), position=1, leave=False): + test_sub_pos(script, subs, script_pos, end, first_sub, last_sub) + + best_path = get_best_sub_path(start, end - start, end, last_sub) + if len(best_path) > 0: + for p in best_path: + if p[0] > mid: + break + mid_key = p + + mid_memo = memo[mid_key] + script_pos = mid_key[0] + sub_pos = mid_key[1] + num_used_script = mid_memo[2] + num_used_sub = mid_memo[1] + + recursively_find_match( + script, subs, result, first_script, script_pos, first_sub, sub_pos, bar + ) + + scr_out = get_script(script, script_pos, num_used_script, "") + scr = get_script(script, script_pos, num_used_script, " ‖ ") + base = get_base(subs, sub_pos, num_used_sub, " ‖ ") + + result.append((script_pos, num_used_script, sub_pos, num_used_sub)) + + recursively_find_match( + script, subs, result, script_pos + num_used_script, last_script, sub_pos + num_used_sub, last_sub, bar + ) + bar.close() + +def remove_tags(line): + return re.sub("<[^>]*>", "", line) + +def get_lines(file): + for line in file: + yield line.rstrip("\n") + +def read_subtitles(file): + lines = get_lines(file) + subs = [] + first_line = next(lines) + is_vtt = first_line == "WEBVTT" + if is_vtt: + assert next(lines) == "" + last_sub = " " + while True: + line = next(lines, None) + if line is None: # EOF + break + # Match timestamp lines for both VTT and SRT formats + m = re.findall( + r"(\d\d:\d\d:\d\d.\d\d\d) --> (\d\d:\d\d:\d\d.\d\d\d)|(\d\d:\d\d.\d\d\d) --> (\d\d:\d\d.\d\d\d)|(\d\d:\d\d.\d\d\d) --> (\d\d:\d\d:\d\d.\d\d\d)|(\d\d:\d\d:\d\d,\d\d\d) --> (\d\d:\d\d:\d\d,\d\d\d)|(\d\d:\d\d,\d\d\d) --> (\d\d:\d\d,\d\d\d)|(\d\d:\d\d,\d\d\d) --> (\d\d:\d\d:\d\d,\d\d\d)", + line, + ) + if not m: 
+ if not line.isdigit() and line: + print( + f'Warning: Line "{line}" did not look like a valid VTT/SRT input. There could be issues parsing this sub' + ) + continue + + match_pair = [list(filter(None, x)) for x in m][0] + sub_start = match_pair[0].replace(',', '.') # Convert SRT to VTT format + sub_end = match_pair[1].replace(',', '.') + + # Read the subtitle text + line = next(lines) + sub_text = [] + while line: + sub_text.append(remove_tags(line)) + try: + line = next(lines) + except StopIteration: + line = None + if line == "": + break + + sub = ' '.join(sub_text).strip() + if sub and last_sub != sub and sub not in [" ", "[音楽]"]: + last_sub = sub + subs.append(Segment(sub, sub_start, sub_end)) + elif last_sub == sub and subs: + subs[-1].end = sub_end + + return subs + + +def to_float(time_str): + time_obj = datetime.strptime(time_str, '%H:%M:%S.%f') + time_delta = time_obj - datetime(1900, 1, 1) + float_time = time_delta.total_seconds() + return float_time + + + +def nc_align(split_script, subs_file): + with open(split_script, encoding='utf-8') as s: + script = [ScriptLine(line.strip()) for line in read_script(s)] + print(subs_file) + with open(subs_file, encoding='utf-8') as vtt: + subs = read_subtitles(vtt) + new_subs = [] + + result = [] + print("🤝 Grouping based on transcript...") + bar = tqdm(total=0) + recursively_find_match(script, subs, result, 0, len(script), 0, len(subs), bar) + bar.close() + for i, (script_pos, num_used_script, sub_pos, num_used_sub) in enumerate( + tqdm(result) + ): + if i == 0: + script_pos = 0 + sub_pos = 0 + + if i + 1 < len(result): + num_used_script = result[i + 1][0] - script_pos + num_used_sub = result[i + 1][2] - sub_pos + else: + num_used_script = len(script) - script_pos + num_used_sub = len(subs) - sub_pos + + scr_out = get_script(script, script_pos, num_used_script, "") + scr = get_script(script, script_pos, num_used_script, " ‖ ") + base = get_base(subs, sub_pos, num_used_sub, " ‖ ") + + # print('Record:', script_pos, 
scr, '==', base) + new_subs.append( + Segment(scr_out, to_float(subs[sub_pos].start), to_float(subs[sub_pos + num_used_sub - 1].end)) + ) + + return new_subs diff --git a/subplz/cli.py b/subplz/cli.py index f0eb304..e022c13 100644 --- a/subplz/cli.py +++ b/subplz/cli.py @@ -67,6 +67,12 @@ def setup_advanced_cli(parser): ) # Behaviors + optional_group.add_argument( + "--respect-grouping", + default=True, + help="Keep the lines in the same subtitle together, instead of breaking them apart", + action=argparse.BooleanOptionalAction, + ) optional_group.add_argument( "--overwrite", default=True, @@ -318,6 +324,8 @@ class backendParams: device: str # UI progress: bool + # Behavior + respect_grouping: bool # General Whisper language: str model_name: str @@ -393,6 +401,7 @@ def get_inputs(): fast_decoder=args.fast_decoder, fast_decoder_overlap=args.fast_decoder_overlap, fast_decoder_batches=args.fast_decoder_batches, + respect_grouping=args.respect_grouping, ), cache=SimpleNamespace( overwrite_cache=args.overwrite_cache, @@ -408,6 +417,7 @@ def get_inputs(): overwrite=args.overwrite, rerun=args.rerun, rerun_files=args.rerun_files, + lang=args.language, ), ) validate_source_inputs(inputs.sources) diff --git a/subplz/files.py b/subplz/files.py index f238acb..86d237c 100644 --- a/subplz/files.py +++ b/subplz/files.py @@ -10,15 +10,14 @@ from pprint import pformat from ats.main import ( TextFile, - Epub, AudioStream, TextFile, TextParagraph, write_srt, write_vtt, ) -from functools import partial import ffmpeg +from subplz.text import split_sentences, split_sentences_from_input, Epub AUDIO_FORMATS = [ "aac", @@ -68,12 +67,12 @@ def get_matching_audio_stream(streams, lang): stream for stream in streams if stream.get("codec_type", None) == "audio" ] audio_lang = lang - if lang == 'ja': # TODO support other languages - audio_lang = 'jpn' + if lang == "ja": # TODO support other languages + audio_lang = "jpn" target_streams = [ stream for stream in audio_streams - if 
stream.get("tags", {}).get("language", None) == audio_lang + if stream.get("tags", {}).get("language", "").lower() == audio_lang.lower() ] return next((stream for stream in target_streams + audio_streams), None) @@ -161,6 +160,7 @@ class sourceData: writer: Writer chapters: List streams: List + lang: str def grab_files(folder, types, sort=True): @@ -179,17 +179,12 @@ def get_streams(audio): return streams -def convert_to_srt(file_path, output_path): - - stream = ffmpeg.input(str(file_path)) - stream = ffmpeg.output( - stream, - str(output_path), - vn=None, - loglevel="error", - ).global_args("-hide_banner") - - return ffmpeg.run(stream, overwrite_output=True) +def convert_sub_format(full_original_path, full_sub_path): + stream = ffmpeg.input(full_original_path) + stream = ffmpeg.output(stream, full_sub_path, loglevel="error").global_args( + "-hide_banner" + ) + ffmpeg.run(stream, overwrite_output=True) def remove_timing_and_metadata(srt_path, txt_path): @@ -219,13 +214,13 @@ def normalize_text(file_path): filename = file_path.stem srt_path = get_tmp_path(file_path.parent / f"{filename}.srt") txt_path = get_tmp_path(file_path.parent / f"{filename}.txt") - convert_to_srt(file_path, srt_path) + convert_sub_format(str(file_path), str(srt_path)) txt_path = remove_timing_and_metadata(srt_path, txt_path) srt_path.unlink() return str(txt_path) -def get_chapters(text: List[str]): +def get_chapters(text: List[str], lang): # print("📖 Finding chapters...") #log sub_exts = ["." 
+ extension for extension in SUBTITLE_FORMATS] chapters = [] @@ -234,10 +229,17 @@ def get_chapters(text: List[str]): file_ext = splitext(file_name)[-1].lower() if file_ext == ".epub": - chapters.append((file_name, Epub.from_file(file_path))) + txt_path = get_tmp_path(Path(file_path).parent / f"{Path(file_path).stem}.txt") + epub = Epub.from_file(file_path) + chapters.append((txt_path, epub.chapters)) + split_sentences_from_input([p.text() for p in epub.text()], txt_path, lang) + # chapters.append((txt_path, [TextFile(path=file_path, title=file_name)])) + elif file_ext in sub_exts: try: txt_path = normalize_text(file_path) + split_sentences(txt_path, txt_path, lang) + except ffmpeg.Error as e: print( f"Failed to normalize the subs. We can't process them. Try to get subs from a different source and try again: {e}" @@ -245,7 +247,9 @@ def get_chapters(text: List[str]): return [] chapters.append((txt_path, [TextFile(path=txt_path, title=file_name)])) else: - chapters.append((file_name, [TextFile(path=file_path, title=file_name)])) + txt_path = get_tmp_path(Path(file_path).parent / f"{Path(file_path).stem}.txt") + split_sentences(file_path, txt_path, lang) + chapters.append((txt_path, [TextFile(path=file_path, title=file_name)])) return chapters @@ -255,15 +259,6 @@ def get_working_folders(dirs): if not isdir(dir): raise Exception(f"{dir} is not a valid directory") full_folder = join(dir, "") - # content_name = get_content_name(dir) - # split_folder = path.join(full_folder, f"{content_name}_splitted") - - # if path.exists(split_folder) and path.isdir(split_folder): - # working_folder = split_folder - # print( - # f"Warning: Using split files causes a fixed delay for every split file. This is a known bug. 
Use the single file method instead" - # ) - # else: working_folder = full_folder working_folders.append(working_folder) return working_folders @@ -280,12 +275,8 @@ def get_text(folder): return os_sorted(text) -def get_output_dir(folder, output_format): - pass - - def setup_output_dir(output_dir, first_audio=None): - if(not output_dir and first_audio): + if not output_dir and first_audio: output_dir = Path(first_audio).parent Path(output_dir).mkdir(parents=True, exist_ok=True) return output_dir @@ -295,22 +286,28 @@ def get_output_full_paths(audio, output_dir, output_format): return [Path(output_dir) / f"{Path(a).stem}.{output_format}" for a in audio] -def match_files(audios, texts, folder): - already_run = get_existing_rerun_files(folder) - already_run_text_paths = [] - already_run_audio_paths = [] - for ar in already_run: - arPath = Path(ar) - removed_second_stem = Path(arPath.stem).stem - already_run_audio_paths.append(str(arPath.parent / removed_second_stem)) - already_run_text_paths.append(str(arPath.parent / arPath.stem)) - destemed_audio = [str(Path(audio).parent / Path(audio).stem) for audio in audios] - audios_unique = list(set(destemed_audio) - set(already_run_audio_paths)) - texts_filtered = list(set(texts) - set(already_run_text_paths)) - texts_filtered.sort(key=lambda x: texts.index(x)) - audios_filtered = [ - a for a in audios if str(Path(a).parent / Path(a).stem) in audios_unique - ] +def match_files(audios, texts, folder, rerun): + if rerun: + audios_filtered = audios + texts_filtered = texts + else: + already_run = get_existing_rerun_files(folder) + already_run_text_paths = [] + already_run_audio_paths = [] + for ar in already_run: + arPath = Path(ar) + removed_second_stem = Path(arPath.stem).stem + already_run_audio_paths.append(str(arPath.parent / removed_second_stem)) + already_run_text_paths.append(str(arPath.parent / arPath.stem)) + destemed_audio = [ + str(Path(audio).parent / Path(audio).stem) for audio in audios + ] + audios_unique = 
list(set(destemed_audio) - set(already_run_audio_paths)) + texts_filtered = list(set(texts) - set(already_run_text_paths)) + texts_filtered.sort(key=lambda x: texts.index(x)) + audios_filtered = [ + a for a in audios if str(Path(a).parent / Path(a).stem) in audios_unique + ] if len(audios_filtered) > 1 and len(texts_filtered) == 1 and len(already_run) == 0: print("🤔 Multiple audio files found, but only one text...") return [audios_filtered], [[t] for t in texts_filtered] @@ -329,7 +326,7 @@ def get_sources_from_dirs(input): for folder in working_folders: audios = get_audio(folder) texts = get_text(folder) - a, t = match_files(audios, texts, folder) + a, t = match_files(audios, texts, folder, input.rerun) for matched_audio, matched_text in zip(a, t): output_full_paths = get_output_full_paths( matched_audio, folder, input.output_format @@ -337,7 +334,7 @@ def get_sources_from_dirs(input): writer = Writer(input.output_format) streams = get_streams(matched_audio) - chapters = get_chapters(matched_text) + chapters = get_chapters(matched_text, input.lang) s = sourceData( dirs=input.dirs, audio=matched_audio, @@ -351,6 +348,7 @@ def get_sources_from_dirs(input): writer=writer, chapters=chapters, streams=streams, + lang=input.lang, ) sources.append(s) return sources @@ -365,7 +363,7 @@ def setup_sources(input) -> List[sourceData]: input.audio, output_dir, input.output_format ) writer = Writer(input.output_format) - chapters = get_chapters(input.text) + chapters = get_chapters(input.text, input.lang) streams = get_streams(input.audio) sources = [ sourceData( @@ -381,6 +379,7 @@ def setup_sources(input) -> List[sourceData]: writer=writer, streams=streams, chapters=chapters, + lang=input.lang, ) ] return sources @@ -488,7 +487,9 @@ def post_process(sources: List[sourceData]): print(f"❗ No text matched for '{source.text}'") if not sources: - print("""😐 We didn't do anything. This may or may not be intentional""") + print( + """😐 We didn't do anything. 
This may or may not be intentional. If this was unintentional, check if you had a .subplz file preventing rerun""" + ) elif complete_success: print("🎉 Everything went great!") else: diff --git a/subplz/sync.py b/subplz/sync.py index d01a47d..7a68d6c 100644 --- a/subplz/sync.py +++ b/subplz/sync.py @@ -7,8 +7,9 @@ ) import warnings from ats import align - from ats.lang import get_lang + +from subplz.align import nc_align from subplz.files import sourceData from subplz.utils import get_tqdm @@ -107,3 +108,8 @@ def sync(source: sourceData, model, streams, cache, be): if not segments: continue source.writer.write_sub(segments, source.output_full_paths[ai]) + if(len(source.chapters) == 1 and be.respect_grouping): + new_segments = nc_align(chapters[0][0], source.output_full_paths[ai]) + source.writer.write_sub(new_segments, source.output_full_paths[ai]) + + diff --git a/subplz/text.py b/subplz/text.py new file mode 100644 index 0000000..a32c1fd --- /dev/null +++ b/subplz/text.py @@ -0,0 +1,168 @@ +from pathlib import Path +from bs4 import element +from bs4 import BeautifulSoup +from dataclasses import dataclass +from ebooklib import epub +import urllib + +import pysbd +from tqdm import tqdm + + +def fix_end_of_quotes(lines): + fixed = [] + for i, line in enumerate(lines): + if i > 0 and line and line[0] in ["」", "’"]: + fixed[-1] += line[0] + line = line[1:] + if line: + fixed.append(line) + return fixed + + +def merge_short_lines_with_quotes(lines): + fixed = [] + for line in lines: + if fixed and fixed[-1][0] in ["」", "’"] and len(fixed[-1] + line) <= 1: + fixed[-1] += line + else: + fixed.append(line) + return fixed + + +def split_sentences(input_file, output_path, lang): + # for file_name in input_file: + with open(input_file, "r", encoding="UTF-8") as file: + input_lines = file.readlines() + + split_sentences_from_input(input_lines, output_path, lang) + +def get_segments(input_lines, lang): + seg = pysbd.Segmenter(language=lang, clean=False) + lines = [] + print("✂️ 
Splitting transcript into sentences") + for text in tqdm(input_lines): + text = text.rstrip("\n") + s = seg.segment(text) + lines += s + return lines + +def split_sentences_from_input(input_lines, output_path, lang): + lines = get_segments(input_lines, lang) + lines = fix_end_of_quotes(lines) + lines = merge_short_lines_with_quotes(lines) + + with open(output_path, "w", encoding="utf-8") as fo: + for line in lines: + if line: + fo.write(line + "\n") + + +def flatten(t): + return ( + [j for i in t for j in flatten(i)] + if isinstance(t, (tuple, list)) + else [t] if isinstance(t, epub.Link) else [] + ) + + +@dataclass(eq=True, frozen=True) +class EpubParagraph: + chapter: int + element: element.Tag + references: list + + def text(self): + return "".join(self.element.strings) + + +@dataclass(eq=True, frozen=True) +class EpubChapter: + content: BeautifulSoup + title: str + is_linear: bool + idx: int + + def text(self): + paragraphs = self.content.find("body").find_all( + ["p", "li", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"] + ) + r = [] + for p in paragraphs: + if "id" in p.attrs: + continue + r.append(EpubParagraph(chapter=self.idx, element=p, references=[])) + return r + + +@dataclass(eq=True, frozen=True) +class Epub: + epub: epub.EpubBook + path: Path + title: str + chapters: list + + def text(self): + return [p for c in self.chapters for p in c.text()] + + @classmethod + def from_file(cls, path): + file = epub.read_epub(path, {"ignore_ncx": True}) + + flat_toc = flatten(file.toc) + m = { + it.id: i + for i, e in enumerate(flat_toc) + if ( + it := file.get_item_with_href( + urllib.parse.unquote(e.href.split("#")[0]) + ) + ) + } + if len(m) != len(flat_toc): + print( + "WARNING: Couldn't fully map toc to chapters, contact the dev, preferably with the epub" + ) + + chapters = [] + prev_title = "" + for i, v in enumerate(file.spine): + item = file.get_item_with_id(v[0]) + title = flat_toc[m[v[0]]].title if v[0] in m else "" + + if item.media_type != 
"application/xhtml+xml": + if title: + prev_title = title + continue + + content = BeautifulSoup(item.get_content(), "html.parser") + + r = content.find("body").find_all( + ["p", "li", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"] + ) + # Most of the time chapter names are on images + idx = 0 + while idx < len(r) and not r[idx].get_text().strip(): + idx += 1 + if idx >= len(r): + if title: + prev_title = title + continue + + if not title: + if t := prev_title.strip(): + title = t + prev_title = "" + elif len(t := r[idx].get_text().strip()) < 25: + title = t + else: + title = item.get_name() + + chapter = EpubChapter(content=content, title=title, is_linear=v[1], idx=i) + chapters.append(chapter) + return cls( + epub=file, + path=path, + title=file.title.strip() or path.name, + chapters=chapters, + )