Skip to content

Commit c084ff6

Browse files
committed
add an alternative local method to compress and stretch cues
1 parent 24eb816 commit c084ff6

File tree

6 files changed

+178
-6
lines changed

6 files changed

+178
-6
lines changed

requirements-arm64.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ PyYAML>=4.2b1
4949
rsa==4.7
5050
scipy<1.11.0
5151
scikit-learn<1.2.0
52-
setuptools>=41.0.0
52+
setuptools>=41.0.0,<65.0.0
5353
six~=1.15.0
5454
tensorflow-macos~=2.12.0
5555
termcolor==1.1.0

requirements-stretch.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
aeneas~=1.7.3.0
1+
aeneas~=1.7.3.0
2+
dtw-python~=1.5.3

subaligner/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ def main():
270270
if FLAGS.stretch_on or FLAGS.mode == "script":
271271
try:
272272
import aeneas
273+
import dtw
273274
except ModuleNotFoundError:
274275
print('ERROR: Alignment has been configured to use extra features. Please install "subaligner[stretch]" and run your command again.')
275276
sys.exit(21)

subaligner/lib/language.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,3 +406,63 @@ class Language(object):
406406

407407
CODE_TO_HUMAN_LIST = sorted([u"%s\t%s" % (k, v) for k, v in CODE_TO_HUMAN.items()])
408408
""" List of all language codes with their human-readable names """
409+
410+
# Maps each language code constant of this class to the voice identifier that
# is passed to `espeak -v` when cue text is synthesised for stretching.
# CMN and ZHO both resolve to the "zh" voice; YUE uses "zh-yue".
# Languages absent from this table have no voice mapping, so a lookup with
# such a code raises KeyError.
LANGUAGE_TO_VOICE_CODE = {
411+
AFR: "af",
412+
ARG: "an",
413+
BOS: "bs",
414+
BUL: "bg",
415+
CAT: "ca",
416+
CES: "cs",
417+
CMN: "zh",
418+
CYM: "cy",
419+
DAN: "da",
420+
DEU: "de",
421+
ELL: "el",
422+
ENG: "en",
423+
EPO: "eo",
424+
EST: "et",
425+
FAS: "fa",
426+
FIN: "fi",
427+
FRA: "fr",
428+
GLE: "ga",
429+
GRC: "grc",
430+
HIN: "hi",
431+
HRV: "hr",
432+
HUN: "hu",
433+
HYE: "hy",
434+
IND: "id",
435+
ISL: "is",
436+
ITA: "it",
437+
JBO: "jbo",
438+
KAN: "kn",
439+
KAT: "ka",
440+
KUR: "ku",
441+
LAT: "la",
442+
LAV: "lv",
443+
LFN: "lfn",
444+
LIT: "lt",
445+
MAL: "ml",
446+
MKD: "mk",
447+
MSA: "ms",
448+
NEP: "ne",
449+
NLD: "nl",
450+
NOR: "no",
451+
PAN: "pa",
452+
POL: "pl",
453+
POR: "pt",
454+
RON: "ro",
455+
RUS: "ru",
456+
SLK: "sk",
457+
SPA: "es",
458+
SQI: "sq",
459+
SRP: "sr",
460+
SWA: "sw",
461+
SWE: "sv",
462+
TAM: "ta",
463+
TUR: "tr",
464+
UKR: "uk",
465+
VIE: "vi",
466+
YUE: "zh-yue",
467+
ZHO: "zh",
468+
}

subaligner/predictor.py

Lines changed: 113 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,23 @@
66
import gc
77
import math
88
import logging
9+
import tempfile
10+
import librosa
911
import numpy as np
1012
import multiprocessing as mp
13+
import soundfile as sf
1114
from typing import Tuple, List, Optional, Dict, Any, Iterable, Union
15+
from copy import deepcopy
1216
from pysrt import SubRipTime, SubRipItem, SubRipFile
1317
from sklearn.metrics import log_loss
14-
from copy import deepcopy
1518
from .network import Network
1619
from .embedder import FeatureEmbedder
1720
from .media_helper import MediaHelper
1821
from .subtitle import Subtitle
1922
from .hyperparameters import Hyperparameters
20-
from .exception import TerminalException
21-
from .exception import NoFrameRateException
23+
from .lib.language import Language
24+
from .utils import Utils
25+
from .exception import TerminalException, NoFrameRateException
2226
from .logger import Logger
2327

2428

@@ -445,7 +449,7 @@ def _predict_in_multithreads(
445449
gc.collect()
446450

447451
if stretch:
448-
subs_new = self.__adjust_durations(subs_new, audio_file_path, stretch_in_lang, lock)
452+
subs_new = self.__compress_and_stretch(subs_new, audio_file_path, stretch_in_lang, lock)
449453
self.__LOGGER.info("[{}] Segment {} stretched".format(os.getpid(), segment_index))
450454
return subs_new
451455
except Exception as e:
@@ -715,6 +719,111 @@ def __adjust_durations(self, subs: List[SubRipItem], audio_file_path: str, stret
715719
if task.sync_map_file_path_absolute is not None and os.path.exists(task.sync_map_file_path_absolute):
716720
os.remove(task.sync_map_file_path_absolute)
717721

722+
def __compress_and_stretch(self, subs: List[SubRipItem], audio_file_path: str, stretch_in_lang: str, lock: threading.RLock) -> List[SubRipItem]:
    """Re-time the cues of one segment by aligning synthesised speech to the real audio.

    Each cue text is synthesised with espeak, the synthesised clips are
    concatenated, and the MFCC features of the result are aligned against the
    MFCC features of the matching stretch of the source audio with dynamic
    time warping (DTW). The reference frames matched to each clip give the
    new start and end time of the corresponding cue.

    Arguments:
        subs: Cues of the segment, in playback order.
        audio_file_path: Path to the audio file the cues belong to.
        stretch_in_lang: Language code used to look up the espeak voice in
            Language.LANGUAGE_TO_VOICE_CODE.
        lock: Lock held while the segment audio is extracted.

    Returns:
        The re-timed cues, carrying the indices of the input cues. An empty
        input is returned unchanged.

    Raises:
        TerminalException: If the user interrupts the operation.
        KeyError: If no espeak voice is mapped for stretch_in_lang.
        subprocess.CalledProcessError: If espeak or ffmpeg exits with an error.
        ValueError: If the alignment or the sample rates are inconsistent.
    """
    # Imported locally so the optional "stretch" dependencies are only needed
    # when this code path is taken.
    import subprocess
    from dtw import dtw

    if not subs:
        # Nothing to align; also avoids indexing subs[0] below.
        return subs

    # Bound before the try block so the cleanup in `finally` never hits an
    # unbound name when extraction itself fails.
    segment_path = None
    try:
        with lock:
            segment_path, _ = self.__media_helper.extract_audio_from_start_to_end(
                audio_file_path,
                str(subs[0].start),
                str(subs[-1].end),
            )

        sample_rate = self.__feature_embedder.frequency
        hop_length = self.__feature_embedder.hop_len
        n_mfcc = self.__feature_embedder.n_mfcc

        # Looked up once, so an unsupported language fails before any audio
        # is synthesised.
        voice_code = Language.LANGUAGE_TO_VOICE_CODE[stretch_in_lang]

        # Taken straight from the cues: joining on a separator and splitting
        # again would miscount lines whenever a cue contains the separator.
        script_lines = [sub.text for sub in subs]

        with tempfile.TemporaryDirectory() as temp_dir:
            wav_data = []
            sr = None
            for i, line in enumerate(script_lines):
                espeak_output_path = os.path.join(temp_dir, f"espeak_part_{i}.wav")
                # Argument lists (no shell) keep cue text from being
                # interpreted as shell syntax.
                synthesised = subprocess.run(
                    ["espeak", "-v", voice_code, "--stdout", "--", line],
                    stdin=subprocess.DEVNULL,
                    stdout=subprocess.PIPE,
                    check=True,
                )
                subprocess.run(
                    ["ffmpeg", "-y", "-i", "-", "-af", f"aresample={sample_rate}", espeak_output_path],
                    input=synthesised.stdout,
                    check=True,
                )
                y, sr = librosa.load(espeak_output_path, sr=None)
                wav_data.append(y)

            combined_path = os.path.join(temp_dir, "espeak-all.wav")
            sf.write(combined_path, np.concatenate(wav_data), sr)

            y_query, sr_query = librosa.load(combined_path, sr=None)
            query_mfcc_features = librosa.feature.mfcc(y=y_query, sr=sr_query, n_mfcc=n_mfcc, hop_length=hop_length).T
            y_reference, sr_reference = librosa.load(segment_path, sr=sample_rate)
            reference_mfcc_features = librosa.feature.mfcc(y=y_reference, sr=sr_reference, n_mfcc=n_mfcc, hop_length=hop_length).T

            alignment = dtw(query_mfcc_features, reference_mfcc_features, keep_internals=False)
            # Explicit checks instead of `assert`, which is stripped under -O.
            if len(alignment.index1) != len(alignment.index2):
                raise ValueError("Mismatch in lengths of alignment indices")
            if sr_query != sr_reference:
                raise ValueError(
                    "Sample rates of synthesised ({}) and reference ({}) audio differ".format(sr_query, sr_reference)
                )
            frame_duration = hop_length / sr_query

            query_frames = np.asarray(alignment.index1)
            reference_frames = np.asarray(alignment.index2)
            segment_start_ms = subs[0].start.ordinal

            mapped_times = []
            start_frame_index = 0
            elapsed_samples = 0
            for index, samples in enumerate(wav_data):
                # Frame boundaries come from the running sample count, so
                # per-cue rounding does not accumulate across cues.
                elapsed_samples += len(samples)
                query_start_frame = start_frame_index
                query_end_frame = -(-elapsed_samples // hop_length) - 1  # ceil division, minus one

                matched = reference_frames[
                    (query_frames >= query_start_frame) & (query_frames <= query_end_frame)
                ]
                if matched.size:
                    new_reference_start_time = int(matched.min()) * frame_duration
                    new_reference_end_time = (int(matched.max()) + 1) * frame_duration
                else:
                    # No reference frame was matched (e.g. the cue produced no
                    # audio): keep the original timing relative to the segment
                    # start instead of failing the whole segment.
                    self.__LOGGER.warning(
                        "[{}] No aligned frames for cue {}; keeping its original timing".format(os.getpid(), subs[index].index)
                    )
                    new_reference_start_time = (subs[index].start.ordinal - segment_start_ms) / 1000.0
                    new_reference_end_time = (subs[index].end.ordinal - segment_start_ms) / 1000.0

                mapped_times.append({
                    "new_reference_start_time": new_reference_start_time,
                    "new_reference_end_time": new_reference_end_time
                })
                start_frame_index = max(start_frame_index, query_end_frame + 1)

            # Written as UTF-8 because it is parsed back as UTF-8 below.
            synced_path = os.path.join(temp_dir, "synced_subtitles.srt")
            with open(synced_path, "w", encoding="utf-8") as f:
                for index, entry in enumerate(mapped_times):
                    start_srt = Utils.format_timestamp(entry["new_reference_start_time"])
                    end_srt = Utils.format_timestamp(entry["new_reference_end_time"])
                    f.write(f"{index + 1}\n")
                    f.write(f"{start_srt} --> {end_srt}\n")
                    f.write(f"{script_lines[index]}\n")
                    f.write("\n")

            adjusted_subs = Subtitle._get_srt_subs(
                subrip_file_path=synced_path,
                encoding="utf-8"
            )

        for index, sub_new_loaded in enumerate(adjusted_subs):
            sub_new_loaded.index = subs[index].index

        # Times above are relative to the segment start; shift them back onto
        # the timeline of the full audio.
        adjusted_subs.shift(
            seconds=self.__media_helper.get_duration_in_seconds(
                start=None, end=str(subs[0].start)
            )
        )
        return adjusted_subs
    except KeyboardInterrupt:
        raise TerminalException("Subtitle compress and stretch interrupted by the user")
    finally:
        # Housekeep intermediate files
        # NOTE(review): assumes the extracted segment is a scratch file owned
        # by this call — confirm against extract_audio_from_start_to_end.
        if segment_path is not None and os.path.exists(segment_path):
            os.remove(segment_path)
826+
718827
def __predict(
719828
self,
720829
video_file_path: Optional[str],

subaligner/subaligner_2pass/__main__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ def main():
171171
if FLAGS.stretch_on:
172172
try:
173173
import aeneas
174+
import dtw
174175
except ModuleNotFoundError:
175176
print('ERROR: Alignment has been configured to use extra features. Please install "subaligner[stretch]" and run your command again.')
176177
sys.exit(21)

0 commit comments

Comments
 (0)