add: full example of forced alignment for music
This commit is contained in:
parent
aeab34f84b
commit
65123d1b39
3
.gitignore
vendored
3
.gitignore
vendored
@ -16,4 +16,5 @@ translate/result
|
|||||||
dataset/raw
|
dataset/raw
|
||||||
translate/special-spiders
|
translate/special-spiders
|
||||||
ugNMT/BPE/output*
|
ugNMT/BPE/output*
|
||||||
ugNMT/BPE/codes
|
ugNMT/BPE/codes
|
||||||
|
forced-alignment/segments
|
82
forced-alignment/1.fff
Normal file
82
forced-alignment/1.fff
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
[0]: 25.462-27.102
|
||||||
|
[1]: 27.202-28.382
|
||||||
|
[2]: 28.582-30.262
|
||||||
|
[3]: 30.622-32.303
|
||||||
|
[4]: 32.403-33.643
|
||||||
|
[5]: 33.703-35.483
|
||||||
|
[6]: 35.843-37.503
|
||||||
|
[7]: 37.643-38.803
|
||||||
|
[8]: 38.903-40.563
|
||||||
|
[9]: 41.023-42.683
|
||||||
|
[10]: 42.803-43.883
|
||||||
|
[11]: 44.063-45.884
|
||||||
|
[12]: 56.584-58.245
|
||||||
|
[13]: 58.345-59.585
|
||||||
|
[14]: 59.685-61.345
|
||||||
|
[15]: 61.785-63.405
|
||||||
|
[16]: 63.525-64.785
|
||||||
|
[17]: 64.825-66.565
|
||||||
|
[18]: 66.925-68.665
|
||||||
|
[19]: 68.785-69.826
|
||||||
|
[20]: 70.005-71.686
|
||||||
|
[21]: 72.106-73.846
|
||||||
|
[22]: 73.946-75.086
|
||||||
|
[23]: 75.226-76.986
|
||||||
|
[24]: 77.346-79.526
|
||||||
|
[25]: 79.646-81.426
|
||||||
|
[26]: 81.566-83.527
|
||||||
|
[27]: 84.307-86.847
|
||||||
|
[28]: 86.927-89.487
|
||||||
|
[29]: 89.527-91.947
|
||||||
|
[30]: 92.107-94.667
|
||||||
|
[31]: 94.707-97.128
|
||||||
|
[32]: 97.288-99.848
|
||||||
|
[33]: 99.908-102.328
|
||||||
|
[34]: 102.448-116.229
|
||||||
|
[35]: 116.249-117.949
|
||||||
|
[36]: 118.029-119.129
|
||||||
|
[37]: 119.369-121.07
|
||||||
|
[38]: 121.45-123.13
|
||||||
|
[39]: 123.23-124.47
|
||||||
|
[40]: 124.53-126.27
|
||||||
|
[41]: 126.67-128.29
|
||||||
|
[42]: 128.41-129.59
|
||||||
|
[43]: 129.71-131.51
|
||||||
|
[44]: 131.81-133.511
|
||||||
|
[45]: 133.611-134.671
|
||||||
|
[46]: 134.891-136.651
|
||||||
|
[47]: 137.011-138.671
|
||||||
|
[48]: 138.791-139.991
|
||||||
|
[49]: 140.091-141.811
|
||||||
|
[50]: 142.191-143.871
|
||||||
|
[51]: 143.971-145.071
|
||||||
|
[52]: 145.271-146.952
|
||||||
|
[53]: 147.372-149.052
|
||||||
|
[54]: 149.132-150.412
|
||||||
|
[55]: 150.492-152.212
|
||||||
|
[56]: 152.552-154.272
|
||||||
|
[57]: 154.372-155.492
|
||||||
|
[58]: 155.652-157.412
|
||||||
|
[59]: 157.792-159.953
|
||||||
|
[60]: 160.093-161.873
|
||||||
|
[61]: 162.013-163.953
|
||||||
|
[62]: 164.733-167.293
|
||||||
|
[63]: 167.353-169.913
|
||||||
|
[64]: 169.953-172.394
|
||||||
|
[65]: 172.534-175.114
|
||||||
|
[66]: 175.154-177.594
|
||||||
|
[67]: 177.714-180.294
|
||||||
|
[68]: 180.314-182.854
|
||||||
|
[69]: 182.934-185.575
|
||||||
|
[70]: 195.875-198.436
|
||||||
|
[71]: 198.476-201.056
|
||||||
|
[72]: 201.096-203.516
|
||||||
|
[73]: 203.656-206.196
|
||||||
|
[74]: 206.276-208.796
|
||||||
|
[75]: 208.857-211.417
|
||||||
|
[76]: 211.457-213.917
|
||||||
|
[77]: 214.057-216.617
|
||||||
|
[78]: 216.677-219.077
|
||||||
|
[79]: 219.217-221.817
|
||||||
|
[80]: 221.817-224.378
|
||||||
|
[81]: 224.458-227.098
|
86
forced-alignment/1.lrc
Normal file
86
forced-alignment/1.lrc
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
[00:25.513] 东汉末狼烟不休
|
||||||
|
[00:27.178] 常侍乱 朝野陷
|
||||||
|
[00:28.499] 阿瞒挟天子 令诸侯
|
||||||
|
[00:30.838] 踞江东志在九州
|
||||||
|
[00:32.457] 继祖业 承父兄
|
||||||
|
[00:33.793] 既冕主吴越 万兜鍪
|
||||||
|
[00:35.878] 纵天下几变春秋
|
||||||
|
[00:37.536] 稳东南 面中原
|
||||||
|
[00:38.839] 水师锁长江 抗曹刘
|
||||||
|
[00:40.978] 镇赤壁雄风赳赳
|
||||||
|
[00:42.872] 夺荆楚 抚山越
|
||||||
|
[00:44.043] 驱金戈铁马 灭仇雠
|
||||||
|
[00:46.740]
|
||||||
|
[00:56.690] 紫发髯碧色眼眸
|
||||||
|
[00:58.324] 射猛虎 倚黄龙
|
||||||
|
[00:59.694] 胆识过凡人 谁敌手
|
||||||
|
[01:01.723] 御天下半百之久
|
||||||
|
[01:03.624] 选贤臣 任能将
|
||||||
|
[01:04.865] 覆江东云雨 尽风流
|
||||||
|
[01:07.034] 千秋过再难回首
|
||||||
|
[01:08.819] 问古今 兴亡事
|
||||||
|
[01:10.059] 几人耀青史 芳名留
|
||||||
|
[01:12.075] 笑谈间云烟已旧
|
||||||
|
[01:13.915] 终留下 万古叹
|
||||||
|
[01:15.278] 生子该当如 孙仲谋
|
||||||
|
[01:17.458] 运帷幄 英雄几拂袖
|
||||||
|
[01:19.644] 阴谋 阳谋 明仇 暗斗
|
||||||
|
[01:21.637] 化作一江浊浪东流
|
||||||
|
[01:24.443] 君不见军赤壁纵野火铁索连环
|
||||||
|
[01:26.739] 也不见御北敌联西蜀长江上鏖战
|
||||||
|
[01:29.596] 继遗志领江东屹立于神州东南
|
||||||
|
[01:32.188] 尽心力洒英血展伟业剑气指苍天
|
||||||
|
[01:34.840] 军帐内公瑾智张昭谋奇策频献
|
||||||
|
[01:37.241] 沙场上太史勇甘宁霸一骑当十千
|
||||||
|
[01:39.965] 纵使有千万种寂寞和孤单相伴
|
||||||
|
[01:42.425] 既受终冠帝冕龙椅上成败也笑看
|
||||||
|
[01:46.351]
|
||||||
|
[01:56.184] 铁瓮城难攻易守
|
||||||
|
[01:58.069] 旌旗立 苍空蔽
|
||||||
|
[01:59.438] 逾百千雄师 万蒙舟
|
||||||
|
[02:01.462] 善制衡眼光独秀
|
||||||
|
[02:03.301] 擢鲁肃 劝阿蒙
|
||||||
|
[02:04.557] 聚贤成霸业 名利收
|
||||||
|
[02:06.500] 固疆土施德恩厚
|
||||||
|
[02:08.302] 军心定 百姓安
|
||||||
|
[02:09.712] 富国又强兵 重耕耨
|
||||||
|
[02:11.783] 交远好未雨绸缪
|
||||||
|
[02:13.642] 联南洋 合林邑
|
||||||
|
[02:14.914] 行军远渡海 驻夷洲
|
||||||
|
[02:16.944] 残垣下枯木雕朽
|
||||||
|
[02:18.784] 想当年 麦城边
|
||||||
|
[02:20.076] 截兵缚关羽 终其寿
|
||||||
|
[02:22.263] 凭栏倚横看吴钩
|
||||||
|
[02:23.993] 叹乱世 几时了
|
||||||
|
[02:25.398] 天下归一统 没其咎
|
||||||
|
[02:27.462] 称帝王壮心仍稠
|
||||||
|
[02:29.061] 却无奈 自孤傲
|
||||||
|
[02:30.438] 同室亦操戈 子嗣斗
|
||||||
|
[02:32.512] 千年后恚恨徒留
|
||||||
|
[02:34.291] 再何寻 军帐里
|
||||||
|
[02:35.698] 将士聚欢饮 赏箜篌
|
||||||
|
[02:37.763] 运帷幄 英雄几拂袖
|
||||||
|
[02:40.029] 阴谋 阳谋 明仇 暗斗
|
||||||
|
[02:42.146] 化作一江浊浪东流
|
||||||
|
[02:44.641] 君不见吕子明踏轻舟白衣渡川
|
||||||
|
[02:47.318] 也不见陆伯言烧联营火光上冲天
|
||||||
|
[02:49.870] 善制衡选贤臣任能将共谋江山
|
||||||
|
[02:52.410] 听忠言摒逆语树威严宝剑斫书案
|
||||||
|
[02:55.117] 夺荆州抗刘备合曹操共克襄樊
|
||||||
|
[02:57.692] 守夷陵任陆逊剿敌军 火计破蜀胆
|
||||||
|
[03:00.391] 固江河成帝业立国家终归于乱
|
||||||
|
[03:02.864] 光阴逝千载过功成者都付笑谈间
|
||||||
|
[03:06.250]
|
||||||
|
[03:15.700] 君不见军赤壁纵野火铁索连环
|
||||||
|
[03:18.545] 也不见御北敌联西蜀长江上鏖战
|
||||||
|
[03:21.080] 继遗志领江东屹立于神州东南
|
||||||
|
[03:23.649] 尽心力洒英血展伟业剑指苍天
|
||||||
|
[03:26.263] 君不见吕子明踏轻舟白衣渡川
|
||||||
|
[03:28.746] 也不见陆伯言烧联营火光上冲天
|
||||||
|
[03:31.395] 善制衡选贤臣任能将共谋江山
|
||||||
|
[03:34.021] 听忠言摒逆语树威严宝剑斫书案
|
||||||
|
[03:36.589] 纵使有千万种寂寞和孤单相伴
|
||||||
|
[03:39.103] 既受终冠帝冕龙椅上成败也笑看
|
||||||
|
[03:41.916] 固江河成帝业立国家终归于乱
|
||||||
|
[03:44.470] 光阴逝千载过功成者都付笑谈间
|
||||||
|
[03:48.436]
|
BIN
forced-alignment/1.mp3
Normal file
BIN
forced-alignment/1.mp3
Normal file
Binary file not shown.
82
forced-alignment/1.txt
Normal file
82
forced-alignment/1.txt
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
东汉末狼烟不休
|
||||||
|
常侍乱朝野陷
|
||||||
|
阿瞒挟天子令诸侯
|
||||||
|
踞江东志在九州
|
||||||
|
继祖业承父兄
|
||||||
|
既冕主吴越万兜鍪
|
||||||
|
纵天下几变春秋
|
||||||
|
稳东南面中原
|
||||||
|
水师锁长江抗曹刘
|
||||||
|
镇赤壁雄风赳赳
|
||||||
|
夺荆楚抚山越
|
||||||
|
驱金戈铁马灭仇雠
|
||||||
|
紫发髯碧色眼眸
|
||||||
|
射猛虎倚黄龙
|
||||||
|
胆识过凡人谁敌手
|
||||||
|
御天下半百之久
|
||||||
|
选贤臣任能将
|
||||||
|
覆江东云雨尽风流
|
||||||
|
千秋过再难回首
|
||||||
|
问古今兴亡事
|
||||||
|
几人耀青史芳名留
|
||||||
|
笑谈间云烟已旧
|
||||||
|
终留下万古叹
|
||||||
|
生子该当如孙仲谋
|
||||||
|
运帷幄英雄几拂袖
|
||||||
|
阴谋阳谋明仇暗斗
|
||||||
|
化作一江浊浪东流
|
||||||
|
君不见军赤壁纵野火铁索连环
|
||||||
|
也不见御北敌联西蜀长江上鏖战
|
||||||
|
继遗志领江东屹立于神州东南
|
||||||
|
尽心力洒英血展伟业剑气指苍天
|
||||||
|
军帐内公瑾智张昭谋奇策频献
|
||||||
|
沙场上太史勇甘宁霸一骑当十千
|
||||||
|
纵使有千万种寂寞和孤单相伴
|
||||||
|
既受终冠帝冕龙椅上成败也笑看
|
||||||
|
铁瓮城难攻易守
|
||||||
|
旌旗立苍空蔽
|
||||||
|
逾百千雄师万蒙舟
|
||||||
|
善制衡眼光独秀
|
||||||
|
擢鲁肃劝阿蒙
|
||||||
|
聚贤成霸业名利收
|
||||||
|
固疆土施德恩厚
|
||||||
|
军心定百姓安
|
||||||
|
富国又强兵重耕耨
|
||||||
|
交远好未雨绸缪
|
||||||
|
联南洋合林邑
|
||||||
|
行军远渡海驻夷洲
|
||||||
|
残垣下枯木雕朽
|
||||||
|
想当年麦城边
|
||||||
|
截兵缚关羽终其寿
|
||||||
|
凭栏倚横看吴钩
|
||||||
|
叹乱世几时了
|
||||||
|
天下归一统没其咎
|
||||||
|
称帝王壮心仍稠
|
||||||
|
却无奈自孤傲
|
||||||
|
同室亦操戈子嗣斗
|
||||||
|
千年后恚恨徒留
|
||||||
|
再何寻军帐里
|
||||||
|
将士聚欢饮赏箜篌
|
||||||
|
运帷幄英雄几拂袖
|
||||||
|
阴谋阳谋明仇暗斗
|
||||||
|
化作一江浊浪东流
|
||||||
|
君不见吕子明踏轻舟白衣渡川
|
||||||
|
也不见陆伯言烧联营火光上冲天
|
||||||
|
善制衡选贤臣任能将共谋江山
|
||||||
|
听忠言摒逆语树威严宝剑斫书案
|
||||||
|
夺荆州抗刘备合曹操共克襄樊
|
||||||
|
守夷陵任陆逊剿敌军火计破蜀胆
|
||||||
|
固江河成帝业立国家终归于乱
|
||||||
|
光阴逝千载过功成者都付笑谈间
|
||||||
|
君不见军赤壁纵野火铁索连环
|
||||||
|
也不见御北敌联西蜀长江上鏖战
|
||||||
|
继遗志领江东屹立于神州东南
|
||||||
|
尽心力洒英血展伟业剑指苍天
|
||||||
|
君不见吕子明踏轻舟白衣渡川
|
||||||
|
也不见陆伯言烧联营火光上冲天
|
||||||
|
善制衡选贤臣任能将共谋江山
|
||||||
|
听忠言摒逆语树威严宝剑斫书案
|
||||||
|
纵使有千万种寂寞和孤单相伴
|
||||||
|
既受终冠帝冕龙椅上成败也笑看
|
||||||
|
固江河成帝业立国家终归于乱
|
||||||
|
光阴逝千载过功成者都付笑谈间
|
Binary file not shown.
2
forced-alignment/output.ttml
Normal file
2
forced-alignment/output.ttml
Normal file
File diff suppressed because one or more lines are too long
179
forced-alignment/split.py
Normal file
179
forced-alignment/split.py
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
from typing import List
|
||||||
|
from pypinyin import lazy_pinyin
|
||||||
|
from pypinyin_dict.phrase_pinyin_data import cc_cedict
|
||||||
|
from torchaudio.transforms import Resample
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    """Run the acoustic model on ``waveform`` and align ``transcript`` to it.

    Uses the module-level ``model``, ``device``, ``tokenizer`` and ``aligner``
    objects.

    :param waveform: audio tensor; it is moved to ``device`` before the
        forward pass
    :param transcript: list of words handed to ``tokenizer``
    :return: ``(emission, token_spans)`` - the model output and whatever the
        aligner returns for the first batch item
    """
    with torch.inference_mode():
        emission, _unused = model(waveform.to(device))
        first_item = emission[0]
        tokens = tokenizer(transcript)
        token_spans = aligner(first_item, tokens)
    return emission, token_spans
|
||||||
|
|
||||||
|
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
from torchaudio.pipelines import MMS_FA as bundle

# The model, tokenizer and aligner all come from the same pipeline bundle.
model = bundle.get_model()
model.to(device)

tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()

# NOTE(review): presumably registers the CC-CEDICT phrase readings with
# pypinyin - confirm against the pypinyin_dict documentation.
cc_cedict.load()


def add_spaces(s):
    """Insert a single space between every pair of adjacent items of ``s``."""
    return ' '.join(s)
|
||||||
|
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
def get_audio_duration(file_path):
    """
    Load an audio file and report how long it is, in seconds.

    :param file_path: path of the audio file
    :return: duration in seconds, or ``None`` when the file could not be
        read (the error is printed rather than raised)
    """
    try:
        # len() of the loaded clip is divided by 1000 to obtain seconds.
        return len(AudioSegment.from_file(file_path)) / 1000.0
    except Exception as err:
        print(f"Error reading audio file: {err}")
        return None
|
||||||
|
|
||||||
|
def timestamp(seconds):
    """
    Convert a number of seconds into a TTML timestamp (``HH:MM:SS.sss``).

    The value is rounded to the nearest millisecond. Truncating the
    fractional part instead (as ``int((seconds % 1) * 1000)`` does) loses a
    millisecond whenever binary floating point stores the value slightly
    below the intended decimal, e.g. 1.001 s would be rendered as ``.000``.

    :param seconds: time offset in seconds (int or float)
    :return: timestamp string formatted as ``HH:MM:SS.sss``
    """
    # Work in whole milliseconds so that rounding carries correctly into the
    # seconds / minutes / hours fields (59.9996 s -> 00:01:00.000).
    total_milliseconds = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02}:{minutes:02}:{secs:02}.{milliseconds:03}"
|
||||||
|
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
|
def extract_numbers_from_files(directory):
    """
    List ``directory`` and collect the numeric part of every file named
    ``line-<number>.wav``.

    The whole file name must match, so leftovers such as
    ``line-2.wav.bak`` are ignored instead of being reported as segment 2.

    :param directory: path of the directory to scan
    :return: list of the extracted integers (in directory-listing order),
        or ``None`` when the directory could not be read (the error is
        printed rather than raised)
    """
    numbers = []
    pattern = re.compile(r'line-(\d+)\.wav')

    try:
        for filename in os.listdir(directory):
            # fullmatch anchors both ends; match() would only anchor the start.
            match = pattern.fullmatch(filename)
            if match:
                numbers.append(int(match.group(1)))
    except Exception as e:
        print(f"Error reading directory: {e}")
        return None

    return numbers
|
||||||
|
|
||||||
|
class TTMLGenerator:
    """Incrementally builds a TTML lyric document with ElementTree.

    The tree is ``tt > head > metadata`` plus ``tt > body > div``; every
    lyric line becomes a ``p`` under ``div`` with one ``span`` per word.
    """

    def __init__(
        self,
        duration,
        xmlns="http://www.w3.org/ns/ttml",
        xmlns_ttm="http://www.w3.org/ns/ttml#metadata",
        xmlns_amll="http://www.example.com/ns/amll",
        xmlns_itunes="http://music.apple.com/lyric-ttml-internal",
    ):
        """Create the document skeleton.

        :param duration: value stored in the ``dur`` attribute of ``body``
        :param xmlns: value of the ``xmlns`` attribute on the root
        :param xmlns_ttm: value of the ``xmlns:ttm`` attribute
        :param xmlns_amll: value of the ``xmlns:amll`` attribute
        :param xmlns_itunes: value of the ``xmlns:itunes`` attribute
        """
        # Namespace declarations are written as plain attributes on the root.
        root_attributes = {
            "xmlns": xmlns,
            "xmlns:ttm": xmlns_ttm,
            "xmlns:amll": xmlns_amll,
            "xmlns:itunes": xmlns_itunes,
        }
        self.tt = ET.Element("tt", attrib=root_attributes)
        self.head = ET.SubElement(self.tt, "head")
        self.metadata = ET.SubElement(self.head, "metadata")
        self.body = ET.SubElement(self.tt, "body", attrib={"dur": duration})
        self.div = ET.SubElement(self.body, "div")

    def add_lyrics(self, begin, end, agent, itunes_key, words):
        """Append one lyric line.

        :param begin: ``begin`` attribute of the ``p`` element
        :param end: ``end`` attribute of the ``p`` element
        :param agent: ``ttm:agent`` attribute of the ``p`` element
        :param itunes_key: ``itunes:key`` attribute of the ``p`` element
        :param words: iterable of ``(text, begin, end)`` triples, one per span
        """
        line_attributes = {
            "begin": begin,
            "end": end,
            "ttm:agent": agent,
            "itunes:key": itunes_key,
        }
        paragraph = ET.SubElement(self.div, "p", attrib=line_attributes)
        for text, span_begin, span_end in words:
            timing = {"begin": span_begin, "end": span_end}
            ET.SubElement(paragraph, "span", attrib=timing).text = text

    def save(self, filename):
        """Write the document to ``filename`` as UTF-8 with an XML declaration."""
        ET.ElementTree(self.tt).write(filename, encoding="utf-8", xml_declaration=True)
|
||||||
|
|
||||||
|
# Total length of the vocal track; it becomes the `dur` of the TTML body.
duration = get_audio_duration("霜雪千年_vocal.mp3")

# Example usage: one generator collects every lyric line of the song.
ttml_generator = TTMLGenerator(duration=timestamp(duration))
|
||||||
|
|
||||||
|
|
||||||
|
def process_line(line_idx, start_time):
    """Force-align one lyric segment and return per-character timings.

    Reads ``./segments/line-<line_idx>.txt`` (the lyric text) and
    ``./segments/line-<line_idx>.wav`` (the matching audio), converts the
    text to pinyin, aligns it against the audio and maps every aligned token
    back to the character at the same index of the text.

    :param line_idx: number used in the segment file names
    :param start_time: offset in seconds added to every timing of the segment
    :return: list of ``(character, begin, end)`` tuples where ``begin`` and
        ``end`` are strings produced by ``timestamp``
    """
    with open(f"./segments/line-{line_idx}.txt", "r") as script_file:
        text = script_file.read()

    waveform, sample_rate = torchaudio.load(f"./segments/line-{line_idx}.wav")

    # Keep only the first channel and resample it to 16 kHz.
    waveform = Resample(orig_freq=sample_rate, new_freq=16000)(waveform[0:1])

    # One pinyin word per item returned by lazy_pinyin.
    transcript = " ".join(lazy_pinyin(text)).split()
    emission, token_spans = compute_alignments(waveform, transcript)

    # Number of audio samples covered by one emission frame.
    ratio = waveform.size(1) / emission.size(1)

    words = [
        {
            "word": text[position],
            "start": start_time + int(ratio * spans[0].start) / 16000,
            "end": start_time + int(ratio * spans[-1].end) / 16000,
        }
        for position, spans in enumerate(token_spans)
    ]

    # Stretch every entry except the last so it ends where the next begins.
    for current, following in zip(words, words[1:]):
        current["end"] = following["start"]

    return [
        (entry["word"], timestamp(entry["start"]), timestamp(entry["end"]))
        for entry in words
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# Segment numbers found in ./segments, processed in ascending order.
lines_to_process = sorted(extract_numbers_from_files("segments"))

# itunes:key values are numbered L1, L2, ... in processing order.
for key_number, line_num in enumerate(tqdm(lines_to_process), start=1):
    # The .time file holds "<start>,<end>" in seconds.
    with open(f"./segments/line-{line_num}.time", "r") as time_file:
        fields = time_file.read().split(",")
    start_time = float(fields[0])
    end_time = float(fields[1])

    ttml_generator.add_lyrics(
        begin=timestamp(start_time),
        end=timestamp(end_time),
        agent="v1",
        itunes_key=f"L{key_number}",
        words=process_line(line_num, start_time),
    )

# Save the file.
ttml_generator.save("output.ttml")
|
57
forced-alignment/split_song.py
Normal file
57
forced-alignment/split_song.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
from pydub import AudioSegment
|
||||||
|
import re
|
||||||
|
|
||||||
|
def parse_lrc(lrc_file):
    """Parse an LRC file into a list of ``(timestamp, lyric)`` tuples.

    A line is kept when it starts with a ``[mm:ss]`` or ``[mm:ss.xx]`` time
    tag; the fractional part is optional. Lines that start with anything
    else (metadata such as ``[ti: ...]``, plain text) are skipped. All
    spaces are removed from the lyric text, and lines whose lyric is empty
    are still returned (with ``""``).

    The file is read as ``utf-8-sig`` so that a leading byte-order mark does
    not prevent the first line from matching.

    :param lrc_file: path of the LRC file
    :return: list of ``(seconds, lyric)`` tuples in file order, where
        ``seconds`` is ``minutes * 60 + seconds`` as a float
    """
    # Minutes, seconds with optional fraction, then the rest of the line.
    line_pattern = re.compile(r'\[(\d+):(\d+(?:\.\d+)?)\](.*)')

    lrc_data = []
    with open(lrc_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            match = line_pattern.match(line)
            if not match:
                continue
            minutes = int(match.group(1))
            seconds = float(match.group(2))
            lyric = match.group(3).strip().replace(" ", "")
            lrc_data.append((minutes * 60 + seconds, lyric))

    return lrc_data
|
||||||
|
|
||||||
|
def split_audio_by_lrc(audio_file, lrc_data, output_prefix):
    """Cut the audio into one WAV file per non-empty lyric line.

    For entry number ``i`` (1-based, counting empty entries too) three files
    are written: ``<output_prefix>-<i>.wav`` with the audio,
    ``<output_prefix>-<i>.txt`` with the lyric and
    ``<output_prefix>-<i>.time`` with ``"<start>,<end>"`` in seconds.

    :param audio_file: path of the audio file to split
    :param lrc_data: list of ``(start_seconds, lyric)`` tuples
    :param output_prefix: prefix of every output file name
    """
    audio = AudioSegment.from_file(audio_file)
    last_index = len(lrc_data) - 1

    for i, (start_time, lyric) in enumerate(lrc_data):
        # Entries without any lyric produce no output files.
        if not lyric.strip():
            continue

        # A line lasts until the next entry starts; the last one runs to the
        # end of the audio.
        end_time = lrc_data[i + 1][0] if i < last_index else len(audio) / 1000

        # Widen the window by 0.1 s on each side, clamped to the audio bounds.
        start_time = max(0, start_time - 0.1)
        end_time = min(len(audio) / 1000, end_time + 0.1)

        segment = audio[start_time * 1000:end_time * 1000]

        output_file = f"{output_prefix}-{i+1}.wav"
        output_script = f"{output_prefix}-{i+1}.txt"
        output_time = f"{output_prefix}-{i+1}.time"

        segment.export(output_file, format="wav")
        with open(output_script, "w") as script_out:
            script_out.write(lyric)
        with open(output_time, "w") as time_out:
            time_out.write(f"{start_time},{end_time}")
        print(f"Saved {output_file}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    lrc_file = "霜雪千年.lrc"  # path of the LRC file
    audio_file = "霜雪千年.mp3"  # path of the audio file
    output_prefix = "segments/line"  # prefix of the output file names

    # Parse the lyric timings, then cut the audio accordingly.
    split_audio_by_lrc(audio_file, parse_lrc(lrc_file), output_prefix)
|
122
forced-alignment/split_whole.py
Normal file
122
forced-alignment/split_whole.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
from typing import List
|
||||||
|
from pypinyin import lazy_pinyin
|
||||||
|
from pypinyin_dict.phrase_pinyin_data import cc_cedict
|
||||||
|
from torchaudio.transforms import Resample
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
def get_audio_duration(file_path):
    """
    Load an audio file and report how long it is, in seconds.

    :param file_path: path of the audio file
    :return: duration in seconds, or ``None`` when the file could not be
        read (the error is printed rather than raised)
    """
    try:
        # len() of the loaded clip is divided by 1000 to obtain seconds.
        return len(AudioSegment.from_file(file_path)) / 1000.0
    except Exception as err:
        print(f"Error reading audio file: {err}")
        return None
|
||||||
|
|
||||||
|
def timestamp(seconds):
    """
    Convert a number of seconds into a TTML timestamp (``HH:MM:SS.sss``).

    The value is rounded to the nearest millisecond. Truncating the
    fractional part instead (as ``int((seconds % 1) * 1000)`` does) loses a
    millisecond whenever binary floating point stores the value slightly
    below the intended decimal, e.g. 1.001 s would be rendered as ``.000``.

    :param seconds: time offset in seconds (int or float)
    :return: timestamp string formatted as ``HH:MM:SS.sss``
    """
    # Work in whole milliseconds so that rounding carries correctly into the
    # seconds / minutes / hours fields (59.9996 s -> 00:01:00.000).
    total_milliseconds = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02}:{minutes:02}:{secs:02}.{milliseconds:03}"
|
||||||
|
|
||||||
|
class TTMLGenerator:
    """Incrementally builds a TTML lyric document with ElementTree.

    The tree is ``tt > head > metadata`` plus ``tt > body > div``; every
    lyric line becomes a ``p`` under ``div`` with one ``span`` per word.
    """

    def __init__(
        self,
        duration,
        xmlns="http://www.w3.org/ns/ttml",
        xmlns_ttm="http://www.w3.org/ns/ttml#metadata",
        xmlns_amll="http://www.example.com/ns/amll",
        xmlns_itunes="http://music.apple.com/lyric-ttml-internal",
    ):
        """Create the document skeleton.

        :param duration: value stored in the ``dur`` attribute of ``body``
        :param xmlns: value of the ``xmlns`` attribute on the root
        :param xmlns_ttm: value of the ``xmlns:ttm`` attribute
        :param xmlns_amll: value of the ``xmlns:amll`` attribute
        :param xmlns_itunes: value of the ``xmlns:itunes`` attribute
        """
        # Namespace declarations are written as plain attributes on the root.
        root_attributes = {
            "xmlns": xmlns,
            "xmlns:ttm": xmlns_ttm,
            "xmlns:amll": xmlns_amll,
            "xmlns:itunes": xmlns_itunes,
        }
        self.tt = ET.Element("tt", attrib=root_attributes)
        self.head = ET.SubElement(self.tt, "head")
        self.metadata = ET.SubElement(self.head, "metadata")
        self.body = ET.SubElement(self.tt, "body", attrib={"dur": duration})
        self.div = ET.SubElement(self.body, "div")

    def add_lyrics(self, begin, end, agent, itunes_key, words):
        """Append one lyric line.

        :param begin: ``begin`` attribute of the ``p`` element
        :param end: ``end`` attribute of the ``p`` element
        :param agent: ``ttm:agent`` attribute of the ``p`` element
        :param itunes_key: ``itunes:key`` attribute of the ``p`` element
        :param words: iterable of ``(text, begin, end)`` triples, one per span
        """
        line_attributes = {
            "begin": begin,
            "end": end,
            "ttm:agent": agent,
            "itunes:key": itunes_key,
        }
        paragraph = ET.SubElement(self.div, "p", attrib=line_attributes)
        for text, span_begin, span_end in words:
            timing = {"begin": span_begin, "end": span_end}
            ET.SubElement(paragraph, "span", attrib=timing).text = text

    def save(self, filename):
        """Write the document to ``filename`` as UTF-8 with an XML declaration."""
        ET.ElementTree(self.tt).write(filename, encoding="utf-8", xml_declaration=True)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    """Run the acoustic model on ``waveform`` and align ``transcript`` to it.

    Uses the module-level ``model``, ``device``, ``tokenizer`` and ``aligner``
    objects.

    :param waveform: audio tensor; it is moved to ``device`` before the
        forward pass
    :param transcript: list of words handed to ``tokenizer``
    :return: ``(emission, token_spans)`` - the model output and whatever the
        aligner returns for the first batch item
    """
    with torch.inference_mode():
        emission, _unused = model(waveform.to(device))
        first_item = emission[0]
        tokens = tokenizer(transcript)
        token_spans = aligner(first_item, tokens)
    return emission, token_spans
|
||||||
|
|
||||||
|
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
from torchaudio.pipelines import MMS_FA as bundle

# The model, tokenizer and aligner all come from the same pipeline bundle.
model = bundle.get_model()
model.to(device)

tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()

# NOTE(review): presumably registers the CC-CEDICT phrase readings with
# pypinyin - confirm against the pypinyin_dict documentation.
cc_cedict.load()


def add_spaces(s):
    """Insert a single space between every pair of adjacent items of ``s``."""
    return ' '.join(s)
|
||||||
|
|
||||||
|
# Read the lyric text, one lyric line per row.
with open("./1.txt", "r") as lyric_file:
    text_lines = lyric_file.readlines()

# The pinyin of each lyric line is glued together without spaces, so after
# the split() below every lyric line is a single "word" for the aligner.
text_pinyin = ["".join(lazy_pinyin(row.strip())) for row in text_lines]

text_normalized = " ".join(text_pinyin)

print(text_normalized)

waveform, sample_rate = torchaudio.load("./权御天下 [vocals].mp3")

# Keep only the first channel and resample it to 16 kHz.
waveform = waveform[0:1]
resampler = Resample(orig_freq=sample_rate, new_freq=16000)
waveform = resampler(waveform)

transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

# Number of audio samples covered by one emission frame.
ratio = waveform.size(1) / num_frames

duration = get_audio_duration("权御天下 [vocals].mp3")

ttml_generator = TTMLGenerator(duration=timestamp(duration))

# Append "[index]: start-end" (seconds, 3 decimals) to 1.fff for every word.
for i, spans in enumerate(token_spans):
    x0 = round(int(ratio * spans[0].start) / 16000, 3)
    x1 = round(int(ratio * spans[-1].end) / 16000, 3)
    with open("1.fff", "a") as f:
        f.write(f"{[i]}: {x0}-{x1}\n")
|
File diff suppressed because one or more lines are too long
60
forced-alignment/test_split.py
Normal file
60
forced-alignment/test_split.py
Normal file
@ -0,0 +1,60 @@
|
|||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
from typing import List
|
||||||
|
from pypinyin import lazy_pinyin
|
||||||
|
from pypinyin_dict.phrase_pinyin_data import cc_cedict
|
||||||
|
from torchaudio.transforms import Resample
|
||||||
|
|
||||||
|
|
||||||
|
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    """Run the acoustic model on ``waveform`` and align ``transcript`` to it.

    Uses the module-level ``model``, ``device``, ``tokenizer`` and ``aligner``
    objects.

    :param waveform: audio tensor; it is moved to ``device`` before the
        forward pass
    :param transcript: list of words handed to ``tokenizer``
    :return: ``(emission, token_spans)`` - the model output and whatever the
        aligner returns for the first batch item
    """
    with torch.inference_mode():
        emission, _unused = model(waveform.to(device))
        first_item = emission[0]
        tokens = tokenizer(transcript)
        token_spans = aligner(first_item, tokens)
    return emission, token_spans
|
||||||
|
|
||||||
|
# Compute average score weighted by the span length
|
||||||
|
def _score(spans):
|
||||||
|
return sum(s.score * len(s) for s in spans) / sum(len(s) for s in spans)
|
||||||
|
|
||||||
|
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
from torchaudio.pipelines import MMS_FA as bundle

# The model, tokenizer and aligner all come from the same pipeline bundle.
model = bundle.get_model()
model.to(device)

tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()

# NOTE(review): presumably registers the CC-CEDICT phrase readings with
# pypinyin - confirm against the pypinyin_dict documentation.
cc_cedict.load()


def add_spaces(s):
    """Insert a single space between every pair of adjacent items of ``s``."""
    return ' '.join(s)
|
||||||
|
|
||||||
|
# Lyric text of the first segment.
with open("./segments/line-1.txt", "r") as script_file:
    text = script_file.read()

text_raw = add_spaces(text)
text_list = list(text)

# One pinyin word per item returned by lazy_pinyin, separated by spaces.
text_pinyin = lazy_pinyin(text)
text_normalized = " ".join(text_pinyin)

waveform, sample_rate = torchaudio.load("./segments/line-1.wav")

# Keep only the first channel and resample it to 16 kHz.
waveform = waveform[0:1]
resampler = Resample(orig_freq=sample_rate, new_freq=16000)
waveform = resampler(waveform)

transcript = text_normalized.split()
emission, token_spans = compute_alignments(waveform, transcript)
num_frames = emission.size(1)

print("Raw Transcript: ", text_raw)
print("Normalized Transcript: ", text_normalized)

# Number of audio samples covered by one emission frame.
ratio = waveform.size(1) / num_frames

# Print "<character>: start-end" (seconds, 3 decimals) for every aligned word.
for i, spans in enumerate(token_spans):
    x0 = round(int(ratio * spans[0].start) / 16000, 3)
    x1 = round(int(ratio * spans[-1].end) / 16000, 3)
    print(f"{text[i]}: {x0}-{x1}")
|
BIN
forced-alignment/权御天下 [vocals].mp3
Normal file
BIN
forced-alignment/权御天下 [vocals].mp3
Normal file
Binary file not shown.
BIN
forced-alignment/权御天下.mp3
Normal file
BIN
forced-alignment/权御天下.mp3
Normal file
Binary file not shown.
2
forced-alignment/权御天下.ttml
Normal file
2
forced-alignment/权御天下.ttml
Normal file
File diff suppressed because one or more lines are too long
55
forced-alignment/霜雪千年.lrc
Normal file
55
forced-alignment/霜雪千年.lrc
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
[tool: 歌词滚动姬 https://lrc-maker.github.io]
|
||||||
|
[ti: 霜雪千年]
|
||||||
|
[ar: COP]
|
||||||
|
[00:13.876] 梨花香缠着衣角掠过熙攘
|
||||||
|
[00:19.385] 复悄入红帘深帐
|
||||||
|
[00:22.775] 听枝头黄鹂逗趣儿细风绕指淌
|
||||||
|
[00:27.437] 坐船舫兰桨拨开雾霭迷茫
|
||||||
|
[00:33.000] 不觉已一日过半
|
||||||
|
[00:36.361] 过眼的葱郁风光悉数泛了黄
|
||||||
|
[00:41.590] 褪尽温度的风无言牵引中
|
||||||
|
[00:44.898] 便清晰了在此的眉目
|
||||||
|
[00:48.721] 暮色的消融隐约了晦朔葱茏
|
||||||
|
[00:53.888] 在这老街回眸烟云中追溯我是谁
|
||||||
|
[00:58.582] 只消暮雨点滴便足以粉饰这是非
|
||||||
|
[01:02.158] 待这月色涌起谁人轻叩这门扉
|
||||||
|
[01:09.000] 苔绿青石板街
|
||||||
|
[01:10.444] 斑驳了流水般岁月
|
||||||
|
[01:12.389] 小酌三盏两杯理不清缠绕的情结
|
||||||
|
[01:15.868] 在你淡漠眉间
|
||||||
|
[01:19.277] 瞥见离人的喜悲霜雪
|
||||||
|
[01:25.766]
|
||||||
|
[01:36.176] 楼阁现尘飞雾散荧光蹁跹
|
||||||
|
[01:41.500] 显露出斑驳石阶
|
||||||
|
[01:44.865] 入眼是落英纷然芳草入深院
|
||||||
|
[01:49.571] 凭栏杆小桌上置琼觞两盏
|
||||||
|
[01:55.277] 阖眼听清风疏叶
|
||||||
|
[01:58.603] 似曾有欢声笑言萦绕这高轩
|
||||||
|
[02:03.691] 云动寂静鸣蝉雨坠激漪涟
|
||||||
|
[02:07.298] 皴擦点染勾勒这世间
|
||||||
|
[02:11.084] 缘起的一眼定格了三生千年
|
||||||
|
[02:16.245] 在这老街回眸烟云中追溯我是谁
|
||||||
|
[02:20.988] 只消暮雨点滴便足以粉饰这是非
|
||||||
|
[02:24.463] 待这月色涌起谁人轻叩这门扉
|
||||||
|
[02:31.390] 苔绿青石板街
|
||||||
|
[02:32.752] 斑驳了流水般岁月
|
||||||
|
[02:34.687] 小酌三盏两杯理不清缠绕的情结
|
||||||
|
[02:38.162] 在你淡漠眉间
|
||||||
|
[02:41.495] 瞥见离人的喜悲霜雪
|
||||||
|
[02:48.066]
|
||||||
|
[02:58.863] 三月梨花雪几载开了又败
|
||||||
|
[03:02.206] 笔锋走黑白丹青中穿插无奈
|
||||||
|
[03:05.594] 彼时那弯月何时初现于江畔
|
||||||
|
[03:08.966] 而我又在待何人
|
||||||
|
[03:11.201] 在这亭台回眸千年后
|
||||||
|
[03:14.508] 忆起你是谁
|
||||||
|
[03:15.816] 只消月色隐约便足以勾勒这是非
|
||||||
|
[03:19.323] 待这回忆涌起恍惚之间已下泪
|
||||||
|
[03:26.171] 枫红十里长街
|
||||||
|
[03:27.662] 红帘后谁人蹙着眉
|
||||||
|
[03:29.608] 遥梦桑竹桃源
|
||||||
|
[03:31.188] 轮回中曾道别的地点
|
||||||
|
[03:32.832] 愿今生再相见
|
||||||
|
[03:36.478] 消融你眉间
|
||||||
|
[03:38.585] 悲戚霜雪
|
||||||
|
[03:44.184]
|
BIN
forced-alignment/霜雪千年.mp3
Normal file
BIN
forced-alignment/霜雪千年.mp3
Normal file
Binary file not shown.
2
forced-alignment/霜雪千年.ttml
Normal file
2
forced-alignment/霜雪千年.ttml
Normal file
File diff suppressed because one or more lines are too long
BIN
forced-alignment/霜雪千年_vocal.mp3
Normal file
BIN
forced-alignment/霜雪千年_vocal.mp3
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user