add: AI lyrics aligning
This commit is contained in:
commit
2698a75277
178
.gitignore
vendored
Normal file
178
.gitignore
vendored
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# UV
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
#uv.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||||
|
.pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
# PyPI configuration file
|
||||||
|
.pypirc
|
||||||
|
|
||||||
|
|
||||||
|
# Project specific
|
||||||
|
/temp
|
||||||
|
/data
|
||||||
|
|
||||||
|
.DS_Store
|
3
align-pipeline.md
Normal file
3
align-pipeline.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
1. prepare `1.mp3`, `1.txt`, `1.group` in `./data`
|
||||||
|
2. `whisperAlignment/alignWithGroup`
|
||||||
|
3. `mmsAlignment`
|
6
mmsAlignment/align2LRC.py
Normal file
6
mmsAlignment/align2LRC.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
from utils.ttml import extract_lrc_from_ttml
|
||||||
|
|
||||||
|
# Convert the TTML lyrics in ./data/1.ttml into LRC text.
lrc_output = extract_lrc_from_ttml('./data/1.ttml')

# Write UTF-8 explicitly: the lyrics are non-ASCII, and the platform default
# encoding (e.g. a legacy Windows code page) could fail or produce a file
# that UTF-8 readers cannot decode.
with open('./data/1-final.lrc', 'w', encoding='utf-8') as f:
    f.write(lrc_output)
|
167
mmsAlignment/alignWithMMS.py
Normal file
167
mmsAlignment/alignWithMMS.py
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
from typing import List
|
||||||
|
from pypinyin import lazy_pinyin
|
||||||
|
from pypinyin_dict.phrase_pinyin_data import cc_cedict
|
||||||
|
from torchaudio.transforms import Resample
|
||||||
|
from tqdm import tqdm
|
||||||
|
from utils.ttml import TTMLGenerator
|
||||||
|
from utils.audio import get_audio_duration
|
||||||
|
|
||||||
|
# Initialise the device, model, tokenizer and aligner used by compute_alignments().
# These statements run at import time, so importing this module already
# fetches/instantiates the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bundle = torchaudio.pipelines.MMS_FA
model = bundle.get_model().to(device)
tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()

# Load the cc_cedict phrase data from pypinyin_dict.
# NOTE(review): presumably this improves the readings lazy_pinyin() picks in
# process_line() - confirm against the pypinyin_dict documentation.
cc_cedict.load()
|
||||||
|
|
||||||
|
def timestamp(seconds: float) -> str:
    """Convert a duration in seconds to a TTML timestamp ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond *before* it is split into
    fields.  Truncating the fractional part instead loses a millisecond to
    binary floating-point error (``1.001`` used to render as ``...01.000``).

    :param seconds: non-negative offset in seconds
    :return: zero-padded ``HH:MM:SS.mmm`` string
    """
    total_milliseconds = round(seconds * 1000)
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{milliseconds:03}"
|
||||||
|
|
||||||
|
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    """Run the module-level model and aligner on one waveform.

    :param waveform: audio tensor; it is moved to the module-level ``device``
    :param transcript: list of transcript words passed to the tokenizer
    :return: ``(emission, token_spans)`` - the raw model emission and the
        spans the aligner produced for the first item of the batch
    """
    with torch.inference_mode():
        emission, _unused = model(waveform.to(device))
        first_item = emission[0]
        tokens = tokenizer(transcript)
        token_spans = aligner(first_item, tokens)
    return emission, token_spans
|
||||||
|
|
||||||
|
def parse_lrc(lrc_file, audio_len):
    """Parse an LRC file into ``(lyric, start, end)`` tuples.

    Only lines of the form ``[mm:ss.xx]text`` are considered.  Spaces are
    removed from the lyric.  A line ends where the next timestamped line
    (empty or not) starts; the last line ends at ``audio_len``.  Entries whose
    lyric is empty act only as end markers and are not returned.

    :param lrc_file: path to a UTF-8 encoded LRC file
    :param audio_len: end time, in seconds, used for the final line
    :return: list of ``(lyric, start_seconds, end_seconds)``
    """
    line_pattern = re.compile(r'\[(\d+):(\d+\.\d+)\](.*)')

    entries = []
    with open(lrc_file, 'r', encoding='utf-8') as handle:
        for raw in handle.readlines():
            found = line_pattern.match(raw)
            if found is None:
                continue
            mins, secs, body = found.groups()
            entries.append((body.strip().replace(" ", ""), int(mins) * 60 + float(secs)))

    final_index = len(entries) - 1
    timed = []
    for position, (text, begin) in enumerate(entries):
        if text.strip() == "":
            # Timestamp-only line: it only supplies the previous line's end.
            continue
        finish = entries[position + 1][1] if position < final_index else audio_len
        timed.append((text, begin, finish))
    return timed
|
||||||
|
|
||||||
|
def extract_numbers_from_files(directory):
    """
    Collect the numeric part of every ``line-N.wav`` file name in a directory.

    The whole file name must match, so leftovers such as ``line-3.wav.bak``
    are ignored instead of producing a number with no matching ``.wav`` file.

    :param directory: path of the directory to scan
    :return: list of the numbers found (in directory-listing order), or
        ``None`` if the directory could not be read
    """
    pattern = re.compile(r'line-(\d+)\.wav')

    try:
        filenames = os.listdir(directory)
    except OSError as e:
        # Keep the original contract: report the problem and signal it with None.
        print(f"Error reading directory: {e}")
        return None

    matches = (pattern.fullmatch(filename) for filename in filenames)
    return [int(match.group(1)) for match in matches if match]
|
||||||
|
|
||||||
|
def process_line(line_idx, start_time):
    """
    Align one lyric line with its audio clip and return per-character timings.

    Reads ``./temp/lines/line-{line_idx}.txt`` (lyric) and ``.wav`` (clip),
    converts the lyric to pinyin, runs compute_alignments() and converts the
    resulting frame spans to absolute timestamps.

    splitSong pads every clip by 0.1 s and stores the clip's real bounds as
    ``start,end`` in ``line-{line_idx}.time``.  Alignment offsets are relative
    to the start of the clip, so that recorded start is used as the origin
    when available; otherwise ``start_time`` is used.

    :param line_idx: number N of the ``line-N.*`` files to process
    :param start_time: start of the line in the full song, in seconds; used
        as the clip origin when no usable ``.time`` file exists
    :return: list of ``(character, begin, end)`` tuples with begin/end
        formatted by timestamp()
    """
    with open(f"./temp/lines/line-{line_idx}.txt", "r") as f:
        text = f.read()

    segment_start = start_time
    try:
        with open(f"./temp/lines/line-{line_idx}.time", "r") as f:
            segment_start = float(f.read().split(",")[0])
    except (OSError, ValueError):
        # No (valid) .time file: fall back to the LRC start time.
        pass

    waveform, sample_rate = torchaudio.load(f"./temp/lines/line-{line_idx}.wav")

    # Keep only the first channel and resample to 16 kHz before alignment.
    waveform = waveform[0:1]
    waveform = Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

    transcript = " ".join(lazy_pinyin(text)).split()
    emission, token_spans = compute_alignments(waveform, transcript)
    # Number of (resampled) audio samples covered by one emission frame.
    ratio = waveform.size(1) / emission.size(1)

    # NOTE(review): text[i] assumes exactly one transcript token per character
    # of the lyric; lines containing Latin words or punctuation may break that
    # one-to-one mapping - confirm against real data.
    words = [
        {
            "word": text[i],
            "start": segment_start + int(ratio * spans[0].start) / 16000,
            "end": segment_start + int(ratio * spans[-1].end) / 16000,
        }
        for i, spans in enumerate(token_spans)
    ]

    # Close the gaps: every word lasts until the next one starts.
    for current, following in zip(words, words[1:]):
        current["end"] = following["start"]

    return [(w["word"], timestamp(w["start"]), timestamp(w["end"])) for w in words]
|
||||||
|
|
||||||
|
|
||||||
|
def align(audio_file: str, lrc_file: str, output_ttml: str, segments_dir: str = "./temp/lines"):
    """
    Align audio with line-level lyrics and write word-level timings as TTML.

    The n-th ``line-N.wav`` segment (in ascending N) is paired with the n-th
    non-empty lyric line of the LRC file.

    :param audio_file: path to the full audio file (used for its duration)
    :param lrc_file: path to the line-level LRC lyrics file
    :param output_ttml: path of the TTML file to write
    :param segments_dir: directory scanned for ``line-N.wav`` segments,
        "./temp/lines" by default.  NOTE(review): process_line() reads its
        files from the hard-coded "./temp/lines", so other values only affect
        which line numbers are discovered.
    :raises ValueError: if the audio duration or the segment directory cannot
        be read, or if there are more segments than lyric lines
    """
    # get_audio_duration() reports failures by returning None.
    duration = get_audio_duration(audio_file)
    if duration is None:
        raise ValueError(f"Could not determine the duration of {audio_file!r}")

    lrc_data = parse_lrc(lrc_file, duration)

    # extract_numbers_from_files() also signals failure with None.
    segment_numbers = extract_numbers_from_files(segments_dir)
    if segment_numbers is None:
        raise ValueError(f"Could not list audio segments in {segments_dir!r}")
    lines_to_process = sorted(segment_numbers)

    if len(lines_to_process) > len(lrc_data):
        raise ValueError(
            f"Found {len(lines_to_process)} audio segments in {segments_dir!r} "
            f"but only {len(lrc_data)} lyric lines in {lrc_file!r}"
        )

    ttml_generator = TTMLGenerator(duration=timestamp(duration))

    for i, line_num in enumerate(tqdm(lines_to_process)):
        _, start_time, end_time = lrc_data[i]
        result = process_line(line_num, start_time)
        ttml_generator.add_lyrics(
            begin=timestamp(start_time), end=timestamp(end_time), agent="v1", itunes_key=f"L{i+1}",
            words=result
        )

    ttml_generator.save(output_ttml)
|
||||||
|
|
||||||
|
# Script entry point: align ./data/1.flac with ./data/1.lrc using the line
# segments in ./temp/lines and write the result to ./data/output.ttml.
if __name__ == "__main__":
    align("./data/1.flac", "./data/1.lrc", "./data/output.ttml", "./temp/lines")
|
58
mmsAlignment/splitSong.py
Normal file
58
mmsAlignment/splitSong.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
from pydub import AudioSegment
|
||||||
|
from utils.cleanTempDir import cleanTempDir
|
||||||
|
import re
|
||||||
|
|
||||||
|
def parse_lrc(lrc_file):
    """Parse an LRC file into ``(start_seconds, lyric)`` pairs.

    Only lines of the form ``[mm:ss.xx]text`` are kept; spaces are removed
    from the lyric.  Timestamp-only lines are returned with an empty lyric.

    :param lrc_file: path to a UTF-8 encoded LRC file
    :return: list of ``(start_seconds, lyric)`` in file order
    """
    with open(lrc_file, 'r', encoding='utf-8') as source:
        matches = [re.match(r'\[(\d+):(\d+\.\d+)\](.*)', row) for row in source.readlines()]

    return [
        (int(m.group(1)) * 60 + float(m.group(2)), m.group(3).strip().replace(" ", ""))
        for m in matches
        if m
    ]
|
||||||
|
|
||||||
|
def split_audio_by_lrc(audio_file, lrc_data):
    """Cut the audio into one clip per non-empty lyric line.

    ``./temp/lines`` is emptied first.  For entry ``i`` of ``lrc_data`` with a
    non-empty lyric, three files named ``line-{i+1}`` are written there: the
    ``.wav`` clip, a ``.txt`` with the lyric and a ``.time`` holding the
    clip's ``start,end`` in seconds.  Each clip is widened by 0.1 s on both
    sides, clamped to the bounds of the audio.

    :param audio_file: path to the full audio file
    :param lrc_data: list of ``(start_seconds, lyric)`` pairs
    """
    audio = AudioSegment.from_file(audio_file)
    cleanTempDir("./temp/lines")

    total_seconds = len(audio) / 1000
    last_index = len(lrc_data) - 1

    for i, (line_start, lyric) in enumerate(lrc_data):
        if not lyric.strip():
            # Timestamp-only entries just mark where the previous line ends.
            continue

        # A line runs until the next entry starts, or to the end of the audio.
        line_end = lrc_data[i + 1][0] if i < last_index else total_seconds

        clip_start = max(0, line_start - 0.1)
        clip_end = min(total_seconds, line_end + 0.1)
        segment = audio[clip_start * 1000:clip_end * 1000]

        output_file = f"./temp/lines/line-{i+1}.wav"
        output_script = f"./temp/lines/line-{i+1}.txt"
        output_time = f"./temp/lines/line-{i+1}.time"

        segment.export(output_file, format="wav")
        with open(output_script, "w") as script_handle:
            script_handle.write(lyric)
        with open(output_time, "w") as time_handle:
            time_handle.write(str(clip_start) + "," + str(clip_end))
        print(f"Saved {output_file}")
|
||||||
|
|
||||||
|
# Script entry point: split the sample song into per-line clips.
if __name__ == "__main__":
    lrc_file = "./data/1.lrc"  # path to the LRC file
    audio_file = "./data/1.flac"  # path to the audio file

    lrc_data = parse_lrc(lrc_file)
    split_audio_by_lrc(audio_file, lrc_data)
|
16
utils/audio.py
Normal file
16
utils/audio.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
def get_audio_duration(file_path):
    """
    Load an audio file and return its duration in seconds.

    :param file_path: path to the audio file
    :return: duration in seconds, or ``None`` if the file could not be read
        (the error is printed)
    """
    try:
        # len() of an AudioSegment is its length in milliseconds.
        return len(AudioSegment.from_file(file_path)) / 1000.0
    except Exception as e:
        print(f"Error reading audio file: {e}")
        return None
|
6
utils/cleanTempDir.py
Normal file
6
utils/cleanTempDir.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
def cleanTempDir(dir_path):
    """Leave ``dir_path`` as an existing directory with its old content removed.

    Removal errors are ignored; the directory (and any missing parents) is
    then created if it does not exist.

    :param dir_path: path of the directory to reset
    """
    shutil.rmtree(path=dir_path, ignore_errors=True)
    os.makedirs(name=dir_path, exist_ok=True)
|
57
utils/ttml.py
Normal file
57
utils/ttml.py
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
class TTMLGenerator:
    """Builds a TTML lyrics document in memory and writes it to disk.

    The tree is ``tt > head > metadata`` plus ``tt > body > div``; every call
    to add_lyrics() appends one ``<p>`` (a lyric line) with one ``<span>``
    per word to the ``div``.
    """

    def __init__(self, duration, xmlns="http://www.w3.org/ns/ttml", xmlns_ttm="http://www.w3.org/ns/ttml#metadata", xmlns_amll="http://www.example.com/ns/amll", xmlns_itunes="http://music.apple.com/lyric-ttml-internal"):
        """Create the document skeleton.

        :param duration: value of the ``dur`` attribute of ``<body>``
        :param xmlns: default namespace URI
        :param xmlns_ttm: namespace URI bound to the ``ttm`` prefix
        :param xmlns_amll: namespace URI bound to the ``amll`` prefix
        :param xmlns_itunes: namespace URI bound to the ``itunes`` prefix
        """
        # Namespace declarations are written as plain attributes of the root.
        namespace_attrs = {
            "xmlns": xmlns,
            "xmlns:ttm": xmlns_ttm,
            "xmlns:amll": xmlns_amll,
            "xmlns:itunes": xmlns_itunes,
        }
        self.tt = ET.Element("tt", attrib=namespace_attrs)
        self.head = ET.SubElement(self.tt, "head")
        self.metadata = ET.SubElement(self.head, "metadata")
        self.body = ET.SubElement(self.tt, "body", attrib={"dur": duration})
        self.div = ET.SubElement(self.body, "div")

    def add_lyrics(self, begin, end, agent, itunes_key, words):
        """Append one lyric line.

        :param begin: value of the line's ``begin`` attribute
        :param end: value of the line's ``end`` attribute
        :param agent: value of the ``ttm:agent`` attribute
        :param itunes_key: value of the ``itunes:key`` attribute
        :param words: iterable of ``(text, begin, end)`` triples, one per span
        """
        line_attrs = {
            "begin": begin,
            "end": end,
            "ttm:agent": agent,
            "itunes:key": itunes_key,
        }
        paragraph = ET.SubElement(self.div, "p", attrib=line_attrs)
        for text, span_begin, span_end in words:
            span_attrs = {"begin": span_begin, "end": span_end}
            ET.SubElement(paragraph, "span", attrib=span_attrs).text = text

    def save(self, filename):
        """Write the document to ``filename`` as UTF-8 with an XML declaration."""
        ET.ElementTree(self.tt).write(filename, encoding="utf-8", xml_declaration=True)
|
||||||
|
|
||||||
|
def extract_lrc_from_ttml(ttml_file):
    """Convert a TTML lyrics file to LRC text.

    Each ``<p>`` element becomes two LRC lines: ``[begin] text`` (the
    concatenated text of its ``<span>`` children) followed by a bare ``[end]``
    line marking where the lyric stops.

    :param ttml_file: path of the TTML file to read
    :return: the LRC document as a single newline-joined string
    """
    def format_time(ttml_time):
        """Turn ``HH:MM:SS.mmm`` into the LRC form ``MM:SS.mmm``."""
        parts = ttml_time.split(":")
        if len(parts) != 3:
            # Not HH:MM:SS - keep the historical behaviour of dropping the
            # first three characters.
            return ttml_time[3:]
        hours, minutes, seconds = parts
        # LRC has no hour field, so hours are folded into the minutes rather
        # than silently dropped (01:00:03 -> 60:03, not 00:03).
        return f"{int(hours) * 60 + int(minutes):02d}:{seconds}"

    root = ET.parse(ttml_file).getroot()
    namespace = {"": "http://www.w3.org/ns/ttml", "ttm": "http://www.w3.org/ns/ttml#metadata"}

    lrc_lines = []
    for p in root.findall(".//p", namespace):
        begin_time = format_time(p.attrib.get("begin"))
        end_time = format_time(p.attrib.get("end"))
        text_content = "".join(span.text or "" for span in p.findall("span", namespace))

        lrc_lines.append(f"[{begin_time}] {text_content}")
        lrc_lines.append(f"[{end_time}]")  # the end time goes on its own line

    return "\n".join(lrc_lines)
|
6
whisperAlignment/align2srt.py
Normal file
6
whisperAlignment/align2srt.py
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
import stable_whisper
|
||||||
|
|
||||||
|
# Loaded Whisper models keyed by name, so the weights are read only once per
# process even though align2srt() is called once per audio segment.
_MODEL_CACHE = {}


def _get_model(name):
    """Return the stable_whisper model called *name*, loading it on first use."""
    if name not in _MODEL_CACHE:
        _MODEL_CACHE[name] = stable_whisper.load_model(name)
    return _MODEL_CACHE[name]


def align2srt(lyrics, audio_path, output_path):
    """Align known lyrics to an audio file and save the timings as SRT.

    :param lyrics: lyrics text to align (treated as Chinese)
    :param audio_path: path of the audio file the lyrics belong to
    :param output_path: path of the subtitle file to write
    """
    model = _get_model('large-v3')
    result = model.align(audio_path, lyrics, language="Chinese", regroup=False)
    # NOTE(review): segment_level=False presumably yields one subtitle entry
    # per word, which is what srt2lrc() consumes - confirm in stable-ts docs.
    result.to_srt_vtt(output_path, segment_level=False)
|
47
whisperAlignment/alignWithGroup.py
Normal file
47
whisperAlignment/alignWithGroup.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
import os
|
||||||
|
from whisperAlignment.splitGroups import split_audio_and_text
|
||||||
|
from whisperAlignment.align2srt import align2srt
|
||||||
|
from whisperAlignment.srt2lrc import srt2lrc
|
||||||
|
from utils.cleanTempDir import cleanTempDir
|
||||||
|
|
||||||
|
def alignWithGroup(segments_file, audio_file, lyrics_file, output_file):
    """Align lyrics to audio group by group and merge the result into one LRC.

    The audio and lyrics are split into groups under ``./temp/segments``;
    every group is aligned with align2srt(), converted to LRC with srt2lrc()
    (shifted by the group's start offset), and the per-group LRC files are
    concatenated into ``output_file``.

    :param segments_file: mapping file describing the groups
    :param audio_file: path of the full audio file
    :param lyrics_file: path of the full lyrics text file
    :param output_file: path of the combined LRC file to write
    """
    # Clean temp/segments (this also ensures it exists).
    cleanTempDir('./temp/segments')

    split_audio_and_text(segments_file, audio_file, lyrics_file, 'temp/segments')

    # One .txt file is written per group, so counting them gives the group count.
    nums = len([name for name in os.listdir('./temp/segments') if name.endswith('.txt')])

    lrcs = []
    for i in range(1, nums + 1):
        segment_lyric = f"./temp/segments/segment_{i}.txt"
        segment_audio = f"./temp/segments/segment_{i}.mp3"
        segment_srt = f"./temp/segments/segment_{i}.srt"
        segment_lrc = f"./temp/segments/segment_{i}.lrc"
        segment_start = f"./temp/segments/segment_{i}.start"

        with open(segment_lyric, 'r') as f:
            lyrics = f.read()
        align2srt(lyrics, segment_audio, segment_srt)

        # The .start file holds the group's offset (seconds) in the full song.
        with open(segment_start, 'r') as f:
            offset = float(f.read())
        srt2lrc(lyrics, segment_srt, segment_lrc, offset)
        lrcs.append(segment_lrc)

    # The combined LRC is parsed as UTF-8 by the MMS alignment stage, so it is
    # written as UTF-8 regardless of the platform default.  The intermediate
    # segment files keep the default encoding used by their writers.
    with open(output_file, 'w', encoding='utf-8') as f:
        for lrc in lrcs:
            with open(lrc, 'r') as lrc_file:
                f.write(lrc_file.read())
            f.write('\n')
|
||||||
|
|
||||||
|
# Default input/output locations used when the module is run as a script.
SEGMENTS_FILE = './data/1.group'  # group mapping file
AUDIO_FILE = './data/1.mp3'  # full audio file
LYRICS_FILE = './data/1.txt'  # full lyrics text
OUTPUT_FILE = './data/1.lrc'  # combined line-level LRC output

if __name__ == "__main__":
    alignWithGroup(SEGMENTS_FILE, AUDIO_FILE, LYRICS_FILE, OUTPUT_FILE)
|
65
whisperAlignment/splitGroups.py
Normal file
65
whisperAlignment/splitGroups.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
from pydub import AudioSegment
|
||||||
|
import os
|
||||||
|
|
||||||
|
def parse_line(line):
    """Parse one mapping line such as ``'1-26|00:42-02:07'``.

    :param line: ``<first>-<last>|<start>-<end>`` with line numbers left of
        the bar and times right of it
    :return: ``(first_line, last_line, start_time, end_time)``; the line
        numbers are ints, the times are returned as strings
    """
    line_range, time_range = line.split('|')
    first, last = (int(number) for number in line_range.split('-'))
    begin, finish = time_range.split('-')
    return first, last, begin, finish
|
||||||
|
|
||||||
|
def time_to_milliseconds(time_str):
    """Convert ``HH:MM:SS`` or ``MM:SS`` to milliseconds.

    :param time_str: colon-separated time made of integer fields
    :return: the time in milliseconds
    :raises ValueError: if there are not two or three fields, or a field is
        not an integer
    """
    fields = [int(field) for field in time_str.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError("Invalid time format")
    if len(fields) == 2:
        # MM:SS - the hour defaults to zero.
        fields = [0] + fields
    hours, minutes, seconds = fields
    return (hours * 3600 + minutes * 60 + seconds) * 1000
|
||||||
|
|
||||||
|
def split_audio_and_text(mapping_file, audio_file, text_file, output_dir):
    """Cut the audio and the lyrics into matching groups.

    For the i-th non-blank line of ``mapping_file`` (1-based) three files are
    written to ``output_dir``: ``segment_i.txt`` (the selected lyric lines),
    ``segment_i.mp3`` (the selected audio range) and ``segment_i.start`` (the
    start of that range in seconds).

    :param mapping_file: file with one ``first-last|start-end`` entry per line
    :param audio_file: path of the full audio file
    :param text_file: path of the lyrics file; blank lines are dropped before
        the line numbers are applied
    :param output_dir: directory that receives the segment files
    """
    with open(mapping_file, 'r') as f:
        mappings = [parse_line(raw.strip()) for raw in f if raw.strip()]

    audio = AudioSegment.from_file(audio_file)

    with open(text_file, 'r') as f:
        text_lines = [row for row in f.readlines() if row.strip()]

    os.makedirs(output_dir, exist_ok=True)

    for i, (start_line, end_line, start_time, end_time) in enumerate(mappings):
        # Line numbers in the mapping are 1-based and inclusive.
        selected_lines = text_lines[start_line - 1:end_line]

        start_ms = time_to_milliseconds(start_time)
        end_ms = time_to_milliseconds(end_time)
        clip = audio[start_ms:end_ms]

        text_output_path = os.path.join(output_dir, f'segment_{i + 1}.txt')
        with open(text_output_path, 'w') as text_out:
            text_out.writelines(selected_lines)

        audio_output_path = os.path.join(output_dir, f'segment_{i + 1}.mp3')
        clip.export(audio_output_path, format='mp3')

        start_time_output_path = os.path.join(output_dir, f'segment_{i + 1}.start')
        with open(start_time_output_path, 'w') as start_out:
            start_out.write(str(start_ms / 1000))

        print(f"Saved segment {i + 1}: {text_output_path}, {audio_output_path}")
|
48
whisperAlignment/srt2lrc.py
Normal file
48
whisperAlignment/srt2lrc.py
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
import pysrt
|
||||||
|
|
||||||
|
def parseTime(object):
    """Convert a time object to seconds.

    :param object: any object exposing ``hours``, ``minutes``, ``seconds``
        and ``milliseconds`` attributes
    :return: the time in seconds, including the millisecond fraction
    """
    whole_seconds = object.hours * 3600 + object.minutes * 60 + object.seconds
    return whole_seconds + object.milliseconds / 1000
|
||||||
|
|
||||||
|
def serializeTime(time):
    """Format a time in seconds as an LRC timestamp ``MM:SS.mmm``.

    The value is rounded to the nearest millisecond before it is split into
    fields.  Truncating the fraction instead loses a millisecond to binary
    floating-point error (``2.07`` used to render as ``00:02.069``).

    :param time: non-negative time in seconds
    :return: zero-padded ``MM:SS.mmm`` string; minutes are not capped at 59
    """
    total_milliseconds = round(time * 1000)
    minutes, remainder = divmod(total_milliseconds, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
|
||||||
|
|
||||||
|
def srt2lrc(lyrics, srt_file, lrc_file, time_offset=0):
    """Rebuild line-level LRC timings from a word-level SRT file.

    Subtitle entries are concatenated until the accumulated text equals the
    next expected lyric line; that line then gets the start time of its first
    entry.  Only the last matched line is followed by a bare ``[end]``
    timestamp line.

    :param lyrics: lyrics text, one lyric line per text line
    :param srt_file: path of the UTF-8 SRT file (one entry per word)
    :param lrc_file: path of the LRC file to write
    :param time_offset: seconds added to every timestamp
    """
    subs = pysrt.open(srt_file, encoding='utf-8')

    lyrics_lines = lyrics.splitlines()
    next_line = 0  # index of the lyric line currently being assembled

    aligned_lines = []
    current_line = ""
    start_time = None

    for sub in subs:
        word = sub.text.strip()
        if not current_line:
            start_time = parseTime(sub.start)  # first word of a new line

        current_line += word

        # The accumulated words now spell the expected lyric line.
        if next_line < len(lyrics_lines) and current_line == lyrics_lines[next_line]:
            end_time = parseTime(sub.end)
            aligned_lines.append(f"[{serializeTime(start_time+time_offset)}] {current_line}\n[{serializeTime(end_time+time_offset)}]")

            next_line += 1
            current_line = ""
            start_time = None

    if not aligned_lines:
        # Previously this case crashed with IndexError on aligned_lines[-1].
        print(f"Warning: no lyric line could be matched in {srt_file}")

    # Keep the end timestamp only for the last line; slicing keeps this safe
    # when nothing was aligned.
    result = [entry.split('\n')[0] for entry in aligned_lines[:-1]] + aligned_lines[-1:]

    with open(lrc_file, 'w') as f:
        f.write('\n'.join(result))
|
Loading…
Reference in New Issue
Block a user