add: AI lyrics aligning

This commit is contained in:
alikia2x (寒寒) 2024-12-27 19:46:22 +08:00
commit 2698a75277
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
12 changed files with 657 additions and 0 deletions

178
.gitignore vendored Normal file
View File

@ -0,0 +1,178 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# PyPI configuration file
.pypirc
# Project specific
/temp
/data
.DS_Store

3
align-pipeline.md Normal file
View File

@ -0,0 +1,3 @@
1. prepare `1.mp3`, `1.txt`, `1.group` in `./data`
2. `whisperAlignment/alignWithGroup`
3. `mmsAlignment`

View File

@ -0,0 +1,6 @@
from utils.ttml import extract_lrc_from_ttml
# Convert the word-level TTML produced by the alignment step into a plain
# line-level LRC file.
lrc_output = extract_lrc_from_ttml('./data/1.ttml')
# Write explicitly as UTF-8: the lyrics contain non-ASCII text and the platform
# default encoding (e.g. on Windows) may be unable to represent it.
with open('./data/1-final.lrc', 'w', encoding='utf-8') as f:
    f.write(lrc_output)

View File

@ -0,0 +1,167 @@
import os
import re
import torch
import torchaudio
from typing import List
from pypinyin import lazy_pinyin
from pypinyin_dict.phrase_pinyin_data import cc_cedict
from torchaudio.transforms import Resample
from tqdm import tqdm
from utils.ttml import TTMLGenerator
from utils.audio import get_audio_duration
# Initialise the device, model, tokenizer and aligner shared by the functions below.
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The model, tokenizer and aligner are all taken from torchaudio's MMS_FA pipeline bundle.
bundle = torchaudio.pipelines.MMS_FA
model = bundle.get_model().to(device)
tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()
# NOTE(review): presumably registers the CC-CEDICT phrase readings with pypinyin
# so lazy_pinyin() uses them — confirm against the pypinyin_dict documentation.
cc_cedict.load()
def timestamp(seconds: float) -> str:
    """Convert a duration in seconds to a TTML clock value ``HH:MM:SS.mmm``.

    The value is rounded to the nearest millisecond *before* it is split into
    fields. Truncating the fractional part instead loses a millisecond to
    binary floating-point error (``1.001`` would be rendered as ``.000``).

    :param seconds: duration in seconds (expected to be non-negative)
    :return: zero-padded ``HH:MM:SS.mmm`` string
    """
    total_ms = round(seconds * 1000)
    # Working on an integer millisecond count lets any rounding carry
    # propagate naturally into the seconds/minutes/hours fields.
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{milliseconds:03}"
def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    """Run the acoustic model on ``waveform`` and align ``transcript`` to it.

    Uses the module-level ``model``, ``tokenizer``, ``aligner`` and ``device``.

    :param waveform: audio tensor passed to the model after moving it to ``device``
    :param transcript: list of words to align
    :return: ``(emission, token_spans)`` — the model output and the aligner
        result computed from its first batch entry
    """
    # No gradients are needed; inference mode skips autograd bookkeeping.
    with torch.inference_mode():
        logits, _ = model(waveform.to(device))
        tokens = tokenizer(transcript)
        spans = aligner(logits[0], tokens)
    return logits, spans
def parse_lrc(lrc_file, audio_len):
    """Parse an LRC file into ``(lyric, start_time, end_time)`` tuples.

    Each timed line ends where the next timed line (empty or not) begins; the
    last line ends at ``audio_len``. Lines whose lyric is empty only serve as
    end markers and are not returned. Spaces inside a lyric are removed.

    Time tags may be ``[mm:ss.xx]`` or ``[mm:ss]``; lines without a leading
    time tag (metadata such as ``[ti:...]``, blank lines) are ignored.

    :param lrc_file: path to a UTF-8 encoded LRC file
    :param audio_len: audio duration in seconds, used as the end of the last line
    :return: list of ``(lyric, start_time, end_time)`` with times in seconds
    """
    # The fractional part is optional so that "[01:23]" is not silently dropped.
    tag_pattern = re.compile(r'\[(\d+):(\d+(?:\.\d+)?)\](.*)')
    entries = []
    with open(lrc_file, 'r', encoding='utf-8') as f:
        for line in f:
            match = tag_pattern.match(line)
            if not match:
                continue
            start = int(match.group(1)) * 60 + float(match.group(2))
            lyric = match.group(3).strip().replace(" ", "")
            entries.append((lyric, start))

    last_index = len(entries) - 1
    result = []
    for i, (lyric, start) in enumerate(entries):
        if not lyric:
            # Empty line: contributes its timestamp as the previous line's end only.
            continue
        end = entries[i + 1][1] if i < last_index else audio_len
        result.append((lyric, start, end))
    return result
def extract_numbers_from_files(directory):
    """Collect the numeric part of every ``line-<N>.wav`` file name in a directory.

    Only names that match the pattern in full are accepted, so leftovers such
    as ``line-3.wav.bak`` are not mistaken for segments.

    :param directory: path of the directory to scan
    :return: list of the numbers found (in directory-listing order), or
        ``None`` if the directory could not be read
    """
    pattern = re.compile(r'line-(\d+)\.wav')
    try:
        filenames = os.listdir(directory)
    except OSError as e:
        print(f"Error reading directory: {e}")
        return None
    matches = (pattern.fullmatch(filename) for filename in filenames)
    return [int(match.group(1)) for match in matches if match]
def process_line(line_idx, start_time):
    """Align one lyric line to its audio clip and return per-word timings.

    Reads ``./temp/lines/line-<line_idx>.txt`` and the matching ``.wav``,
    converts the text to pinyin, runs forced alignment, and shifts the
    resulting times by ``start_time``.

    :param line_idx: number of the line segment to process
    :param start_time: offset in seconds added to every word time
    :return: list of ``(character, begin, end)`` with ``begin``/``end``
        formatted by ``timestamp``
    """
    with open(f"./temp/lines/line-{line_idx}.txt", "r") as script:
        text = script.read()
    audio, sample_rate = torchaudio.load(f"./temp/lines/line-{line_idx}.wav")

    # Keep only the first channel and resample it to 16 kHz.
    mono = Resample(orig_freq=sample_rate, new_freq=16000)(audio[0:1])

    transcript = " ".join(lazy_pinyin(text)).split()
    emission, token_spans = compute_alignments(mono, transcript)

    # Number of audio samples represented by one emission frame.
    ratio = mono.size(1) / emission.size(1)

    def to_seconds(frame):
        # NOTE(review): splitSong pads each clip by 0.1 s before the LRC start,
        # which this offset does not compensate for — confirm whether intended.
        return start_time + int(ratio * frame) / 16000

    starts = [to_seconds(spans[0].start) for spans in token_spans]
    own_ends = [to_seconds(spans[-1].end) for spans in token_spans]
    # Every word ends where the next one starts; only the last keeps its own end.
    ends = starts[1:] + own_ends[-1:]

    # NOTE(review): text[i] assumes one aligned token per character of the text
    # — TODO confirm for lines containing non-Chinese characters.
    return [
        (text[i], timestamp(begin), timestamp(end))
        for i, (begin, end) in enumerate(zip(starts, ends))
    ]
def align(audio_file: str, lrc_file: str, output_ttml: str, segments_dir: str = "./temp/lines"):
    """Align audio with its lyrics and write the result as a TTML file.

    The n-th segment file found in ``segments_dir`` (sorted by number) is
    paired with the n-th non-empty line of the LRC file.

    :param audio_file: path to the audio file
    :param lrc_file: path to the LRC lyrics file
    :param output_ttml: path of the TTML file to write
    :param segments_dir: directory holding the split audio segments,
        ``"./temp/lines"`` by default
    """
    total_seconds = get_audio_duration(audio_file)
    lyric_lines = parse_lrc(lrc_file, total_seconds)
    segment_numbers = sorted(extract_numbers_from_files(segments_dir))
    writer = TTMLGenerator(duration=timestamp(total_seconds))

    for position, line_num in enumerate(tqdm(segment_numbers)):
        entry = lyric_lines[position]
        line_begin, line_end = entry[1], entry[2]
        words = process_line(line_num, line_begin)
        writer.add_lyrics(
            begin=timestamp(line_begin),
            end=timestamp(line_end),
            agent="v1",
            itunes_key=f"L{position + 1}",
            words=words,
        )

    writer.save(output_ttml)
# Script entry point: align the sample track in ./data with its LRC file and
# write the TTML result next to it.
if __name__ == "__main__":
    align("./data/1.flac", "./data/1.lrc", "./data/output.ttml", "./temp/lines")

58
mmsAlignment/splitSong.py Normal file
View File

@ -0,0 +1,58 @@
from pydub import AudioSegment
from utils.cleanTempDir import cleanTempDir
import re
def parse_lrc(lrc_file):
    """Parse an LRC file into a list of ``(timestamp, lyric)`` tuples.

    Time tags may be ``[mm:ss.xx]`` or ``[mm:ss]``; lines that do not start
    with a time tag are ignored. Spaces inside a lyric are removed. Lines with
    an empty lyric are kept, since their timestamp marks where the previous
    line ends.

    :param lrc_file: path to a UTF-8 encoded LRC file
    :return: list of ``(start_seconds, lyric)`` in file order
    """
    # The fractional part is optional so that "[01:23]" is not silently dropped.
    tag_pattern = re.compile(r'\[(\d+):(\d+(?:\.\d+)?)\](.*)')
    lrc_data = []
    with open(lrc_file, 'r', encoding='utf-8') as f:
        for line in f:
            match = tag_pattern.match(line)
            if not match:
                continue
            start = int(match.group(1)) * 60 + float(match.group(2))
            lyric = match.group(3).strip().replace(" ", "")
            lrc_data.append((start, lyric))
    return lrc_data
def split_audio_by_lrc(audio_file, lrc_data):
    """Cut the audio into one WAV clip per non-empty lyric line.

    ``./temp/lines`` is emptied first. For line number ``k`` (1-based position
    in ``lrc_data``, empty lines included in the count) three files are
    written: ``line-k.wav``, ``line-k.txt`` (the lyric) and ``line-k.time``
    (``start,end`` of the clip in seconds).

    :param audio_file: path to the source audio file
    :param lrc_data: list of ``(start_seconds, lyric)`` tuples
    """
    audio = AudioSegment.from_file(audio_file)
    cleanTempDir("./temp/lines")

    last_index = len(lrc_data) - 1
    for i, (start_time, lyric) in enumerate(lrc_data):
        if not lyric.strip():
            # Empty lines only mark the end of the previous line.
            continue

        # A line lasts until the next entry, or until the audio ends.
        end_time = lrc_data[i + 1][0] if i < last_index else len(audio) / 1000

        # Widen the clip by 0.1 s on each side, clamped to the audio bounds.
        start_time = max(0, start_time - 0.1)
        end_time = min(len(audio) / 1000, end_time + 0.1)

        # pydub slices are expressed in milliseconds.
        segment = audio[start_time * 1000:end_time * 1000]

        output_file = f"./temp/lines/line-{i+1}.wav"
        output_script = f"./temp/lines/line-{i+1}.txt"
        output_time = f"./temp/lines/line-{i+1}.time"

        segment.export(output_file, format="wav")
        with open(output_script, "w") as script_out:
            script_out.write(lyric)
        with open(output_time, "w") as time_out:
            time_out.write(f"{start_time},{end_time}")
        print(f"Saved {output_file}")
# Script entry point: split the sample track into per-line clips.
if __name__ == "__main__":
    lrc_file = "./data/1.lrc"  # path to the LRC file
    audio_file = "./data/1.flac"  # path to the audio file
    lrc_data = parse_lrc(lrc_file)
    split_audio_by_lrc(audio_file, lrc_data)

16
utils/audio.py Normal file
View File

@ -0,0 +1,16 @@
from pydub import AudioSegment
def get_audio_duration(file_path):
    """Return the length of an audio file in seconds.

    :param file_path: path to the audio file
    :return: duration in seconds as a float, or ``None`` (after printing the
        error) if the file could not be read
    """
    try:
        # len() of a pydub AudioSegment is its duration in milliseconds.
        return len(AudioSegment.from_file(file_path)) / 1000.0
    except Exception as e:
        print(f"Error reading audio file: {e}")
    return None

6
utils/cleanTempDir.py Normal file
View File

@ -0,0 +1,6 @@
import os
import shutil
def cleanTempDir(dir_path):
    """Reset ``dir_path`` to an existing, empty directory.

    Any existing tree at that path is removed (removal errors, including the
    path not existing, are ignored) and the directory is then created again.

    :param dir_path: directory to empty and (re)create
    """
    shutil.rmtree(dir_path, ignore_errors=True)
    os.makedirs(dir_path, exist_ok=True)

57
utils/ttml.py Normal file
View File

@ -0,0 +1,57 @@
import xml.etree.ElementTree as ET
class TTMLGenerator:
    """Build a TTML lyrics document element by element and write it to disk.

    The tree has the shape ``tt > (head > metadata, body > div)``; each call
    to :meth:`add_lyrics` appends one ``p`` element to the ``div``.
    """

    def __init__(self, duration, xmlns="http://www.w3.org/ns/ttml", xmlns_ttm="http://www.w3.org/ns/ttml#metadata", xmlns_amll="http://www.example.com/ns/amll", xmlns_itunes="http://music.apple.com/lyric-ttml-internal"):
        """Create the document skeleton.

        :param duration: value of the ``dur`` attribute on ``body``
        :param xmlns: default namespace URI
        :param xmlns_ttm: URI bound to the ``ttm`` prefix
        :param xmlns_amll: URI bound to the ``amll`` prefix
        :param xmlns_itunes: URI bound to the ``itunes`` prefix
        """
        # Namespace declarations are written as plain attributes on the root.
        namespace_attrs = {
            "xmlns": xmlns,
            "xmlns:ttm": xmlns_ttm,
            "xmlns:amll": xmlns_amll,
            "xmlns:itunes": xmlns_itunes,
        }
        self.tt = ET.Element("tt", namespace_attrs)
        self.head = ET.SubElement(self.tt, "head")
        self.metadata = ET.SubElement(self.head, "metadata")
        self.body = ET.SubElement(self.tt, "body", dur=duration)
        self.div = ET.SubElement(self.body, "div")

    def add_lyrics(self, begin, end, agent, itunes_key, words):
        """Append one lyric line with its per-word spans.

        :param begin: start time of the line
        :param end: end time of the line
        :param agent: value of the ``ttm:agent`` attribute
        :param itunes_key: value of the ``itunes:key`` attribute
        :param words: iterable of ``(text, begin, end)`` triples, one per span
        """
        line_attrs = {
            "begin": begin,
            "end": end,
            "ttm:agent": agent,
            "itunes:key": itunes_key,
        }
        paragraph = ET.SubElement(self.div, "p", line_attrs)
        for text, word_begin, word_end in words:
            ET.SubElement(paragraph, "span", begin=word_begin, end=word_end).text = text

    def save(self, filename):
        """Write the document to ``filename`` as UTF-8 with an XML declaration."""
        ET.ElementTree(self.tt).write(filename, encoding="utf-8", xml_declaration=True)
def extract_lrc_from_ttml(ttml_file):
    """Convert a TTML lyrics file to line-level LRC text.

    For every ``p`` element the text of its direct ``span`` children is
    concatenated and emitted as ``[begin] text``, followed by a bare ``[end]``
    line carrying the paragraph's end time.

    :param ttml_file: path to the TTML file
    :return: the LRC content as a single string (lines joined with ``\\n``)
    """
    def format_time(ttml_time):
        # LRC has no hours field, so fold every leading field (hours, minutes)
        # into a total minute count rather than slicing off the first three
        # characters, which dropped the hours and garbled any clock value that
        # was not exactly "HH:MM:SS.mmm".
        *leading, seconds = ttml_time.split(":")
        total_minutes = 0
        for field in leading:
            total_minutes = total_minutes * 60 + int(field)
        return f"{total_minutes:02d}:{seconds}"

    root = ET.parse(ttml_file).getroot()
    # The empty prefix maps unprefixed tag names to the TTML default namespace.
    namespace = {"": "http://www.w3.org/ns/ttml", "ttm": "http://www.w3.org/ns/ttml#metadata"}

    lrc_lines = []
    for p in root.findall(".//p", namespace):
        begin = p.attrib.get("begin")
        end = p.attrib.get("end")
        text_content = "".join(span.text or "" for span in p.findall("span", namespace))
        lrc_lines.append(f"[{format_time(begin)}] {text_content}")
        # The end time goes on its own line, as an empty-lyric LRC entry.
        lrc_lines.append(f"[{format_time(end)}]")
    return "\n".join(lrc_lines)

View File

@ -0,0 +1,6 @@
import stable_whisper
# Loaded models, keyed by model name, so repeated calls reuse the same instance.
_MODEL_CACHE = {}


def _get_model(name):
    """Return the stable_whisper model ``name``, loading it on first use only."""
    model = _MODEL_CACHE.get(name)
    if model is None:
        model = _MODEL_CACHE[name] = stable_whisper.load_model(name)
    return model


def align2srt(lyrics, audio_path, output_path):
    """Align ``lyrics`` to an audio file and write the timings as subtitles.

    The model is cached between calls: this function is invoked once per
    segment, and reloading ``large-v3`` each time is needlessly expensive.

    :param lyrics: the text to align
    :param audio_path: path to the audio file
    :param output_path: path of the subtitle file to write
    """
    model = _get_model('large-v3')
    result = model.align(audio_path, lyrics, language="Chinese", regroup=False)
    result.to_srt_vtt(output_path, segment_level=False)

View File

@ -0,0 +1,47 @@
import os
from whisperAlignment.splitGroups import split_audio_and_text
from whisperAlignment.align2srt import align2srt
from whisperAlignment.srt2lrc import srt2lrc
from utils.cleanTempDir import cleanTempDir
def alignWithGroup(segments_file, audio_file, lyrics_file, output_file):
    """Align lyrics group by group and merge the results into one LRC file.

    The audio and lyrics are split into segments under ``./temp/segments``;
    each segment is aligned to an SRT file, converted to LRC with its start
    offset applied, and the per-segment LRC files are concatenated in order.

    :param segments_file: path to the group mapping file
    :param audio_file: path to the full audio file
    :param lyrics_file: path to the full lyrics text file
    :param output_file: path of the combined LRC file to write (UTF-8)
    """
    # Start from an empty temp/segments directory (also ensures it exists).
    cleanTempDir('./temp/segments')
    split_audio_and_text(segments_file, audio_file, lyrics_file, 'temp/segments')

    # One ".txt" file is written per segment, so counting them gives the
    # number of segments.
    nums = sum(1 for name in os.listdir('./temp/segments') if name.endswith('.txt'))

    for i in range(1, nums + 1):
        segment_lyric = f"./temp/segments/segment_{i}.txt"
        segment_audio = f"./temp/segments/segment_{i}.mp3"
        segment_srt = f"./temp/segments/segment_{i}.srt"
        segment_lrc = f"./temp/segments/segment_{i}.lrc"
        segment_start = f"./temp/segments/segment_{i}.start"

        with open(segment_lyric, 'r') as f:
            lyrics = f.read()
        align2srt(lyrics, segment_audio, segment_srt)
        # The ".start" file holds the segment's offset in the full track, in seconds.
        with open(segment_start, 'r') as f:
            offset = float(f.read())
        srt2lrc(lyrics, segment_srt, segment_lrc, offset)

    # Combine the per-segment LRC files. The result is written explicitly as
    # UTF-8 because the next pipeline stage reads the LRC file as UTF-8; the
    # intermediate segment files keep the platform default they were written with.
    with open(output_file, 'w', encoding='utf-8') as f:
        for i in range(1, nums + 1):
            with open(f"./temp/segments/segment_{i}.lrc", 'r') as lrc_file:
                f.write(lrc_file.read())
            f.write('\n')
# Default input and output paths used when this module is run as a script.
SEGMENTS_FILE = './data/1.group'
AUDIO_FILE = './data/1.mp3'
LYRICS_FILE = './data/1.txt'
OUTPUT_FILE = './data/1.lrc'
# Script entry point: run the grouped alignment on the default paths.
if __name__ == "__main__":
    alignWithGroup(SEGMENTS_FILE, AUDIO_FILE, LYRICS_FILE, OUTPUT_FILE)

View File

@ -0,0 +1,65 @@
from pydub import AudioSegment
import os
def parse_line(line):
    """Parse one mapping line such as ``'1-26|00:42-02:07'``.

    :param line: ``<first>-<last>|<start>-<end>`` where the left part is a
        lyric line range and the right part a time range
    :return: ``(start_line, end_line, start_time, end_time)`` with the line
        numbers as ints and the times as the original strings
    """
    lines_part, times_part = line.split('|')
    first_line, last_line = (int(number) for number in lines_part.split('-'))
    begin, finish = times_part.split('-')
    return first_line, last_line, begin, finish
def time_to_milliseconds(time_str):
    """Convert a ``HH:MM:SS`` or ``MM:SS`` time string to milliseconds.

    :param time_str: colon-separated time made of integer fields
    :return: the time in milliseconds as an int
    :raises ValueError: if a field is not an integer or the string does not
        have exactly two or three fields
    """
    fields = [int(part) for part in time_str.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError("Invalid time format")
    if len(fields) == 2:
        # MM:SS — treat the missing hours field as zero.
        fields.insert(0, 0)
    hours, minutes, seconds = fields
    return (hours * 3600 + minutes * 60 + seconds) * 1000
def split_audio_and_text(mapping_file, audio_file, text_file, output_dir):
    """Split the audio and lyrics into matching segments.

    Each non-blank line of ``mapping_file`` names a range of lyric lines
    (1-based, counted after blank lines are removed) and a time range. For the
    n-th mapping three files are written to ``output_dir``:
    ``segment_n.txt``, ``segment_n.mp3`` and ``segment_n.start`` (the segment's
    start time in seconds).

    :param mapping_file: path to the mapping file
    :param audio_file: path to the full audio file
    :param text_file: path to the full lyrics text file
    :param output_dir: directory receiving the segment files (created if missing)
    """
    with open(mapping_file, 'r') as mapping_src:
        mappings = [parse_line(row.strip()) for row in mapping_src if row.strip()]

    audio = AudioSegment.from_file(audio_file)

    # Blank lyric lines are dropped so that line numbers count real lines only.
    with open(text_file, 'r') as lyrics_src:
        text_lines = [row for row in lyrics_src if row.strip()]

    os.makedirs(output_dir, exist_ok=True)

    for number, (start_line, end_line, start_time, end_time) in enumerate(mappings, start=1):
        # Line numbers are 1-based and inclusive.
        text_segment = text_lines[start_line - 1:end_line]

        start_ms = time_to_milliseconds(start_time)
        end_ms = time_to_milliseconds(end_time)
        audio_segment = audio[start_ms:end_ms]

        text_output_path = os.path.join(output_dir, f'segment_{number}.txt')
        with open(text_output_path, 'w') as text_out:
            text_out.writelines(text_segment)

        audio_output_path = os.path.join(output_dir, f'segment_{number}.mp3')
        audio_segment.export(audio_output_path, format='mp3')

        start_time_output_path = os.path.join(output_dir, f'segment_{number}.start')
        with open(start_time_output_path, 'w') as start_out:
            start_out.write(str(start_ms / 1000))

        print(f"Saved segment {number}: {text_output_path}, {audio_output_path}")

View File

@ -0,0 +1,48 @@
import pysrt
def parseTime(object):
    """Convert a time object to seconds.

    :param object: any object exposing ``hours``, ``minutes``, ``seconds``
        and ``milliseconds`` attributes
    :return: the total time in seconds
    """
    whole_seconds = object.hours * 3600 + object.minutes * 60 + object.seconds
    return whole_seconds + object.milliseconds / 1000
def serializeTime(time):
    """Format a time in seconds as an LRC timestamp ``MM:SS.mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields. Truncating the fractional part instead loses a millisecond to
    binary floating-point error (``1.001`` would be rendered as ``.000``).
    Minutes are not wrapped at 60, since LRC has no hours field.

    :param time: time in seconds (expected to be non-negative)
    :return: zero-padded ``MM:SS.mmm`` string
    """
    total_ms = round(time * 1000)
    total_seconds, milliseconds = divmod(total_ms, 1000)
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
def srt2lrc(lyrics, srt_file, lrc_file, time_offset=0):
    """Rebuild line-level LRC timings from a word-level SRT file.

    Subtitle entries are concatenated until the accumulated text equals the
    next expected lyric line; that line then starts at its first entry and
    ends at its last. Every line is written as ``[start] text``, and only the
    final line is followed by a bare ``[end]`` timestamp.

    If no lyric line could be matched, an empty file is written instead of
    failing with an ``IndexError``.

    :param lyrics: the expected lyrics, one line per lyric line
    :param srt_file: path to the word-level SRT file (UTF-8)
    :param lrc_file: path of the LRC file to write
    :param time_offset: seconds added to every timestamp
    """
    subs = pysrt.open(srt_file, encoding='utf-8')
    lyrics_lines = lyrics.splitlines()

    matched = []  # (start_seconds, end_seconds, text) of each recognised line
    next_line = 0  # index of the lyric line still waiting to be matched
    current_line = ""
    start_time = None

    for sub in subs:
        word = sub.text.strip()
        if not current_line:
            # First entry of a new line: remember where the line starts.
            start_time = parseTime(sub.start)
        current_line += word

        if next_line < len(lyrics_lines) and current_line == lyrics_lines[next_line]:
            matched.append((start_time, parseTime(sub.end), current_line))
            next_line += 1
            current_line = ""
            start_time = None

    result = [
        f"[{serializeTime(start + time_offset)}] {text}"
        for start, _, text in matched
    ]
    if matched:
        # Only the last line keeps an explicit end timestamp.
        result.append(f"[{serializeTime(matched[-1][1] + time_offset)}]")

    # Written with the platform default encoding on purpose: the segment LRC
    # files are read back by alignWithGroup with the default encoding as well.
    with open(lrc_file, 'w') as f:
        f.write('\n'.join(result))