From 2698a752776ce5500ead2c798decc9e8720d1a99 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Fri, 27 Dec 2024 19:46:22 +0800 Subject: [PATCH] add: AI lyrics aligning --- .gitignore | 178 +++++++++++++++++++++++++++++ align-pipeline.md | 3 + mmsAlignment/align2LRC.py | 6 + mmsAlignment/alignWithMMS.py | 167 +++++++++++++++++++++++++++ mmsAlignment/splitSong.py | 58 ++++++++++ utils/audio.py | 16 +++ utils/cleanTempDir.py | 6 + utils/ttml.py | 57 +++++++++ whisperAlignment/align2srt.py | 6 + whisperAlignment/alignWithGroup.py | 47 ++++++++ whisperAlignment/splitGroups.py | 65 +++++++++++ whisperAlignment/srt2lrc.py | 48 ++++++++ 12 files changed, 657 insertions(+) create mode 100644 .gitignore create mode 100644 align-pipeline.md create mode 100644 mmsAlignment/align2LRC.py create mode 100644 mmsAlignment/alignWithMMS.py create mode 100644 mmsAlignment/splitSong.py create mode 100644 utils/audio.py create mode 100644 utils/cleanTempDir.py create mode 100644 utils/ttml.py create mode 100644 whisperAlignment/align2srt.py create mode 100644 whisperAlignment/alignWithGroup.py create mode 100644 whisperAlignment/splitGroups.py create mode 100644 whisperAlignment/srt2lrc.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..622929a --- /dev/null +++ b/.gitignore @@ -0,0 +1,178 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# PyPI configuration file +.pypirc + + +# Project specific +/temp +/data + +.DS_Store \ No newline at end of file diff --git a/align-pipeline.md b/align-pipeline.md new file mode 100644 index 0000000..b140b9d --- /dev/null +++ b/align-pipeline.md @@ -0,0 +1,3 @@ +1. prepare `1.mp3`, `1.txt`, `1.group` in `./data` +2. `whisperAlignment/alignWithGroup` +3. 
# align-pipeline.md, step 3 (continued from the previous line): `mmsAlignment`

# ---- mmsAlignment/align2LRC.py (new file, mode 100644) ----
from utils.ttml import extract_lrc_from_ttml

# Convert the word-level TTML document into a plain line-level LRC file.
# NOTE(review): alignWithMMS.py's __main__ block writes ./data/output.ttml,
# while this script reads ./data/1.ttml — confirm which path the pipeline
# is expected to use.
lrc_output = extract_lrc_from_ttml('./data/1.ttml')

# The lyrics are Chinese text: never rely on the platform default encoding.
with open('./data/1-final.lrc', 'w', encoding='utf-8') as f:
    f.write(lrc_output)


# ---- mmsAlignment/alignWithMMS.py (new file, mode 100644) ----
import os
import re
import torch
import torchaudio
from typing import List
from pypinyin import lazy_pinyin
from pypinyin_dict.phrase_pinyin_data import cc_cedict
from torchaudio.transforms import Resample
from tqdm import tqdm
from utils.ttml import TTMLGenerator
from utils.audio import get_audio_duration

# Initialise the device, model, tokenizer and aligner once, at import time.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bundle = torchaudio.pipelines.MMS_FA
model = bundle.get_model().to(device)
tokenizer = bundle.get_tokenizer()
aligner = bundle.get_aligner()

cc_cedict.load()

# "[mm:ss]" and "[mm:ss.xx]" tags are both accepted; the fractional part is
# optional in LRC files found in the wild.
_LRC_LINE = re.compile(r'\[(\d+):(\d+(?:\.\d+)?)\](.*)')
_SEGMENT_WAV = re.compile(r'line-(\d+)\.wav')


def timestamp(seconds: float) -> str:
    """Convert a duration in seconds to a TTML ``HH:MM:SS.mmm`` timestamp.

    The value is rounded to the nearest millisecond on the whole quantity.
    Truncating the fractional part separately loses a millisecond to float
    error (e.g. 1.001 s used to be rendered as ``00:00:01.000``).
    Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02}.{milliseconds:03}"


def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    """Run the forced aligner on *waveform* for the given token list.

    :param waveform: audio tensor passed straight to the model
    :param transcript: list of romanised tokens, one per aligned unit
    :return: ``(emission, token_spans)`` as produced by the model and aligner
    """
    with torch.inference_mode():
        emission, _ = model(waveform.to(device))
        token_spans = aligner(emission[0], tokenizer(transcript))
    return emission, token_spans


def parse_lrc(lrc_file, audio_len):
    """Parse an LRC file into ``(lyric, start_time, end_time)`` tuples.

    Each line ends where the next tagged line (empty or not) starts; the last
    line ends at *audio_len*. Lines whose lyric is empty are used only as end
    markers and are not returned. ASCII spaces inside a lyric are removed.

    :param lrc_file: path of the LRC file (UTF-8, optional BOM)
    :param audio_len: total audio length in seconds
    :return: list of ``(lyric, start_seconds, end_seconds)``
    """
    # utf-8-sig transparently drops a BOM that would otherwise stop the first
    # line from matching the pattern.
    with open(lrc_file, 'r', encoding='utf-8-sig') as f:
        lines = f.readlines()

    entries = []
    for line in lines:
        match = _LRC_LINE.match(line)
        if match:
            lyric = match.group(3).strip().replace(" ", "")
            start = int(match.group(1)) * 60 + float(match.group(2))
            entries.append((lyric, start))

    lrc_data = []
    for i, (lyric, start_time) in enumerate(entries):
        if not lyric:
            continue
        end_time = entries[i + 1][1] if i < len(entries) - 1 else audio_len
        lrc_data.append((lyric, start_time, end_time))

    return lrc_data


def extract_numbers_from_files(directory):
    """Collect the numeric part of every ``line-<n>.wav`` file in *directory*.

    :param directory: directory path
    :return: list of the numbers found (unsorted), or ``None`` when the
        directory cannot be read
    """
    try:
        filenames = os.listdir(directory)
    except OSError as e:
        print(f"Error reading directory: {e}")
        return None

    numbers = []
    for filename in filenames:
        # fullmatch so that e.g. "line-3.wav.bak" is not picked up.
        match = _SEGMENT_WAV.fullmatch(filename)
        if match:
            numbers.append(int(match.group(1)))
    return numbers


def _read_segment_start(time_file, default):
    """Return the first number stored in *time_file*, or *default*.

    splitSong.py writes "<start>,<end>" (seconds) next to every exported
    segment; the start is the position of the segment in the full track.
    """
    try:
        with open(time_file, "r", encoding="utf-8") as f:
            return float(f.read().split(",")[0])
    except (OSError, ValueError):
        return default


def process_line(line_idx, start_time, segments_dir="./temp/lines"):
    """Force-align one exported lyric line and return per-character timings.

    :param line_idx: number of the segment (``line-<line_idx>.wav/.txt``)
    :param start_time: start of the line in the full track, in seconds; used
        as the time offset when the segment has no ``.time`` file
    :param segments_dir: directory holding the exported segments
    :return: list of ``(character, begin_timestamp, end_timestamp)``
    """
    base = os.path.join(segments_dir, f"line-{line_idx}")
    with open(base + ".txt", "r", encoding="utf-8") as f:
        text = f.read()

    waveform, sample_rate = torchaudio.load(base + ".wav")

    # Keep the first channel only and resample to the rate the model expects.
    target_rate = int(bundle.sample_rate)
    waveform = waveform[0:1]
    resampler = Resample(orig_freq=sample_rate, new_freq=target_rate)
    waveform = resampler(waveform)

    # The segment was exported with 0.1 s of padding before the LRC start, so
    # frame positions are relative to the padded start recorded in the .time
    # file, not to the LRC timestamp.
    offset = _read_segment_start(base + ".time", start_time)

    # NOTE(review): the labels below assume one pinyin token per character of
    # `text`. That holds for purely Chinese lines; lines containing Latin
    # words, digits or punctuation will be mislabelled (or rejected by the
    # tokenizer) — confirm the expected input.
    text_pinyin = lazy_pinyin(text)
    text_normalized = " ".join(text_pinyin)

    transcript = text_normalized.split()
    emission, token_spans = compute_alignments(waveform, transcript)
    num_frames = emission.size(1)
    ratio = waveform.size(1) / num_frames

    words = []
    for i, spans in enumerate(token_spans):
        words.append({
            "word": text[i],
            "start": offset + int(ratio * spans[0].start) / target_rate,
            "end": offset + int(ratio * spans[-1].end) / target_rate,
        })

    # Close the gaps: every word lasts until the next one begins.
    for current, following in zip(words, words[1:]):
        current["end"] = following["start"]

    return [
        (word["word"], timestamp(word["start"]), timestamp(word["end"]))
        for word in words
    ]


def align(audio_file: str, lrc_file: str, output_ttml: str, segments_dir: str = "./temp/lines"):
    """Align audio with its lyrics and write the result as a TTML file.

    :param audio_file: path of the full audio file
    :param lrc_file: path of the line-level LRC file
    :param output_ttml: path of the TTML file to write
    :param segments_dir: directory holding the per-line segments
        (default ``./temp/lines``)
    :raises RuntimeError: when the audio duration cannot be determined
    :raises FileNotFoundError: when *segments_dir* cannot be read
    :raises ValueError: when there are more segments than lyric lines
    """
    duration = get_audio_duration(audio_file)
    if duration is None:
        raise RuntimeError(f"Could not read the duration of {audio_file!r}")

    lrc_data = parse_lrc(lrc_file, duration)

    line_numbers = extract_numbers_from_files(segments_dir)
    if line_numbers is None:
        raise FileNotFoundError(f"Could not read segments directory {segments_dir!r}")
    lines_to_process = sorted(line_numbers)
    if len(lines_to_process) > len(lrc_data):
        raise ValueError(
            f"{len(lines_to_process)} segments found but only "
            f"{len(lrc_data)} lyric lines in {lrc_file!r}"
        )

    ttml_generator = TTMLGenerator(duration=timestamp(duration))

    # Segments and non-empty lyric lines are paired in order.
    for i, line_num in enumerate(tqdm(lines_to_process)):
        _, start_time, end_time = lrc_data[i]
        result = process_line(line_num, start_time, segments_dir)
        ttml_generator.add_lyrics(
            begin=timestamp(start_time), end=timestamp(end_time), agent="v1", itunes_key=f"L{i+1}",
            words=result
        )

    ttml_generator.save(output_ttml)


if __name__ == "__main__":
    align("./data/1.flac", "./data/1.lrc", "./data/output.ttml", "./temp/lines")


# ---- mmsAlignment/splitSong.py (new file, mode 100644) ----
import re
from pydub import AudioSegment
from utils.cleanTempDir import cleanTempDir


def parse_lrc(lrc_file):
    """Parse an LRC file into a list of ``(timestamp_seconds, lyric)`` tuples.

    Empty lyrics are kept; they mark the end of the preceding line. ASCII
    spaces inside a lyric are removed.
    """
    with open(lrc_file, 'r', encoding='utf-8-sig') as f:
        lines = f.readlines()

    lrc_data = []
    for line in lines:
        # The fractional part of the seconds field is optional.
        match = re.match(r'\[(\d+):(\d+(?:\.\d+)?)\](.*)', line)
        if match:
            lyric = match.group(3).strip().replace(" ", "")
            start = int(match.group(1)) * 60 + float(match.group(2))
            lrc_data.append((start, lyric))

    return lrc_data


def split_audio_by_lrc(audio_file, lrc_data, output_dir="./temp/lines"):
    """Cut the audio into one WAV file per non-empty lyric line.

    For line number ``n`` (1-based position in *lrc_data*) three files are
    written to *output_dir*: ``line-n.wav`` (audio), ``line-n.txt`` (lyric)
    and ``line-n.time`` (``start,end`` of the exported audio in seconds).
    *output_dir* is emptied first.

    :param audio_file: path of the full audio file
    :param lrc_data: list of ``(timestamp_seconds, lyric)`` from ``parse_lrc``
    :param output_dir: directory receiving the segments
    """
    audio = AudioSegment.from_file(audio_file)
    audio_seconds = len(audio) / 1000
    cleanTempDir(output_dir)

    for i, (start_time, lyric) in enumerate(lrc_data):
        if lyric.strip() == "":
            continue
        # A line ends where the next tag starts; the last one runs to the end.
        end_time = lrc_data[i + 1][0] if i < len(lrc_data) - 1 else audio_seconds
        # Pad by 0.1 s on both sides, clamped to the track.
        start_time = max(0, start_time - 0.1)
        end_time = min(audio_seconds, end_time + 0.1)

        segment = audio[start_time * 1000:end_time * 1000]
        output_file = f"{output_dir}/line-{i+1}.wav"
        output_script = f"{output_dir}/line-{i+1}.txt"
        output_time = f"{output_dir}/line-{i+1}.time"
        segment.export(output_file, format="wav")
        with open(output_script, "w", encoding="utf-8") as f:
            f.write(lyric)
        with open(output_time, "w", encoding="utf-8") as f:
            f.write(str(start_time)+","+str(end_time))
        print(f"Saved {output_file}")


if __name__ == "__main__":
    lrc_file = "./data/1.lrc"      # path of the LRC file
    audio_file = "./data/1.flac"   # path of the audio file

    lrc_data = parse_lrc(lrc_file)
    split_audio_by_lrc(audio_file, lrc_data)


# ---- utils/audio.py (new file, mode 100644) ----
from pydub import AudioSegment


def get_audio_duration(file_path):
    """Return the duration of an audio file in seconds.

    :param file_path: path of the audio file
    :return: duration in seconds, or ``None`` when the file cannot be read
        (the error is printed)
    """
    try:
        audio = AudioSegment.from_file(file_path)
    except Exception as e:
        # Deliberate best effort: callers must check for None.
        print(f"Error reading audio file: {e}")
        return None
    return len(audio) / 1000.0


# ---- utils/cleanTempDir.py (new file, mode 100644) ----
import os
import shutil


def cleanTempDir(dir_path):
    """Delete *dir_path* with everything in it, then recreate it empty."""
    shutil.rmtree(dir_path, ignore_errors=True)
    os.makedirs(dir_path, exist_ok=True)


# ---- diff --git a/utils/ttml.py b/utils/ttml.py (new file; continues on the next line) ----
# ---- utils/ttml.py (new file, mode 100644) ----
import xml.etree.ElementTree as ET


class TTMLGenerator:
    """Build a TTML lyrics document with per-word timing.

    The document has the shape ``tt > head > metadata`` and
    ``tt > body[dur] > div``; every call to :meth:`add_lyrics` appends one
    ``<p>`` (a lyric line) holding one ``<span>`` per word.
    """

    def __init__(
        self,
        duration,
        xmlns="http://www.w3.org/ns/ttml",
        xmlns_ttm="http://www.w3.org/ns/ttml#metadata",
        xmlns_amll="http://www.example.com/ns/amll",
        xmlns_itunes="http://music.apple.com/lyric-ttml-internal",
    ):
        """Create the document skeleton.

        :param duration: value of the ``dur`` attribute of ``<body>``
        :param xmlns: default namespace URI
        :param xmlns_ttm: URI bound to the ``ttm`` prefix
        :param xmlns_amll: URI bound to the ``amll`` prefix
        :param xmlns_itunes: URI bound to the ``itunes`` prefix
        """
        # The namespace declarations are written as literal attributes so the
        # serialised prefixes are exactly the ones given here.
        self.tt = ET.Element("tt", attrib={
            "xmlns": xmlns,
            "xmlns:ttm": xmlns_ttm,
            "xmlns:amll": xmlns_amll,
            "xmlns:itunes": xmlns_itunes,
        })
        self.head = ET.SubElement(self.tt, "head")
        self.metadata = ET.SubElement(self.head, "metadata")
        self.body = ET.SubElement(self.tt, "body", attrib={"dur": duration})
        self.div = ET.SubElement(self.body, "div")

    def add_lyrics(self, begin, end, agent, itunes_key, words):
        """Append one lyric line.

        :param begin: timestamp at which the line starts
        :param end: timestamp at which the line ends
        :param agent: value of the ``ttm:agent`` attribute
        :param itunes_key: value of the ``itunes:key`` attribute
        :param words: iterable of ``(text, begin, end)`` triples, one per span
        """
        p = ET.SubElement(self.div, "p", attrib={
            "begin": begin,
            "end": end,
            "ttm:agent": agent,
            "itunes:key": itunes_key,
        })
        for word, start, stop in words:
            span = ET.SubElement(p, "span", attrib={"begin": start, "end": stop})
            span.text = word

    def save(self, filename):
        """Write the document to *filename* as UTF-8 with an XML declaration."""
        tree = ET.ElementTree(self.tt)
        tree.write(filename, encoding="utf-8", xml_declaration=True)


def _ttml_time_to_lrc(ttml_time):
    """Turn a TTML ``HH:MM:SS.mmm`` clock time into an LRC ``MM:SS.mmm`` tag.

    LRC has no hour field, so hours are folded into the minutes instead of
    being dropped. Values that are not three numeric ``:``-separated fields
    are returned unchanged.
    """
    fields = ttml_time.strip().split(":")
    if len(fields) != 3:
        return ttml_time
    hours, minutes, seconds = fields
    try:
        total_minutes = int(hours) * 60 + int(minutes)
    except ValueError:
        return ttml_time
    # The seconds field is copied verbatim to keep its original precision.
    return f"{total_minutes:02d}:{seconds}"


def extract_lrc_from_ttml(ttml_file):
    """Convert a TTML lyrics file into line-level LRC text.

    Every ``<p>`` produces ``[begin] text`` (the concatenated text of its
    ``<span>`` children) followed by a bare ``[end]`` line. A ``<p>`` without
    a ``begin`` attribute is skipped; one without ``end`` gets no end line.

    :param ttml_file: path or file object of the TTML document
    :return: the LRC lines joined with ``"\\n"``
    """
    root = ET.parse(ttml_file).getroot()
    namespace = {"": "http://www.w3.org/ns/ttml", "ttm": "http://www.w3.org/ns/ttml#metadata"}

    lrc_lines = []
    for p in root.findall(".//p", namespace):
        begin = p.attrib.get("begin")
        if begin is None:
            continue
        text_content = "".join(span.text or "" for span in p.findall("span", namespace))
        lrc_lines.append(f"[{_ttml_time_to_lrc(begin)}] {text_content}")

        end = p.attrib.get("end")
        if end is not None:
            # The end time goes on a line of its own.
            lrc_lines.append(f"[{_ttml_time_to_lrc(end)}]")

    return "\n".join(lrc_lines)


# ---- diff --git a/whisperAlignment/align2srt.py (header continues on the next line) ----
# ---- whisperAlignment/align2srt.py (new file, mode 100644) ----
import functools

import stable_whisper


@functools.lru_cache(maxsize=1)
def _load_model(name='large-v3'):
    """Load the Whisper model once and reuse it for every segment."""
    return stable_whisper.load_model(name)


def align2srt(lyrics, audio_path, output_path):
    """Align *lyrics* to the audio and write a word-level SRT file.

    :param lyrics: lyric text to align
    :param audio_path: path of the audio segment
    :param output_path: path of the SRT file to write
    """
    # Loading large-v3 is expensive; the cached loader avoids doing it again
    # for each segment processed by alignWithGroup.
    model = _load_model()
    result = model.align(audio_path, lyrics, language="Chinese", regroup=False)
    result.to_srt_vtt(output_path, segment_level=False)


# ---- whisperAlignment/alignWithGroup.py (new file, mode 100644) ----
import os
from whisperAlignment.splitGroups import split_audio_and_text
from whisperAlignment.align2srt import align2srt
from whisperAlignment.srt2lrc import srt2lrc
from utils.cleanTempDir import cleanTempDir


def alignWithGroup(segments_file, audio_file, lyrics_file, output_file):
    """Align lyrics group by group and merge the result into one LRC file.

    The audio and lyrics are split according to *segments_file*, every
    segment is aligned on its own, and the per-segment LRC files are
    concatenated (each followed by a newline) into *output_file*.

    :param segments_file: mapping file with lines like ``1-26|00:42-02:07``
    :param audio_file: path of the full audio file
    :param lyrics_file: path of the full lyrics text file
    :param output_file: path of the combined LRC file to write
    """
    # Start from an empty ./temp/segments directory.
    cleanTempDir('./temp/segments')

    split_audio_and_text(segments_file, audio_file, lyrics_file, 'temp/segments')

    # One .txt file is written per segment, so counting them gives the total.
    nums = sum(1 for name in os.listdir('./temp/segments') if name.endswith('.txt'))

    segment_lrcs = []
    for i in range(1, nums + 1):
        stem = f"./temp/segments/segment_{i}"
        with open(f"{stem}.txt", 'r', encoding='utf-8') as f:
            lyrics = f.read()
        align2srt(lyrics, f"{stem}.mp3", f"{stem}.srt")
        with open(f"{stem}.start", 'r', encoding='utf-8') as f:
            offset = float(f.read())
        # Shift the segment-relative times back onto the full track.
        srt2lrc(lyrics, f"{stem}.srt", f"{stem}.lrc", offset)
        segment_lrcs.append(f"{stem}.lrc")

    with open(output_file, 'w', encoding='utf-8') as f:
        for lrc in segment_lrcs:
            with open(lrc, 'r', encoding='utf-8') as lrc_file:
                f.write(lrc_file.read())
            f.write('\n')


SEGMENTS_FILE = './data/1.group'
AUDIO_FILE = './data/1.mp3'
LYRICS_FILE = './data/1.txt'
OUTPUT_FILE = './data/1.lrc'

if __name__ == "__main__":
    alignWithGroup(SEGMENTS_FILE, AUDIO_FILE, LYRICS_FILE, OUTPUT_FILE)


# ---- whisperAlignment/splitGroups.py (new file, mode 100644) ----
from pydub import AudioSegment
import os


def parse_line(line):
    """Parse a mapping line such as ``'1-26|00:42-02:07'``.

    :return: ``(start_line, end_line, start_time, end_time)`` where the line
        numbers are ints and the times are the raw strings
    """
    line_range, time_range = line.split('|')
    start_line, end_line = map(int, line_range.split('-'))
    start_time, end_time = time_range.split('-')
    return (start_line, end_line, start_time, end_time)


def time_to_milliseconds(time_str):
    """Convert ``HH:MM:SS`` or ``MM:SS`` to integer milliseconds.

    The seconds field may carry a fractional part (``01:02.5``).

    :raises ValueError: for any other number of fields or non-numeric fields
    """
    parts = time_str.split(':')
    if len(parts) not in (2, 3):
        raise ValueError("Invalid time format")
    *leading, seconds_text = parts
    whole = [int(part) for part in leading]
    hours, minutes = (0, whole[0]) if len(whole) == 1 else whole
    seconds = float(seconds_text)
    return int(round((hours * 3600 + minutes * 60 + seconds) * 1000))


def split_audio_and_text(mapping_file, audio_file, text_file, output_dir):
    """Split audio and lyrics into matching segments.

    For the n-th mapping line (1-based) three files are written to
    *output_dir*: ``segment_n.txt`` (the lyric lines of the group),
    ``segment_n.mp3`` (the audio slice) and ``segment_n.start`` (start of the
    slice in seconds).

    :param mapping_file: file with one ``first-last|start-end`` line per group;
        line numbers are 1-based and count non-empty lyric lines only
    :param audio_file: path of the full audio file
    :param text_file: path of the full lyrics file
    :param output_dir: directory receiving the segments (created if missing)
    """
    with open(mapping_file, 'r', encoding='utf-8') as f:
        mappings = [parse_line(line.strip()) for line in f if line.strip()]

    audio = AudioSegment.from_file(audio_file)

    # Blank lines are dropped before the line numbers are applied.
    with open(text_file, 'r', encoding='utf-8') as f:
        text_lines = [line for line in f if line.strip()]

    os.makedirs(output_dir, exist_ok=True)

    for number, (start_line, end_line, start_time, end_time) in enumerate(mappings, start=1):
        text_segment = text_lines[start_line - 1:end_line]

        start_ms = time_to_milliseconds(start_time)
        end_ms = time_to_milliseconds(end_time)
        audio_segment = audio[start_ms:end_ms]

        text_output_path = os.path.join(output_dir, f'segment_{number}.txt')
        with open(text_output_path, 'w', encoding='utf-8') as text_out:
            text_out.writelines(text_segment)

        audio_output_path = os.path.join(output_dir, f'segment_{number}.mp3')
        audio_segment.export(audio_output_path, format='mp3')

        start_time_output_path = os.path.join(output_dir, f'segment_{number}.start')
        with open(start_time_output_path, 'w', encoding='utf-8') as start_out:
            start_out.write(str(start_ms / 1000))

        print(f"Saved segment {number}: {text_output_path}, {audio_output_path}")


# ---- whisperAlignment/srt2lrc.py (new file, mode 100644) ----
from collections import deque

import pysrt


def parseTime(object):
    """Return the time held by *object* in seconds.

    *object* must expose ``hours``, ``minutes``, ``seconds`` and
    ``milliseconds``. The parameter name shadows the builtin but is kept
    for backward compatibility with keyword callers.
    """
    return object.hours * 3600 + object.minutes * 60 + object.seconds + object.milliseconds / 1000


def serializeTime(time):
    """Format a time in seconds as an LRC ``MM:SS.mmm`` tag body.

    The whole value is rounded to the nearest millisecond first; truncating
    the fractional part on its own drops a millisecond to float error.
    """
    total_ms = int(round(time * 1000))
    minutes, remainder = divmod(total_ms, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"


def _squash(text):
    """Remove all whitespace so that spacing differences do not block a match."""
    return "".join(text.split())


def srt2lrc(lyrics, srt_file, lrc_file, time_offset=0):
    """Turn a word-level SRT file into a line-level LRC file.

    SRT entries are concatenated until they spell the next expected lyric
    line (ignoring whitespace); that line is then emitted with the start time
    of its first entry. Only the last emitted line is followed by an end tag.
    If no line can be matched an empty file is written.

    :param lyrics: lyric text, one line per expected LRC line
    :param srt_file: path of the word-level SRT file
    :param lrc_file: path of the LRC file to write
    :param time_offset: seconds added to every timestamp
    """
    subs = pysrt.open(srt_file, encoding='utf-8')

    # Blank lines can never be matched and would stall everything after them.
    pending = deque(line.strip() for line in lyrics.splitlines() if line.strip())

    aligned = []  # (start_seconds, end_seconds, lyric_line)
    current_line = ""
    start_time = None

    for sub in subs:
        word = sub.text.strip()
        if not current_line:
            start_time = parseTime(sub.start)  # start of the line being built

        current_line += word

        if pending and _squash(current_line) == _squash(pending[0]):
            aligned.append((start_time, parseTime(sub.end), pending.popleft()))
            current_line = ""
            start_time = None

    result = [f"[{serializeTime(start + time_offset)}] {text}" for start, _, text in aligned]
    if aligned:
        # Keep the end time of the final line only.
        result.append(f"[{serializeTime(aligned[-1][1] + time_offset)}]")

    with open(lrc_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(result))