add: AI lyrics aligning

2024-12-27 19:46:22 +08:00 · 2024-12-27 19:46:22 +08:00 · 2698a75277
commit 2698a75277
12 changed files with 657 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,178 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# PyPI configuration file
+.pypirc
+
+
+# Project specific
+/temp
+/data
+
+.DS_Store
--- a/align-pipeline.md
+++ b/align-pipeline.md
@ -0,0 +1,3 @@
+1. prepare `1.mp3`, `1.txt`, `1.group` in `./data`
+2. `whisperAlignment/alignWithGroup`
+3. `mmsAlignment` (`splitSong`, then `alignWithMMS`)
--- a/mmsAlignment/align2LRC.py
+++ b/mmsAlignment/align2LRC.py
@ -0,0 +1,6 @@
+from utils.ttml import extract_lrc_from_ttml
+
+# Convert the TTML lyrics in ./data into line-level LRC text.
+lrc_output = extract_lrc_from_ttml('./data/1.ttml')
+
+# Write UTF-8 explicitly: the lyrics are Chinese text, and the platform default
+# encoding (e.g. a legacy code page on Windows) is not guaranteed to encode them.
+with open('./data/1-final.lrc', 'w', encoding='utf-8') as f:
+    f.write(lrc_output)
--- a/mmsAlignment/alignWithMMS.py
+++ b/mmsAlignment/alignWithMMS.py
@ -0,0 +1,167 @@
+import os
+import re
+import torch
+import torchaudio
+from typing import List
+from pypinyin import lazy_pinyin
+from pypinyin_dict.phrase_pinyin_data import cc_cedict
+from torchaudio.transforms import Resample
+from tqdm import tqdm
+from utils.ttml import TTMLGenerator
+from utils.audio import get_audio_duration
+
+# Initialise the device, model, tokenizer and aligner used for forced alignment.
+# Everything below runs at import time, so importing this module loads the model.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# torchaudio's MMS_FA pipeline bundle supplies the model, tokenizer and aligner.
+bundle = torchaudio.pipelines.MMS_FA
+model = bundle.get_model().to(device)
+tokenizer = bundle.get_tokenizer()
+aligner = bundle.get_aligner()
+
+# NOTE(review): presumably registers the CC-CEDICT phrase data with pypinyin so
+# that lazy_pinyin() picks better readings — confirm against pypinyin_dict docs.
+cc_cedict.load()
+
+def timestamp(seconds: float) -> str:
+    """Convert a non-negative duration in seconds to a TTML timestamp.
+
+    :param seconds: duration in seconds (may be fractional)
+    :return: the duration formatted as ``HH:MM:SS.mmm``
+    """
+    # Round once to integer milliseconds and split with divmod. Truncating the
+    # fractional part of a float loses a millisecond on values such as 2.3
+    # (stored as 2.2999...), which would otherwise render as ".299".
+    total_ms = int(round(seconds * 1000))
+    hours, remainder = divmod(total_ms, 3600 * 1000)
+    minutes, remainder = divmod(remainder, 60 * 1000)
+    whole_seconds, milliseconds = divmod(remainder, 1000)
+    return f"{hours:02}:{minutes:02}:{whole_seconds:02}.{milliseconds:03}"
+
+def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
+    """Run the module-level model and aligner on one waveform.
+
+    :param waveform: audio tensor passed to the model after moving it to ``device``
+    :param transcript: list of words handed to the tokenizer
+    :return: ``(emission, token_spans)`` — the model's first output and the
+             aligner's result for the first item of that output
+    """
+    with torch.inference_mode():
+        model_output = model(waveform.to(device))
+        emission = model_output[0]
+        token_ids = tokenizer(transcript)
+        token_spans = aligner(emission[0], token_ids)
+    return emission, token_spans
+
+def parse_lrc(lrc_file, audio_len):
+    """Parse an LRC file into ``(lyric, start_time, end_time)`` tuples.
+
+    Each line of the form ``[mm:ss.xx]text`` is read; spaces are removed from
+    the lyric. A line ends where the next timestamped line (empty or not)
+    begins, and the last line ends at ``audio_len``. Lines with an empty lyric
+    act only as end markers and are not returned.
+
+    :param lrc_file: path to the UTF-8 encoded LRC file
+    :param audio_len: audio duration in seconds, used as the end of the last line
+    :return: list of ``(lyric, start_time, end_time)`` with times in seconds
+    """
+    line_pattern = re.compile(r'\[(\d+):(\d+\.\d+)\](.*)')
+
+    entries = []
+    with open(lrc_file, 'r', encoding='utf-8') as handle:
+        for raw_line in handle:
+            found = line_pattern.match(raw_line)
+            if found is None:
+                continue
+            mins, secs, body = found.groups()
+            text = body.strip().replace(" ", "")
+            entries.append((text, int(mins) * 60 + float(secs)))
+
+    timed_lines = []
+    last_index = len(entries) - 1
+    for position, (text, begin) in enumerate(entries):
+        # Empty lyrics only mark where the previous line ends.
+        if text.strip() == "":
+            continue
+        finish = entries[position + 1][1] if position < last_index else audio_len
+        timed_lines.append((text, begin, finish))
+
+    return timed_lines
+
+def extract_numbers_from_files(directory):
+    """Collect the line numbers of the ``line-<N>.wav`` files in a directory.
+
+    Only names that are exactly ``line-<digits>.wav`` count; leftovers such as
+    ``line-3.wav.bak`` are ignored.
+
+    :param directory: path of the directory to scan
+    :return: list of the numbers found (in directory-listing order), or
+             ``None`` if the directory could not be listed
+    """
+    pattern = re.compile(r'line-(\d+)\.wav')
+
+    try:
+        filenames = os.listdir(directory)
+    except (OSError, TypeError, ValueError) as e:
+        # Keep the original best-effort contract: report and return None.
+        print(f"Error reading directory: {e}")
+        return None
+
+    numbers = []
+    for filename in filenames:
+        # fullmatch, not match: a prefix match would also accept "line-3.wav.bak".
+        match = pattern.fullmatch(filename)
+        if match:
+            numbers.append(int(match.group(1)))
+
+    return numbers
+
+def process_line(line_idx, start_time):
+    """Align one exported lyric line and return per-character timings.
+
+    Reads ``./temp/lines/line-<line_idx>.txt`` and ``.wav``, converts the text
+    to pinyin, aligns it against the audio, and offsets every time by
+    ``start_time``. Each entry ends where the next one starts; the last entry
+    keeps the end reported by the aligner.
+
+    :param line_idx: number of the line files to load
+    :param start_time: offset in seconds added to every aligned time
+    :return: list of ``(character, begin, end)`` with TTML timestamp strings
+    """
+    with open(f"./temp/lines/line-{line_idx}.txt", "r") as script_file:
+        text = script_file.read()
+
+    audio, source_rate = torchaudio.load(f"./temp/lines/line-{line_idx}.wav")
+
+    # Keep only the first channel (the slice preserves the channel dimension),
+    # then resample to 16 kHz.
+    mono = audio[0:1]
+    mono_16k = Resample(orig_freq=source_rate, new_freq=16000)(mono)
+
+    transcript = " ".join(lazy_pinyin(text)).split()
+    emission, token_spans = compute_alignments(mono_16k, transcript)
+    samples_per_frame = mono_16k.size(1) / emission.size(1)
+
+    def to_seconds(frame):
+        # Emission frame index -> sample index -> seconds, shifted by start_time.
+        return start_time + int(samples_per_frame * frame) / 16000
+
+    characters = [text[i] for i in range(len(token_spans))]
+    starts = [to_seconds(spans[0].start) for spans in token_spans]
+    ends = [to_seconds(spans[-1].end) for spans in token_spans]
+
+    # Close the gaps: every entry ends at the next entry's start, the last one
+    # keeps its own aligned end.
+    ends = starts[1:] + ends[-1:]
+
+    return [
+        (character, timestamp(begin), timestamp(finish))
+        for character, begin, finish in zip(characters, starts, ends)
+    ]
+
+
+def _read_segment_start(segments_dir, line_num, default):
+    """Return the start (seconds) of the exported clip ``line-<line_num>.wav``.
+
+    splitSong writes ``line-<N>.time`` as ``"<start>,<end>"`` holding the padded
+    clip boundaries. Falls back to ``default`` if the file is missing or malformed.
+    """
+    time_path = os.path.join(segments_dir, f"line-{line_num}.time")
+    try:
+        with open(time_path, "r") as f:
+            return float(f.read().split(",")[0])
+    except (OSError, ValueError):
+        return default
+
+
+def align(audio_file: str, lrc_file: str, output_ttml: str, segments_dir: str = "./temp/lines"):
+    """
+    Align audio with its lyrics and write the result as a TTML file.
+
+    The i-th clip found in ``segments_dir`` (sorted by line number) is paired
+    with the i-th non-empty LRC line. Note that ``process_line`` always loads
+    the clips from ``./temp/lines``.
+
+    :param audio_file: path of the full audio file (used for its duration)
+    :param lrc_file: path of the line-level LRC lyrics file
+    :param output_ttml: path of the TTML file to write
+    :param segments_dir: directory holding the per-line clips, defaults to "./temp/lines"
+    :raises ValueError: if the audio duration cannot be read, or there are
+                        more clips than lyric lines
+    :raises FileNotFoundError: if ``segments_dir`` cannot be listed
+    """
+    # get_audio_duration() returns None on failure; fail with a clear message
+    # instead of a TypeError further down.
+    duration = get_audio_duration(audio_file)
+    if duration is None:
+        raise ValueError(f"Could not determine the duration of {audio_file!r}")
+
+    lrc_data = parse_lrc(lrc_file, duration)
+
+    line_numbers = extract_numbers_from_files(segments_dir)
+    if line_numbers is None:
+        raise FileNotFoundError(f"Could not list segment directory {segments_dir!r}")
+    lines_to_process = sorted(line_numbers)
+    if len(lines_to_process) > len(lrc_data):
+        raise ValueError(
+            f"Found {len(lines_to_process)} clips but only {len(lrc_data)} lyric lines"
+        )
+
+    ttml_generator = TTMLGenerator(duration=timestamp(duration))
+
+    for i, line_num in enumerate(tqdm(lines_to_process)):
+        _, start_time, end_time = lrc_data[i]
+        # Word offsets are measured from the start of the clip, which splitSong
+        # pads by 0.1 s, so use the recorded clip start rather than the LRC time.
+        clip_start = _read_segment_start(segments_dir, line_num, start_time)
+        result = process_line(line_num, clip_start)
+        ttml_generator.add_lyrics(
+            begin=timestamp(start_time), end=timestamp(end_time), agent="v1", itunes_key=f"L{i+1}",
+            words=result
+        )
+
+    ttml_generator.save(output_ttml)
+    
+if __name__ == "__main__":
+    # Script entry point: align the default files under ./data.
+    align(
+        audio_file="./data/1.flac",
+        lrc_file="./data/1.lrc",
+        output_ttml="./data/output.ttml",
+        segments_dir="./temp/lines",
+    )
--- a/mmsAlignment/splitSong.py
+++ b/mmsAlignment/splitSong.py
@ -0,0 +1,58 @@
+from pydub import AudioSegment
+from utils.cleanTempDir import cleanTempDir
+import re
+
+def parse_lrc(lrc_file):
+    """Parse an LRC file into ``(timestamp, lyric)`` tuples.
+
+    Lines of the form ``[mm:ss.xx]text`` are read; other lines are ignored.
+    Spaces are removed from the lyric, and lines with an empty lyric are kept.
+
+    :param lrc_file: path to the UTF-8 encoded LRC file
+    :return: list of ``(timestamp_in_seconds, lyric)`` in file order
+    """
+    line_pattern = re.compile(r'\[(\d+):(\d+\.\d+)\](.*)')
+
+    entries = []
+    with open(lrc_file, 'r', encoding='utf-8') as handle:
+        for raw_line in handle:
+            found = line_pattern.match(raw_line)
+            if found is None:
+                continue
+            mins, secs, body = found.groups()
+            text = body.strip().replace(" ", "")
+            entries.append((int(mins) * 60 + float(secs), text))
+
+    return entries
+
+def split_audio_by_lrc(audio_file, lrc_data):
+    """Cut the audio into one WAV clip per non-empty lyric line.
+
+    ``./temp/lines`` is emptied first. For entry ``i`` with a non-empty lyric,
+    three files named ``line-<i+1>`` are written there: ``.wav`` (the clip),
+    ``.txt`` (the lyric) and ``.time`` (``"<start>,<end>"`` of the clip in
+    seconds). A clip runs from its own timestamp to the next entry's timestamp
+    (or the end of the audio), widened by 0.1 s on both sides and clamped to
+    the audio bounds.
+
+    :param audio_file: path of the audio file to split
+    :param lrc_data: list of ``(timestamp_in_seconds, lyric)`` tuples
+    """
+    audio = AudioSegment.from_file(audio_file)
+    cleanTempDir("./temp/lines")
+
+    total_seconds = len(audio) / 1000  # len() of the segment is in milliseconds
+    last_index = len(lrc_data) - 1
+
+    for i, (line_start, lyric) in enumerate(lrc_data):
+        # Empty lyrics only mark the end of the previous line.
+        if not lyric.strip():
+            continue
+
+        # The last lyric line runs to the end of the audio.
+        line_end = lrc_data[i + 1][0] if i < last_index else total_seconds
+
+        # Pad by 0.1 s on each side without leaving the audio.
+        clip_start = max(0, line_start - 0.1)
+        clip_end = min(total_seconds, line_end + 0.1)
+        segment = audio[clip_start * 1000:clip_end * 1000]
+
+        output_file = f"./temp/lines/line-{i+1}.wav"
+        segment.export(output_file, format="wav")
+        with open(f"./temp/lines/line-{i+1}.txt", "w") as script_out:
+            script_out.write(lyric)
+        with open(f"./temp/lines/line-{i+1}.time", "w") as time_out:
+            time_out.write(f"{clip_start},{clip_end}")
+        print(f"Saved {output_file}")
+
+if __name__ == "__main__":
+    # Script entry point: parse the default LRC file, then split the default
+    # audio file into per-line clips.
+    split_audio_by_lrc("./data/1.flac", parse_lrc("./data/1.lrc"))
--- a/utils/audio.py
+++ b/utils/audio.py
@ -0,0 +1,16 @@
+from pydub import AudioSegment
+
+def get_audio_duration(file_path):
+    """
+    Read an audio file and return its duration in seconds.
+
+    Any error while loading the file is printed and swallowed.
+
+    :param file_path: path of the audio file
+    :return: duration in seconds, or ``None`` if the file could not be read
+    """
+    try:
+        # len() of the loaded segment is in milliseconds.
+        return len(AudioSegment.from_file(file_path)) / 1000.0
+    except Exception as e:
+        print(f"Error reading audio file: {e}")
+    return None
--- a/utils/cleanTempDir.py
+++ b/utils/cleanTempDir.py
@ -0,0 +1,6 @@
+import os
+import shutil
+
+def cleanTempDir(dir_path):
+    """Empty ``dir_path`` by deleting it and creating it again.
+
+    Errors while deleting (for example, the directory not existing yet) are
+    ignored, so afterwards the directory exists.
+
+    :param dir_path: directory to reset
+    """
+    shutil.rmtree(path=dir_path, ignore_errors=True)
+    os.makedirs(name=dir_path, exist_ok=True)
--- a/utils/ttml.py
+++ b/utils/ttml.py
@ -0,0 +1,57 @@
+import xml.etree.ElementTree as ET
+
+class TTMLGenerator:
+    """Build a TTML lyrics document: ``tt`` > ``head``/``body`` > ``div`` > ``p`` > ``span``."""
+
+    def __init__(self, duration, xmlns="http://www.w3.org/ns/ttml", xmlns_ttm="http://www.w3.org/ns/ttml#metadata", xmlns_amll="http://www.example.com/ns/amll", xmlns_itunes="http://music.apple.com/lyric-ttml-internal"):
+        """Create the document skeleton.
+
+        :param duration: value of the ``dur`` attribute on ``body``
+        :param xmlns: default namespace URI written on the root
+        :param xmlns_ttm: URI bound to the ``ttm`` prefix
+        :param xmlns_amll: URI bound to the ``amll`` prefix
+        :param xmlns_itunes: URI bound to the ``itunes`` prefix
+        """
+        # The namespace declarations are written as literal attributes.
+        root_attributes = {
+            "xmlns": xmlns,
+            "xmlns:ttm": xmlns_ttm,
+            "xmlns:amll": xmlns_amll,
+            "xmlns:itunes": xmlns_itunes,
+        }
+        self.tt = ET.Element("tt", root_attributes)
+        self.head = ET.SubElement(self.tt, "head")
+        self.metadata = ET.SubElement(self.head, "metadata")
+        self.body = ET.SubElement(self.tt, "body", {"dur": duration})
+        self.div = ET.SubElement(self.body, "div")
+
+    def add_lyrics(self, begin, end, agent, itunes_key, words):
+        """Append one lyric line (``p``) with one ``span`` per word.
+
+        :param begin: ``begin`` attribute of the line
+        :param end: ``end`` attribute of the line
+        :param agent: value of ``ttm:agent``
+        :param itunes_key: value of ``itunes:key``
+        :param words: iterable of ``(text, begin, end)`` triples
+        """
+        paragraph_attributes = {
+            "begin": begin,
+            "end": end,
+            "ttm:agent": agent,
+            "itunes:key": itunes_key,
+        }
+        paragraph = ET.SubElement(self.div, "p", paragraph_attributes)
+        for text, word_begin, word_end in words:
+            word_span = ET.SubElement(paragraph, "span", {"begin": word_begin, "end": word_end})
+            word_span.text = text
+
+    def save(self, filename):
+        """Write the document to ``filename`` as UTF-8 with an XML declaration."""
+        ET.ElementTree(self.tt).write(filename, encoding="utf-8", xml_declaration=True)
+
+def extract_lrc_from_ttml(ttml_file):
+    """Convert a TTML lyrics file into line-level LRC text.
+
+    Every ``p`` element yields ``[begin] text`` (the text of its direct
+    ``span`` children joined together) followed by a bare ``[end]`` line.
+    Paragraphs without a ``begin`` attribute are skipped; a missing ``end``
+    only omits the end-marker line.
+
+    :param ttml_file: path of the TTML file to read
+    :return: the LRC lines joined with newlines
+    """
+    def format_time(ttml_time):
+        # LRC has no hour field: fold "HH:MM:SS.mmm" into "MM:SS.mmm" so that a
+        # non-zero hour is carried into the minutes instead of being dropped.
+        parts = ttml_time.split(":")
+        if len(parts) != 3:
+            return ttml_time
+        hours, minutes, rest = parts
+        try:
+            total_minutes = int(hours) * 60 + int(minutes)
+        except ValueError:
+            # Unexpected layout: fall back to stripping the leading "HH:".
+            return ttml_time[3:]
+        return f"{total_minutes:02d}:{rest}"
+
+    tree = ET.parse(ttml_file)
+    root = tree.getroot()
+    namespace = {"": "http://www.w3.org/ns/ttml", "ttm": "http://www.w3.org/ns/ttml#metadata"}
+
+    lrc_lines = []
+
+    for p in root.findall(".//p", namespace):
+        begin = p.attrib.get("begin")
+        if begin is None:
+            continue
+
+        text_content = "".join(span.text or "" for span in p.findall("span", namespace))
+        lrc_lines.append(f"[{format_time(begin)}] {text_content}")
+
+        end = p.attrib.get("end")
+        if end is not None:
+            # The end time goes on its own line, without text.
+            lrc_lines.append(f"[{format_time(end)}]")
+
+    return "\n".join(lrc_lines)
--- a/whisperAlignment/align2srt.py
+++ b/whisperAlignment/align2srt.py
@ -0,0 +1,6 @@
+import stable_whisper
+
+# Loaded models by name, so repeated align2srt() calls reuse one instance.
+_MODEL_CACHE = {}
+
+
+def _get_model(name):
+    """Return the stable_whisper model ``name``, loading it on first use only."""
+    if name not in _MODEL_CACHE:
+        _MODEL_CACHE[name] = stable_whisper.load_model(name)
+    return _MODEL_CACHE[name]
+
+
+def align2srt(lyrics, audio_path, output_path):
+    """Align known lyrics to an audio file and write a word-level SRT file.
+
+    :param lyrics: lyric text to align
+    :param audio_path: path of the audio file
+    :param output_path: path of the SRT file to write
+    """
+    # The caller invokes this once per segment; loading 'large-v3' each time
+    # would repeat the model load for every segment.
+    model = _get_model('large-v3')
+    result = model.align(audio_path, lyrics, language="Chinese", regroup=False)
+    result.to_srt_vtt(output_path, segment_level=False)
--- a/whisperAlignment/alignWithGroup.py
+++ b/whisperAlignment/alignWithGroup.py
@ -0,0 +1,47 @@
+import os
+from whisperAlignment.splitGroups import split_audio_and_text
+from whisperAlignment.align2srt import align2srt
+from whisperAlignment.srt2lrc import srt2lrc
+from utils.cleanTempDir import cleanTempDir
+
+def alignWithGroup(segments_file, audio_file, lyrics_file, output_file):
+    """Align lyrics group by group and merge the results into one LRC file.
+
+    The audio and lyrics are split into segments under ``./temp/segments``
+    (which is emptied first). Each segment is aligned to an SRT file and
+    converted to LRC with the segment's start offset, and the per-segment LRC
+    files are concatenated in order into ``output_file``.
+
+    :param segments_file: mapping file describing the groups
+    :param audio_file: path of the full audio file
+    :param lyrics_file: path of the full lyrics text file
+    :param output_file: path of the combined LRC file to write
+    """
+    segments_dir = './temp/segments'
+
+    # Empty the working directory (this also ensures it exists).
+    cleanTempDir(segments_dir)
+
+    split_audio_and_text(segments_file, audio_file, lyrics_file, segments_dir)
+
+    # One ".txt" file is written per segment, so counting them gives the
+    # number of segments.
+    segment_count = sum(1 for name in os.listdir(segments_dir) if name.endswith('.txt'))
+
+    lrc_paths = []
+    for i in range(1, segment_count + 1):
+        stem = f"{segments_dir}/segment_{i}"
+        segment_srt = f"{stem}.srt"
+        segment_lrc = f"{stem}.lrc"
+
+        with open(f"{stem}.txt", 'r') as f:
+            lyrics = f.read()
+        align2srt(lyrics, f"{stem}.mp3", segment_srt)
+
+        # The ".start" file holds the segment's offset in seconds.
+        with open(f"{stem}.start", 'r') as f:
+            offset = float(f.read())
+        srt2lrc(lyrics, segment_srt, segment_lrc, offset)
+        lrc_paths.append(segment_lrc)
+
+    # The combined file is read back as UTF-8 by the later pipeline stages, so
+    # write it as UTF-8 regardless of the platform default encoding.
+    with open(output_file, 'w', encoding='utf-8') as combined:
+        for lrc_path in lrc_paths:
+            with open(lrc_path, 'r') as lrc_file:
+                combined.write(lrc_file.read())
+                combined.write('\n')
+
+# Default input/output paths used when this module is run as a script.
+SEGMENTS_FILE = './data/1.group'
+AUDIO_FILE = './data/1.mp3'
+LYRICS_FILE = './data/1.txt'
+OUTPUT_FILE = './data/1.lrc'
+
+# Script entry point: align the default files and write the combined LRC.
+if __name__ == "__main__":
+    alignWithGroup(SEGMENTS_FILE, AUDIO_FILE, LYRICS_FILE, OUTPUT_FILE)
--- a/whisperAlignment/splitGroups.py
+++ b/whisperAlignment/splitGroups.py
@ -0,0 +1,65 @@
+from pydub import AudioSegment
+import os
+
+def parse_line(line):
+    """Split a mapping line such as ``'1-26|00:42-02:07'`` into its fields.
+
+    :param line: ``<first line>-<last line>|<start time>-<end time>``
+    :return: ``(start_line, end_line, start_time, end_time)`` with the line
+             numbers as ints and the times as the original strings
+    """
+    line_part, time_part = line.split('|')
+    first_line, last_line = (int(number) for number in line_part.split('-'))
+    begin, finish = time_part.split('-')
+    return first_line, last_line, begin, finish
+
+def time_to_milliseconds(time_str):
+    """Convert a ``HH:MM:SS`` or ``MM:SS`` time string to milliseconds.
+
+    The seconds field may carry a fraction (for example ``00:42.5``), so group
+    boundaries are not limited to whole seconds.
+
+    :param time_str: time string with two or three colon-separated fields
+    :return: the time in integer milliseconds
+    :raises ValueError: if the string does not have two or three fields, or a
+                        field is not numeric
+    """
+    parts = time_str.split(':')
+    if len(parts) == 2:
+        hours = '0'
+        minutes, seconds = parts
+    elif len(parts) == 3:
+        hours, minutes, seconds = parts
+    else:
+        raise ValueError("Invalid time format")
+
+    whole_seconds = int(hours) * 3600 + int(minutes) * 60
+    # Round the (possibly fractional) seconds once to avoid float truncation.
+    return whole_seconds * 1000 + int(round(float(seconds) * 1000))
+
+def split_audio_and_text(mapping_file, audio_file, text_file, output_dir):
+    """Cut the audio and the lyrics into matching segments.
+
+    Each non-blank line of ``mapping_file`` names a 1-based, inclusive range of
+    (non-empty) lyric lines and a time range. For the n-th mapping, three files
+    are written to ``output_dir``: ``segment_<n>.txt`` (the lyric lines),
+    ``segment_<n>.mp3`` (the audio slice) and ``segment_<n>.start`` (the start
+    of the slice in seconds).
+
+    :param mapping_file: path of the group mapping file
+    :param audio_file: path of the audio file to slice
+    :param text_file: path of the lyrics text file
+    :param output_dir: directory receiving the segment files (created if needed)
+    """
+    with open(mapping_file, 'r') as mapping_in:
+        mappings = [parse_line(raw.strip()) for raw in mapping_in if raw.strip()]
+
+    audio = AudioSegment.from_file(audio_file)
+
+    # Blank lyric lines are dropped before line numbers are applied.
+    with open(text_file, 'r') as lyrics_in:
+        text_lines = [raw for raw in lyrics_in if raw.strip()]
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    for number, (start_line, end_line, start_time, end_time) in enumerate(mappings, start=1):
+        start_ms = time_to_milliseconds(start_time)
+        end_ms = time_to_milliseconds(end_time)
+
+        # Lyric lines of this group (1-based, inclusive range).
+        text_output_path = os.path.join(output_dir, f'segment_{number}.txt')
+        with open(text_output_path, 'w') as text_out:
+            text_out.writelines(text_lines[start_line - 1:end_line])
+
+        # Audio slice of this group.
+        audio_output_path = os.path.join(output_dir, f'segment_{number}.mp3')
+        audio[start_ms:end_ms].export(audio_output_path, format='mp3')
+
+        # Start of the slice in seconds, used later as the time offset.
+        start_time_output_path = os.path.join(output_dir, f'segment_{number}.start')
+        with open(start_time_output_path, 'w') as start_out:
+            start_out.write(str(start_ms / 1000))
+
+        print(f"Saved segment {number}: {text_output_path}, {audio_output_path}")
--- a/whisperAlignment/srt2lrc.py
+++ b/whisperAlignment/srt2lrc.py
@ -0,0 +1,48 @@
+import pysrt
+
+def parseTime(object):
+    """Convert a time object to seconds.
+
+    :param object: value exposing ``hours``, ``minutes``, ``seconds`` and
+                   ``milliseconds`` attributes
+    :return: the total time in seconds
+    """
+    whole_seconds = object.hours * 3600 + object.minutes * 60 + object.seconds
+    return whole_seconds + object.milliseconds / 1000
+
+def serializeTime(time):
+    """Format a non-negative time in seconds as an LRC ``MM:SS.mmm`` string.
+
+    :param time: time in seconds (may be fractional)
+    :return: the formatted time; minutes are not capped at 59
+    """
+    # Round once to integer milliseconds. Truncating the float fraction loses a
+    # millisecond on values such as 2.3 (stored as 2.2999...).
+    total_ms = int(round(time * 1000))
+    minutes, remainder = divmod(total_ms, 60 * 1000)
+    seconds, milliseconds = divmod(remainder, 1000)
+    return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
+
+def srt2lrc(lyrics, srt_file, lrc_file, time_offset=0):
+    """Regroup a word-level SRT file into lyric lines and write them as LRC.
+
+    Subtitle texts are concatenated until they equal the next expected lyric
+    line. That line is emitted with the start time of its first word, and only
+    the last matched line is followed by a bare end-time line. If no line
+    matches, an empty file is written.
+
+    :param lyrics: lyric text, one line per expected LRC line
+    :param srt_file: path of the UTF-8 word-level SRT file
+    :param lrc_file: path of the LRC file to write
+    :param time_offset: seconds added to every time
+    """
+    subs = pysrt.open(srt_file, encoding='utf-8')
+
+    # Expected lines: surrounding whitespace and blank lines can never match
+    # the concatenated (stripped) words, so normalise them away.
+    lyrics_lines = [line.strip() for line in lyrics.splitlines() if line.strip()]
+    next_line = 0  # index of the lyric line still to be matched
+
+    aligned = []  # (start_seconds, end_seconds, text) per matched line
+    current_line = ""
+    start_time = None
+
+    for sub in subs:
+        word = sub.text.strip()
+        if not current_line:
+            start_time = parseTime(sub.start)  # first word starts the line
+
+        current_line += word
+
+        # The accumulated words equal the next expected lyric line.
+        if next_line < len(lyrics_lines) and current_line == lyrics_lines[next_line]:
+            aligned.append((start_time, parseTime(sub.end), current_line))
+            next_line += 1
+            current_line = ""
+            start_time = None
+
+    result = [f"[{serializeTime(start + time_offset)}] {text}" for start, _, text in aligned]
+    # Only the last line keeps its end time; guard against an empty alignment.
+    if aligned:
+        result.append(f"[{serializeTime(aligned[-1][1] + time_offset)}]")
+
+    with open(lrc_file, 'w') as f:
+        f.write('\n'.join(result))