add: AI lyrics aligning

2024-12-27 19:46:22 +08:00 · 2024-12-27 19:46:22 +08:00 · 2698a75277
commit 2698a75277
12 changed files with 657 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,178 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # C extensions
 *.so
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # PyInstaller
 #  Usually these files are written by a python script from a template
 #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 *.manifest
 *.spec
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 .coverage.*
 .cache
 nosetests.xml
 coverage.xml
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 cover/
 # Translations
 *.mo
 *.pot
 # Django stuff:
 *.log
 local_settings.py
 db.sqlite3
 db.sqlite3-journal
 # Flask stuff:
 instance/
 .webassets-cache
 # Scrapy stuff:
 .scrapy
 # Sphinx documentation
 docs/_build/
 # PyBuilder
 .pybuilder/
 target/
 # Jupyter Notebook
 .ipynb_checkpoints
 # IPython
 profile_default/
 ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
 # .python-version
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
 #Pipfile.lock
 # UV
 #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #uv.lock
 # poetry
 #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 #   This is especially recommended for binary packages to ensure reproducibility, and is more
 #   commonly ignored for libraries.
 #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
 #poetry.lock
 # pdm
 #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
 #pdm.lock
 #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
 #   in version control.
 #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
 .pdm.toml
 .pdm-python
 .pdm-build/
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 __pypackages__/
 # Celery stuff
 celerybeat-schedule
 celerybeat.pid
 # SageMath parsed files
 *.sage.py
 # Environments
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Spyder project settings
 .spyderproject
 .spyproject
 # Rope project settings
 .ropeproject
 # mkdocs documentation
 /site
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
 # Pyre type checker
 .pyre/
 # pytype static type analyzer
 .pytype/
 # Cython debug symbols
 cython_debug/
 # PyCharm
 #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 # PyPI configuration file
 .pypirc
 # Project specific
 /temp
 /data
 .DS_Store
--- a/align-pipeline.md
+++ b/align-pipeline.md
@ -0,0 +1,3 @@
 1. prepare `1.mp3`, `1.txt`, `1.group` in `./data`
 2. `whisperAlignment/alignWithGroup`
 3. `mmsAlignment`
--- a/mmsAlignment/align2LRC.py
+++ b/mmsAlignment/align2LRC.py
@ -0,0 +1,6 @@
 from utils.ttml import extract_lrc_from_ttml
 # Convert the word-level TTML file back into a line-level LRC file.
 lrc_output = extract_lrc_from_ttml('./data/1.ttml')
 # Write UTF-8 explicitly: the lyrics are non-ASCII and the locale default
 # encoding is not guaranteed to be able to represent them.
 with open('./data/1-final.lrc', 'w', encoding='utf-8') as f:
    f.write(lrc_output)
--- a/mmsAlignment/alignWithMMS.py
+++ b/mmsAlignment/alignWithMMS.py
@ -0,0 +1,167 @@
 import os
 import re
 import torch
 import torchaudio
 from typing import List
 from pypinyin import lazy_pinyin
 from pypinyin_dict.phrase_pinyin_data import cc_cedict
 from torchaudio.transforms import Resample
 from tqdm import tqdm
 from utils.ttml import TTMLGenerator
 from utils.audio import get_audio_duration
 # Initialise the device, model, tokenizer and aligner once, at import time.
 # These module-level names are used by compute_alignments() below.
 # Use CUDA when torch reports it as available, otherwise fall back to the CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # torchaudio's MMS_FA pipeline bundle supplies the model, tokenizer and aligner.
 bundle = torchaudio.pipelines.MMS_FA
 model = bundle.get_model().to(device)
 tokenizer = bundle.get_tokenizer()
 aligner = bundle.get_aligner()
 # Presumably registers the CC-CEDICT phrase data with pypinyin so that
 # lazy_pinyin() uses it -- TODO confirm against pypinyin_dict docs.
 cc_cedict.load()
 def timestamp(seconds: float) -> str:
    """Format a duration in seconds as a TTML ``HH:MM:SS.mmm`` timestamp.

    The value is rounded to the nearest millisecond before it is split into
    fields. Truncating the fractional part instead drops a millisecond
    whenever the float sits just below the intended value (1.001 is stored
    as 1.000999..., which would be rendered as ``.000``).

    :param seconds: duration in seconds (expected to be non-negative)
    :return: zero-padded ``HH:MM:SS.mmm`` string
    """
    total_ms = int(round(seconds * 1000))
    whole_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}.{milliseconds:03}"
 def compute_alignments(waveform: torch.Tensor, transcript: List[str]):
    """Run the module-level model and aligner on one clip.

    :param waveform: audio tensor passed to the model after being moved to ``device``
    :param transcript: list of words handed to the module-level tokenizer
    :return: ``(emission, token_spans)`` -- the raw model output and the
        aligner's result for the first batch item
    """
    tokens = tokenizer(transcript)
    # No gradients are needed for alignment.
    with torch.inference_mode():
        emission, _ = model(waveform.to(device))
        token_spans = aligner(emission[0], tokens)
    return emission, token_spans
 def parse_lrc(lrc_file, audio_len):
    """Parse an LRC file into ``(lyric, start, end)`` tuples.

    Only lines of the form ``[mm:ss.xx]text`` are considered. Spaces are
    removed from the lyric text. A line ends where the next timestamped line
    (empty or not) begins; the last line ends at ``audio_len``. Lines whose
    text is empty act only as end markers and are not returned.

    :param lrc_file: path of the UTF-8 encoded LRC file
    :param audio_len: end time, in seconds, used for the final line
    :return: list of ``(lyric, start_seconds, end_seconds)``
    """
    line_pattern = re.compile(r'\[(\d+):(\d+\.\d+)\](.*)')
    entries = []
    with open(lrc_file, 'r', encoding='utf-8') as handle:
        for raw in handle:
            found = line_pattern.match(raw)
            if found is None:
                continue
            mins, secs, body = found.groups()
            entries.append((body.strip().replace(" ", ""), int(mins) * 60 + float(secs)))

    timed = []
    last_pos = len(entries) - 1
    for pos, (lyric, begin) in enumerate(entries):
        # Empty lines only mark where the previous line stops.
        if lyric.strip() == "":
            continue
        finish = entries[pos + 1][1] if pos < last_pos else audio_len
        timed.append((lyric, begin, finish))
    return timed
 def extract_numbers_from_files(directory):
    """
    Collect the numeric part of every ``line-<n>.wav`` name in a directory.

    :param directory: directory to list
    :return: list of the numbers as ints, in ``os.listdir`` order, or
        ``None`` (after printing the error) if anything goes wrong
    """
    wav_name = re.compile(r'line-(\d+)\.wav')
    try:
        hits = (wav_name.match(entry) for entry in os.listdir(directory))
        return [int(hit.group(1)) for hit in hits if hit]
    except Exception as e:
        print(f"Error reading directory: {e}")
        return None
 def process_line(line_idx, start_time):
    """Align one lyric line against its audio clip.

    Reads ``./temp/lines/line-<line_idx>.txt`` and ``.wav``, converts the
    text to pinyin, runs the aligner and converts the resulting frame spans
    to absolute timestamps by adding ``start_time``.

    NOTE(review): ``text[i]`` pairs the i-th aligned token with the i-th
    character of the text, which presumably only holds when every character
    yields exactly one pinyin token -- confirm for mixed-script lyrics.
    NOTE(review): splitSong.py starts each clip up to 0.1 s before the LRC
    timestamp; if ``start_time`` is the unpadded LRC time, the results may
    be shifted by that padding -- confirm.

    :param line_idx: number used in the ``line-<n>`` file names
    :param start_time: time in seconds added to every offset inside the clip
    :return: list of ``(character, begin_timestamp, end_timestamp)``
    """
    with open(f"./temp/lines/line-{line_idx}.txt", "r") as f:
        text = f.read()
    audio, native_rate = torchaudio.load(f"./temp/lines/line-{line_idx}.wav")
    # Keep the first row only and resample it to 16 kHz.
    mono_16k = Resample(orig_freq=native_rate, new_freq=16000)(audio[0:1])

    transcript = " ".join(lazy_pinyin(text)).split()
    emission, token_spans = compute_alignments(mono_16k, transcript)
    # Number of audio samples covered by one emission frame.
    samples_per_frame = mono_16k.size(1) / emission.size(1)

    def to_seconds(frame):
        return start_time + int(samples_per_frame * frame) / 16000

    starts = [to_seconds(spans[0].start) for spans in token_spans]
    ends = [to_seconds(spans[-1].end) for spans in token_spans]
    # Every word except the last is stretched to end where the next begins.
    ends[:-1] = starts[1:]

    return [
        (text[pos], timestamp(begin), timestamp(finish))
        for pos, (begin, finish) in enumerate(zip(starts, ends))
    ]
 def align(audio_file: str, lrc_file: str, output_ttml: str, segments_dir: str = "./temp/lines"):
    """
    Align every split lyric line with its audio clip and write a TTML file.

    The k-th clip number found in ``segments_dir`` (in ascending order) is
    paired with the k-th non-empty line of the LRC file.

    :param audio_file: path of the full audio track (its duration is read here)
    :param lrc_file: path of the line-level LRC file
    :param output_ttml: path of the TTML file to write
    :param segments_dir: directory scanned for ``line-<n>.wav`` clips,
        "./temp/lines" by default
    """
    duration = get_audio_duration(audio_file)
    lrc_data = parse_lrc(lrc_file, duration)
    lines_to_process = sorted(extract_numbers_from_files(segments_dir))
    ttml_generator = TTMLGenerator(duration=timestamp(duration))

    for i, line_num in enumerate(tqdm(lines_to_process)):
        _, start_time, end_time = lrc_data[i]
        words = process_line(line_num, start_time)
        ttml_generator.add_lyrics(
            begin=timestamp(start_time),
            end=timestamp(end_time),
            agent="v1",
            itunes_key=f"L{i+1}",
            words=words,
        )

    # Write the finished document to disk.
    ttml_generator.save(output_ttml)
 # Script entry point: align the fixed sample files under ./data using the
 # clips found in ./temp/lines.
 if __name__ == "__main__":
    align("./data/1.flac", "./data/1.lrc", "./data/output.ttml", "./temp/lines")
--- a/mmsAlignment/splitSong.py
+++ b/mmsAlignment/splitSong.py
@ -0,0 +1,58 @@
 from pydub import AudioSegment
 from utils.cleanTempDir import cleanTempDir
 import re
 def parse_lrc(lrc_file):
    """Parse an LRC file into a list of ``(start_seconds, lyric)`` tuples.

    Only lines of the form ``[mm:ss.xx]text`` are kept; spaces are removed
    from the lyric text. Lines with an empty lyric are kept as well.

    :param lrc_file: path of the UTF-8 encoded LRC file
    :return: list of ``(start_seconds, lyric)`` in file order
    """
    stamp = re.compile(r'\[(\d+):(\d+\.\d+)\](.*)')
    with open(lrc_file, 'r', encoding='utf-8') as handle:
        hits = [stamp.match(raw) for raw in handle.readlines()]
    return [
        (int(hit.group(1)) * 60 + float(hit.group(2)), hit.group(3).strip().replace(" ", ""))
        for hit in hits
        if hit
    ]
 def split_audio_by_lrc(audio_file, lrc_data):
    """Cut the audio into one WAV clip per non-empty lyric line.

    ``./temp/lines`` is emptied first. For the line at index ``i`` three
    files named ``line-<i+1>`` are written there: the ``.wav`` clip, a
    ``.txt`` file with the lyric and a ``.time`` file holding the clip's
    ``start,end`` in seconds. Each clip is widened by 0.1 s on both sides,
    clamped to the bounds of the audio.

    :param audio_file: path of the audio file to split
    :param lrc_data: list of ``(start_seconds, lyric)`` tuples
    """
    audio = AudioSegment.from_file(audio_file)
    cleanTempDir("./temp/lines")
    total_seconds = len(audio) / 1000
    last_index = len(lrc_data) - 1

    for i, (line_start, lyric) in enumerate(lrc_data):
        # Empty lines produce no clip; they only bound the previous line.
        if lyric.strip() == "":
            continue
        # The last lyric line runs to the end of the audio.
        line_end = lrc_data[i + 1][0] if i < last_index else total_seconds
        clip_start = max(0, line_start - 0.1)
        clip_end = min(len(audio) / 1000, line_end + 0.1)
        segment = audio[clip_start * 1000:clip_end * 1000]

        output_file = f"./temp/lines/line-{i+1}.wav"
        output_script = f"./temp/lines/line-{i+1}.txt"
        output_time = f"./temp/lines/line-{i+1}.time"
        segment.export(output_file, format="wav")
        with open(output_script, "w") as script_out:
            script_out.write(lyric)
        with open(output_time, "w") as time_out:
            time_out.write(str(clip_start)+","+str(clip_end))
        print(f"Saved {output_file}")
 # Script entry point: split the fixed sample track by its LRC timestamps.
 if __name__ == "__main__":
    lrc_file = "./data/1.lrc"  # path of the LRC file
    audio_file = "./data/1.flac"  # path of the audio file
    lrc_data = parse_lrc(lrc_file)
    split_audio_by_lrc(audio_file, lrc_data)
--- a/utils/audio.py
+++ b/utils/audio.py
@ -0,0 +1,16 @@
 from pydub import AudioSegment
 def get_audio_duration(file_path):
    """
    Return the duration of an audio file in seconds.

    :param file_path: path of the audio file
    :return: duration in seconds as a float, or ``None`` (after printing
        the error) if the file could not be read
    """
    try:
        # len() of an AudioSegment is divided by 1000.0 to get seconds.
        return len(AudioSegment.from_file(file_path)) / 1000.0
    except Exception as e:
        print(f"Error reading audio file: {e}")
        return None
--- a/utils/cleanTempDir.py
+++ b/utils/cleanTempDir.py
@ -0,0 +1,6 @@
 import os
 import shutil
 def cleanTempDir(dir_path):
    """Remove ``dir_path`` and recreate it.

    :param dir_path: directory to reset
    """
    # Errors while deleting are ignored (ignore_errors=True).
    shutil.rmtree(dir_path, ignore_errors=True)
    # exist_ok=True: no error if the directory is still present.
    os.makedirs(dir_path, exist_ok=True)
--- a/utils/ttml.py
+++ b/utils/ttml.py
@ -0,0 +1,57 @@
 import xml.etree.ElementTree as ET
 class TTMLGenerator:
    """Build a TTML lyrics document one line at a time.

    The tree is ``tt > (head > metadata, body > div)``; every call to
    :meth:`add_lyrics` appends one ``p`` element with ``span`` children to
    the ``div``.
    """

    def __init__(
        self,
        duration,
        xmlns="http://www.w3.org/ns/ttml",
        xmlns_ttm="http://www.w3.org/ns/ttml#metadata",
        xmlns_amll="http://www.example.com/ns/amll",
        xmlns_itunes="http://music.apple.com/lyric-ttml-internal",
    ):
        """Create the document skeleton.

        :param duration: value of the ``dur`` attribute on ``body``
        :param xmlns: default namespace URI
        :param xmlns_ttm: URI bound to the ``ttm`` prefix
        :param xmlns_amll: URI bound to the ``amll`` prefix
        :param xmlns_itunes: URI bound to the ``itunes`` prefix
        """
        # The namespace declarations are written as plain attributes.
        root_attrs = {
            "xmlns": xmlns,
            "xmlns:ttm": xmlns_ttm,
            "xmlns:amll": xmlns_amll,
            "xmlns:itunes": xmlns_itunes,
        }
        self.tt = ET.Element("tt", attrib=root_attrs)
        self.head = ET.SubElement(self.tt, "head")
        self.metadata = ET.SubElement(self.head, "metadata")
        self.body = ET.SubElement(self.tt, "body", attrib={"dur": duration})
        self.div = ET.SubElement(self.body, "div")

    def add_lyrics(self, begin, end, agent, itunes_key, words):
        """Append one lyric line.

        :param begin: ``begin`` attribute of the line
        :param end: ``end`` attribute of the line
        :param agent: value of ``ttm:agent``
        :param itunes_key: value of ``itunes:key``
        :param words: iterable of ``(text, begin, end)`` triples, one per span
        """
        line_attrs = {
            "begin": begin,
            "end": end,
            "ttm:agent": agent,
            "itunes:key": itunes_key,
        }
        paragraph = ET.SubElement(self.div, "p", attrib=line_attrs)
        for text, span_begin, span_end in words:
            piece = ET.SubElement(paragraph, "span", attrib={"begin": span_begin, "end": span_end})
            piece.text = text

    def save(self, filename):
        """Write the document to ``filename`` as UTF-8 with an XML declaration."""
        ET.ElementTree(self.tt).write(filename, encoding="utf-8", xml_declaration=True)
 def extract_lrc_from_ttml(ttml_file):
    """Convert a TTML lyrics file into line-level LRC text.

    For every ``p`` element two LRC lines are produced: ``[begin] text``,
    where text is the concatenation of the element's ``span`` children, and
    a bare ``[end]`` line.

    :param ttml_file: path of the TTML file to read
    :return: the LRC text, lines joined with ``\\n``
    """
    def format_time(ttml_time):
        # LRC timestamps have no hours field, so fold the hours into the
        # minutes instead of dropping them: "01:02:03.000" -> "62:03.000".
        fields = ttml_time.split(":") if isinstance(ttml_time, str) else []
        if len(fields) == 3 and fields[0].isdecimal() and fields[1].isdecimal():
            hours, minutes, seconds = fields
            return f"{int(hours) * 60 + int(minutes):02d}:{seconds}"
        # Anything that is not HH:MM:SS keeps the previous behaviour.
        return ttml_time[3:]

    root = ET.parse(ttml_file).getroot()
    namespace = {"": "http://www.w3.org/ns/ttml", "ttm": "http://www.w3.org/ns/ttml#metadata"}
    lrc_lines = []
    for p in root.findall(".//p", namespace):
        begin = p.attrib.get("begin")
        end = p.attrib.get("end")
        text_content = "".join(span.text or "" for span in p.findall("span", namespace))
        begin_time = format_time(begin)
        end_time = format_time(end)
        lrc_lines.append(f"[{begin_time}] {text_content}")
        lrc_lines.append(f"[{end_time}]")  # the end time goes on its own line
    return "\n".join(lrc_lines)
--- a/whisperAlignment/align2srt.py
+++ b/whisperAlignment/align2srt.py
@ -0,0 +1,6 @@
 import stable_whisper
 # Models already loaded in this process, keyed by model name.
 _LOADED_MODELS = {}


 def _load_model_once(name):
    """Return the stable_whisper model called ``name``, loading it on first use."""
    if name not in _LOADED_MODELS:
        _LOADED_MODELS[name] = stable_whisper.load_model(name)
    return _LOADED_MODELS[name]


 def align2srt(lyrics, audio_path, output_path):
    """Align known lyrics to an audio file and write the result as SRT.

    The 'large-v3' model is loaded once per process and reused, so callers
    that align many segments in a loop do not reload it for every segment.

    :param lyrics: the text to align
    :param audio_path: path of the audio file
    :param output_path: path of the subtitle file to write
    """
    model = _load_model_once('large-v3')
    result = model.align(audio_path, lyrics, language="Chinese", regroup=False)
    # segment_level=False: presumably leaves only word-level entries -- confirm.
    result.to_srt_vtt(output_path, segment_level=False)
--- a/whisperAlignment/alignWithGroup.py
+++ b/whisperAlignment/alignWithGroup.py
@ -0,0 +1,47 @@
 import os
 from whisperAlignment.splitGroups import split_audio_and_text
 from whisperAlignment.align2srt import align2srt
 from whisperAlignment.srt2lrc import srt2lrc
 from utils.cleanTempDir import cleanTempDir
 def alignWithGroup(segments_file, audio_file, lyrics_file, output_file):
    """Align lyrics group by group and merge the results into one LRC file.

    The audio and lyrics are split into ``./temp/segments/segment_<n>.*``
    files, each segment is aligned on its own, and the per-segment LRC files
    are concatenated in order, each followed by a newline.

    :param segments_file: path of the group mapping file
    :param audio_file: path of the full audio track
    :param lyrics_file: path of the full lyrics text
    :param output_file: path of the merged LRC file to write
    """
    # Clean temp/segments dir (ensure it exists)
    cleanTempDir('./temp/segments')

    # Split groups
    split_audio_and_text(segments_file, audio_file, lyrics_file, 'temp/segments')

    # One ".txt" file is written per segment, so counting them gives the
    # number of segments.
    nums = len([name for name in os.listdir('./temp/segments') if name.endswith('.txt')])

    segment_lrcs = []
    for i in range(1, nums + 1):
        stem = f"./temp/segments/segment_{i}"
        # Intermediate files are read with the default encoding because
        # that is how the sibling modules write them.
        with open(f"{stem}.txt", 'r') as f:
            lyrics = f.read()
        align2srt(lyrics, f"{stem}.mp3", f"{stem}.srt")
        with open(f"{stem}.start", 'r') as f:
            offset = float(f.read())
        srt2lrc(lyrics, f"{stem}.srt", f"{stem}.lrc", offset)
        segment_lrcs.append(f"{stem}.lrc")

    # Combine lrc files. The merged file is written as UTF-8 explicitly:
    # the LRC parsers in mmsAlignment open it with encoding='utf-8'.
    with open(output_file, 'w', encoding='utf-8') as f:
        for lrc in segment_lrcs:
            with open(lrc, 'r') as lrc_file:
                f.write(lrc_file.read())
                f.write('\n')
 # Fixed input/output paths used when this module is run as a script.
 SEGMENTS_FILE = './data/1.group'
 AUDIO_FILE = './data/1.mp3'
 LYRICS_FILE = './data/1.txt'
 OUTPUT_FILE = './data/1.lrc'
 # Script entry point: align the sample files and write ./data/1.lrc.
 if __name__ == "__main__":
    alignWithGroup(SEGMENTS_FILE, AUDIO_FILE, LYRICS_FILE, OUTPUT_FILE)
--- a/whisperAlignment/splitGroups.py
+++ b/whisperAlignment/splitGroups.py
@ -0,0 +1,65 @@
 from pydub import AudioSegment
 import os
 def parse_line(line):
    """Split a mapping line such as '1-26|00:42-02:07' into its four fields.

    :param line: ``<first>-<last>|<start>-<end>``
    :return: ``(first_line, last_line, start_time, end_time)``; the line
        numbers are ints, the times are left as strings
    """
    line_range, time_range = line.split('|')
    first, last = (int(number) for number in line_range.split('-'))
    begin, finish = time_range.split('-')
    return first, last, begin, finish
 def time_to_milliseconds(time_str):
    """Convert an ``HH:MM:SS`` or ``MM:SS`` string to milliseconds.

    :param time_str: colon-separated integer fields
    :return: the time in milliseconds as an int
    :raises ValueError: if a field is not an integer or the number of
        fields is not two or three
    """
    fields = [int(piece) for piece in time_str.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError("Invalid time format")
    # Fold the fields from the most significant one down: each step moves
    # the running total up one unit (hours -> minutes -> seconds).
    total_seconds = 0
    for value in fields:
        total_seconds = total_seconds * 60 + value
    return total_seconds * 1000
 def split_audio_and_text(mapping_file, audio_file, text_file, output_dir):
    """Cut the audio and the lyrics into matching per-group segments.

    For the k-th mapping line (1-based) three files are written to
    ``output_dir``: ``segment_<k>.txt`` with the selected lyric lines,
    ``segment_<k>.mp3`` with the audio slice and ``segment_<k>.start`` with
    the slice's start time in seconds. Line numbers in the mapping are
    1-based, inclusive, and count non-empty lyric lines only.

    :param mapping_file: file with one ``<first>-<last>|<start>-<end>`` entry per line
    :param audio_file: path of the full audio track
    :param text_file: path of the full lyrics text
    :param output_dir: directory receiving the segment files (created if missing)
    """
    # Parse every non-blank mapping line.
    mappings = []
    with open(mapping_file, 'r') as f:
        for raw in f:
            entry = raw.strip()
            if entry:
                mappings.append(parse_line(entry))

    audio = AudioSegment.from_file(audio_file)

    # Blank lyric lines are dropped before the mapping's line numbers apply.
    with open(text_file, 'r') as f:
        text_lines = [line for line in f.readlines() if line.strip()]

    os.makedirs(output_dir, exist_ok=True)

    for i, (start_line, end_line, start_time, end_time) in enumerate(mappings):
        text_segment = text_lines[start_line - 1:end_line]
        start_ms = time_to_milliseconds(start_time)
        end_ms = time_to_milliseconds(end_time)
        audio_segment = audio[start_ms:end_ms]

        text_output_path = os.path.join(output_dir, f'segment_{i + 1}.txt')
        audio_output_path = os.path.join(output_dir, f'segment_{i + 1}.mp3')
        start_time_output_path = os.path.join(output_dir, f'segment_{i + 1}.start')

        with open(text_output_path, 'w') as segment_text:
            segment_text.writelines(text_segment)
        audio_segment.export(audio_output_path, format='mp3')
        # The start offset is stored in seconds.
        with open(start_time_output_path, 'w') as start_time_file:
            start_time_file.write(str(start_ms / 1000))
        print(f"Saved segment {i + 1}: {text_output_path}, {audio_output_path}")
--- a/whisperAlignment/srt2lrc.py
+++ b/whisperAlignment/srt2lrc.py
@ -0,0 +1,48 @@
 import pysrt
 def parseTime(object):
    """Convert an object with hours/minutes/seconds/milliseconds to seconds.

    :param object: anything exposing ``hours``, ``minutes``, ``seconds`` and
        ``milliseconds`` attributes
    :return: the total number of seconds
    """
    whole_seconds = object.hours * 3600 + object.minutes * 60 + object.seconds
    return whole_seconds + object.milliseconds / 1000
 def serializeTime(time):
    """Format a time in seconds as an LRC ``MM:SS.mmm`` timestamp.

    The value is rounded to the nearest millisecond first. Truncating the
    fractional part instead loses a millisecond for values such as 54.345,
    which is stored slightly below 54.345 and would be rendered as ``.344``.
    Minutes are not wrapped at 60, since LRC has no hours field.

    :param time: time in seconds (expected to be non-negative)
    :return: zero-padded ``MM:SS.mmm`` string
    """
    total_ms = int(round(time * 1000))
    whole_seconds, milliseconds = divmod(total_ms, 1000)
    minutes, seconds = divmod(whole_seconds, 60)
    return f"{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
 def srt2lrc(lyrics, srt_file, lrc_file, time_offset=0):
    """Regroup SRT entries into lyric lines and write them as LRC.

    The text of consecutive SRT entries is concatenated until it equals the
    next expected lyric line; that line then gets the start time of its
    first entry. Only the last matched line is followed by a bare end-time
    line. If no line matches at all, an empty file is written instead of
    failing.

    :param lyrics: the lyric text, one line per expected LRC line
    :param srt_file: path of the UTF-8 SRT file to read
    :param lrc_file: path of the LRC file to write
    :param time_offset: seconds added to every timestamp
    """
    subs = pysrt.open(srt_file, encoding='utf-8')
    lyrics_lines = lyrics.splitlines()
    next_line = 0  # index of the lyric line currently being assembled

    matched = []  # (start_seconds, end_seconds, text) per completed line
    current_line = ""
    start_time = None
    for sub in subs:
        word = sub.text.strip()
        if not current_line:
            # First entry of a new line: remember where the line starts.
            start_time = parseTime(sub.start)
        current_line += word
        if next_line < len(lyrics_lines) and current_line == lyrics_lines[next_line]:
            matched.append((start_time, parseTime(sub.end), current_line))
            next_line += 1
            current_line = ""
            start_time = None

    result = [f"[{serializeTime(begin + time_offset)}] {text}" for begin, _, text in matched]
    if matched:
        # Only the final line keeps its end time.
        result.append(f"[{serializeTime(matched[-1][1] + time_offset)}]")

    with open(lrc_file, 'w') as f:
        f.write('\n'.join(result))