feat: parsec based lyric parser

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>

parent e4677f4117
commit 2341d13721
package.json
@@ -48,6 +48,7 @@
     "music-metadata-browser": "^2.5.10",
     "node-cache": "^5.1.2",
     "rollup-plugin-node-polyfills": "^0.2.1",
+    "typescript-parsec": "^0.3.4",
     "uuid": "^9.0.1"
   }
 }
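
The only dependency added is typescript-parsec, a typed parser-combinator library. As a quick orientation, here is a minimal sketch of its style (not part of this commit; the grammar and the names `lexer`/`mmss` are illustrative, and every import except `expectSingleResult` also appears in the new parser below):

```typescript
import { apply, buildLexer, expectEOF, expectSingleResult, seq, str, tok } from 'typescript-parsec';

// Tokenize into single digits and ':' (buildLexer takes [keep, regex, kind] rules).
const lexer = buildLexer([
  [true, /^\d/g, 'digit'],
  [true, /^:/g, ':']
]);

// 'm:ss' -> seconds: seq() runs parsers in order, apply() maps the result tuple.
const mmss = apply(
  seq(tok('digit'), str(':'), tok('digit'), tok('digit')),
  ([m, , s1, s2]) => Number(m.text) * 60 + Number(s1.text + s2.text)
);

// expectEOF() rejects parses with leftover tokens; expectSingleResult() unwraps.
expectSingleResult(expectEOF(mmss.parse(lexer.parse('1:05')))); // 65
```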

src/lib/lyrics/parser.ts
@@ -1,3 +1,20 @@
+import {
+  alt_sc,
+  apply,
+  buildLexer,
+  expectEOF,
+  fail,
+  kmid,
+  opt_sc,
+  rep,
+  rep_sc,
+  seq,
+  str,
+  tok,
+  type Parser,
+  type Token
+} from 'typescript-parsec';
+
 export interface ScriptItem {
   start: number;
   text: string;
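
A quick reference for the combinators imported above (paraphrased from typescript-parsec's documentation; not part of the commit):

```typescript
// alt_sc(a, b)      - ordered choice: return the first alternative that succeeds
// apply(p, f)       - run p, transform its result with f
// buildLexer        - build a tokenizer from [keep, regex, kind] rules
// expectEOF         - reject parse results that leave tokens unconsumed
// fail(msg)         - always fails; used below as a seed for reduce() chains
// kmid(l, m, r)     - parse all three, keep only the middle result
// opt_sc(p)         - optional p, yielding undefined when absent
// rep(p), rep_sc(p) - zero or more p; rep keeps backtracking candidates,
//                     rep_sc greedily keeps only the longest match
// seq(...ps)        - all parsers in order, yielding a tuple of results
// str(s), tok(k)    - match one token by exact text / by token kind
```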
@@ -28,22 +45,188 @@ interface IDTag {
   [key: string]: string;
 }
 
-export function splitLine(str: string) {
-  return str.split('\n').filter((str) => str.trim() !== '');
-}
-
-export function ExtractIDTags(lines: string[]) {
-  let result: IDTag = {};
-  const IDTagRegex = /^\[(\w*): (.*?)]$/;
-  let lastMatch = 0;
-  for (let i = 0; i < lines.length; i++) {
-    const line = lines[i];
-    const matchResult = line.trim().match(IDTagRegex);
-    if (matchResult && matchResult.length == 3) {
-      const tagName = matchResult[1];
-      const tagValue = matchResult[2];
-      result[tagName] = tagValue;
-    }
-  }
-  return result;
-}
+function convertTimeToMs({
+  mins,
+  secs,
+  decimals
+}: {
+  mins?: number | string;
+  secs?: number | string;
+  decimals?: string;
+}) {
+  let result = 0;
+  if (mins) {
+    result += Number(mins) * 60 * 1000;
+  }
+  if (secs) {
+    result += Number(secs) * 1000;
+  }
+  if (decimals) {
+    const denom = Math.pow(10, decimals.length);
+    result += Number(decimals) / (denom / 1000);
+  }
+  return result;
+}
+
+const digit = Array.from({ length: 10 }, (_, i) => apply(str(i.toString()), (_) => i)).reduce(
+  (acc, cur) => alt_sc(cur, acc),
+  fail('no alternatives')
+);
+const numStr = apply(rep_sc(digit), (r) => r.join(''));
+const num = apply(numStr, (r) => parseInt(r));
+const alpha = alt_sc(
+  Array.from({ length: 26 }, (_, i) =>
+    apply(str(String.fromCharCode('a'.charCodeAt(0) + i)), (_) => String.fromCharCode('a'.charCodeAt(0) + i))
+  ).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives')),
+  Array.from({ length: 26 }, (_, i) =>
+    apply(str(String.fromCharCode('A'.charCodeAt(0) + i)), (_) => String.fromCharCode('A'.charCodeAt(0) + i))
+  ).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives'))
+);
+
+const alphaStr = apply(rep(alpha), (r) => r.join(''));
+const spaces = rep_sc(str(' '));
+
+const unicodeStr = rep_sc(tok('char'));
+
+function trimmed<K, T>(p: Parser<K, Token<T>[]>): Parser<K, Token<T>[]> {
+  return apply(p, (r) => {
+    while (r.length > 0 && r[0].text.trim() === '') {
+      r.shift();
+    }
+    while (r.length > 0 && r[r.length - 1].text.trim() === '') {
+      r.pop();
+    }
+    return r;
+  });
+}
+
+function anythingTyped(types: string[]) {
+  return types.map((t) => tok(t)).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives'));
+}
+
+function lrcTimestamp<K, T>(delim: [Parser<K, Token<T>>, Parser<K, Token<T>>]) {
+  const innerTS = alt_sc(
+    apply(seq(num, str(':'), num, str('.'), numStr), (r) =>
+      convertTimeToMs({ mins: r[0], secs: r[2], decimals: r[4] })
+    ),
+    apply(seq(num, str('.'), numStr), (r) => convertTimeToMs({ secs: r[0], decimals: r[2] })),
+    apply(seq(num, str(':'), num), (r) => convertTimeToMs({ mins: r[0], secs: r[2] })),
+    apply(num, (r) => convertTimeToMs({ secs: r }))
+  );
+  return kmid(delim[0], innerTS, delim[1]);
+}
+
+const squareTS = lrcTimestamp([tok('['), tok(']')]);
+const angleTS = lrcTimestamp([tok('<'), tok('>')]);
+
+const lrcTag = apply(
+  seq(
+    tok('['),
+    alphaStr,
+    str(':'),
+    tokenParserToText(trimmed(rep(anythingTyped(['char', '[', ']', '<', '>'])))),
+    tok(']')
+  ),
+  (r) => ({
+    [r[1]]: r[3]
+  })
+);
+
+function tokenParserToText<K, T>(p: Parser<K, Token<T>> | Parser<K, Token<T>[]>): Parser<K, string> {
+  return apply(p, (r: Token<T> | Token<T>[]) => {
+    if (Array.isArray(r)) {
+      return joinTokens(r);
+    }
+    return r.text;
+  });
+}
+
+function joinTokens<T>(tokens: Token<T>[]) {
+  return tokens.map((t) => t.text).join('');
+}
+
+function lrcLine(
+  wordDiv = ''
+): Parser<unknown, ['script_item', ScriptItem] | ['lrc_tag', IDTag] | ['comment', string] | ['empty', string]> {
+  return alt_sc(
+    apply(seq(squareTS, rep_sc(seq(opt_sc(angleTS), trimmed(rep_sc(anythingTyped(['char', '[', ']'])))))), (r) => {
+      const start = r[0];
+
+      const text = r[1]
+        .map((s) => joinTokens(s[1]))
+        .filter((s) => s.trim().length > 0)
+        .join(wordDiv);
+
+      const words = r[1]
+        .filter((s) => joinTokens(s[1]).trim().length > 0)
+        .map((s) => {
+          const wordBegin = s[0];
+          const word = s[1];
+          let ret: Partial<ScriptWordsItem> = { start: wordBegin };
+          if (word[0]) {
+            ret.beginIndex = word[0].pos.columnBegin - 1;
+          }
+          if (word[word.length - 1]) {
+            ret.endIndex = word[word.length - 1].pos.columnEnd;
+          }
+          return ret as ScriptWordsItem; // TODO: Complete this
+        });
+      return ['script_item', { start, text, words } as any as ScriptItem]; // TODO: Complete this
+    }),
+    apply(lrcTag, (r) => ['lrc_tag', r as IDTag]),
+    apply(seq(spaces, str('#'), rep_sc(unicodeStr)), (cmt) => ['comment', cmt[2].join('')] as const)
+  );
+}
+
+export function dumpToken<T>(t: Token<T> | undefined): string {
+  if (t === undefined) {
+    return '<EOF>';
+  }
+  return '`' + t.text + '` -> ' + dumpToken(t.next);
+}
+
+export function parseLRC(
+  input: string,
+  { wordDiv, strict }: { wordDiv?: string; strict?: boolean } = { wordDiv: ' ' }
+): LrcJsonData {
+  const tokenizer = buildLexer([
+    [true, /^\[/gu, '['],
+    [true, /^\]/gu, ']'],
+    [true, /^</gu, '<'],
+    [true, /^>/gu, '>'],
+    [true, /^./gu, 'char']
+  ]);
+
+  const lines = input
+    .split('\n')
+    .filter((line) => line.trim().length > 0)
+    .map((line) => tokenizer.parse(line));
+
+  return lines
+    .map((line) => {
+      const res = expectEOF(lrcLine(wordDiv).parse(line));
+      if (!res.successful) {
+        if (strict) {
+          throw new Error('Failed to parse full line: ' + dumpToken(line));
+        } else {
+          console.error('Failed to parse full line: ' + dumpToken(line));
+        }
+        return null;
+      }
+      return res.candidates[0].result;
+    })
+    .filter((r) => r !== null)
+    .reduce((acc, cur) => {
+      switch (cur[0]) {
+        case 'lrc_tag':
+          Object.assign(acc, cur[1]);
+          return acc;
+        case 'script_item':
+          acc.scripts = acc.scripts || [];
+          acc.scripts.push(cur[1]);
+          return acc;
+        default:
+          return acc;
+      }
+    }, {} as LrcJsonData);
+}
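
Taken together, the new entry point is consumed like this; a sketch mirroring the tests further down. The lyric line is reconstructed from test-01's expected values (the file itself is not shown in this diff), so treat the input as illustrative:

```typescript
import { parseLRC } from '$lib/lyrics/parser';

// '[00:49.588]' is back-derived from the test's expected start of 49000 + 588 ms.
const lrc = parseLRC('[ar: 洛天依]\n[00:49.588]因果与恩怨牵杂等谁来诊断', {
  wordDiv: '', // join word fragments without a separator (CJK lyrics)
  strict: true // throw instead of console.error on unparsable lines
});

lrc.ar;                 // '洛天依'  (ID tags are Object.assign-ed onto the result)
lrc.scripts?.[0].start; // 49588    (49 s -> 49000 ms, plus '588' scaled to 588 ms)
lrc.scripts?.[0].text;  // '因果与恩怨牵杂等谁来诊断'
```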

@@ -1,19 +1,58 @@
 import { describe, expect, it } from 'vitest';
 import fs from 'fs';
-import { ExtractIDTags, splitLine } from '$lib/lyrics/parser';
+import { parseLRC } from '$lib/lyrics/parser';
 
 describe('LRC parser test', () => {
   const test01Buffer = fs.readFileSync('./src/test/resources/test-01.lrc');
   const test01Text = test01Buffer.toString('utf-8');
-  it('Line Split', () => {
-    const lyrics = test01Text;
-    const lines = splitLine(lyrics);
-    expect(lines[26]).toBe('[01:52.991]');
-  });
-  it('IDTag Extract', () => {
-    const lyrics = test01Text;
-    const lines = splitLine(lyrics);
-    const idTags = ExtractIDTags(lines);
-    expect(idTags['ar']).toBe('洛天依');
+  const test02Buffer = fs.readFileSync('./src/test/resources/test-02.lrc');
+  const test02Text = test02Buffer.toString('utf-8');
+  it('Parses test-01.lrc', () => {
+    const result = parseLRC(test01Text, { wordDiv: '', strict: true });
+
+    expect(result.ar).toBe("洛天依");
+    expect(result.ti).toBe("中华少女·终");
+    expect(result.al).toBe("中华少女");
+    expect(result["tool"]).toBe("歌词滚动姬 https://lrc-maker.github.io");
+    expect(result.scripts!![1].text).toBe("因果与恩怨牵杂等谁来诊断");
+    expect(result.scripts!![1].start).toBe(49000 + 588);
+  })
+  it('Parses test-02.lrc', () => {
+    const result = parseLRC(test02Text, { wordDiv: ' ', strict: true });
+
+    expect(result.ti).toBe("Somebody to Love");
+    expect(result.ar).toBe("Jefferson Airplane");
+    expect(result.scripts!![0].text).toBe("When the truth is found to be lies");
+    expect(result.scripts!![0].start).toBe(0);
+    expect(result.scripts!![0].words!![1].beginIndex).toBe("[00:00.00] <00:00.04> When <00:00.16> the".indexOf("the"));
+    expect(result.scripts!![0].words!![1].start).toBe(160);
   });
+  it('Rejects some invalid LRCs', () => {
+    const cases = [
+      "[<00:00.00>] <00:00.04> When <00:00.16> the",
+      "[00:00.00] <00:00.04> <When> <00:00.16> the",
+      "[00:00.00> <00:00.04> When <00:00.16> the",
+      "<00:00.00> <00:00.04> When <00:00.16> the",
+      "<1:00:00.00> <00:00.04> When <00:00.16> the",
+    ]
+    for (const c of cases) {
+      expect(() => parseLRC(c, { strict: true })).toThrow();
+    }
+  })
+  it('Accepts some weird but parsable LRCs', () => {
+    const cases = [
+      "[ti: []]",
+      "[ar: [<]]",
+      "[ar: <ar>]",
+      "[ar: a b c]",
+      "[00:00.00] <00:00.04> When the <00:00.16> the",
+      "[00:00.00] [00:00.04] When [00:00.16] the",
+      "[00:00.0000000] <00:00.04> When <00:00.16> the",
+      "[00:00.00] <00:00.04> [When] <00:00.16> the",
+    ];
+
+    for (const c of cases) {
+      expect(() => parseLRC(c, { strict: false })).not.toThrow();
+    }
+  })
 });
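
A note on the timestamp arithmetic these tests rely on: convertTimeToMs scales the fractional field by its own digit count, which is why "[00:00.0000000]" lands in the "weird but parsable" bucket rather than the invalid one. The same formula, restated with an illustrative helper name (fracMs is not in the commit):

```typescript
// secs * 1000 + decimals / (10^len / 1000), as in the committed convertTimeToMs
const fracMs = (secs: number, decimals: string) =>
  secs * 1000 + Number(decimals) / (Math.pow(10, decimals.length) / 1000);

fracMs(0, '16');      // 160   -> '<00:00.16>' gives words[1].start above
fracMs(49, '588');    // 49588 -> matches the test-01 expectation of 49000 + 588
fracMs(0, '0000000'); // 0     -> extra digits change nothing
```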

src/test/resources/test-02.lrc (normal file, +9 lines)
@@ -0,0 +1,9 @@
+[ti: Somebody to Love]
+[ar: Jefferson Airplane]
+[al: Surrealistic Pillow]
+[lr: Lyricists of that song]
+[length: 2:58]
+
+[00:00.00] <00:00.04> When <00:00.16> the <00:00.82> truth <00:01.29> is <00:01.63> found <00:03.09> to <00:03.37> be <00:05.92> lies
+[00:06.47] <00:07.67> And <00:07.94> all <00:08.36> the <00:08.63> joy <00:10.28> within <00:10.53> you <00:13.09> dies
+[00:13.34] <00:14.32> Don't <00:14.73> you <00:15.14> want <00:15.57> somebody <00:16.09> to <00:16.46> love
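
test-02.lrc exercises the enhanced LRC word-timing extension: each line keeps its leading `[mm:ss.xx]` line timestamp, while the `<mm:ss.xx>` marks before each word are consumed by the `angleTS` branch of `lrcLine` to produce the per-word `start`/`beginIndex`/`endIndex` values checked in the tests above.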