feat: parsec-based lyric parser

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
2024-07-24 20:30:24 -05:00 · 2024-07-24 20:30:24 -05:00 · 2341d13721
commit 2341d13721
parent e4677f4117
5 changed files with 260 additions and 28 deletions
--- a/bun.lockb
+++ b/bun.lockb
--- a/package.json
+++ b/package.json
@ -48,6 +48,7 @@
    "music-metadata-browser": "^2.5.10",
    "node-cache": "^5.1.2",
    "rollup-plugin-node-polyfills": "^0.2.1",
+    "typescript-parsec": "^0.3.4",
    "uuid": "^9.0.1"
  }
 }
--- a/src/lib/lyrics/parser.ts
+++ b/src/lib/lyrics/parser.ts
@ -1,3 +1,20 @@
+import {
+    alt_sc,
+    apply,
+    buildLexer,
+    expectEOF,
+    fail,
+    kmid,
+    opt_sc,
+    rep,
+    rep_sc,
+    seq,
+    str,
+    tok,
+    type Parser,
+    type Token
+} from 'typescript-parsec';
+
 export interface ScriptItem {
    start: number;
    text: string;
@ -28,22 +45,188 @@ interface IDTag {
    [key: string]: string;
 }

-export function splitLine(str: string) {
-    return str.split('\n').filter((str) => str.trim() !== '');
-}
-
-export function ExtractIDTags(lines: string[]) {
-    let result: IDTag = {};
-    const IDTagRegex = /^\[(\w*): (.*?)]$/;
-    let lastMatch = 0;
-    for (let i = 0; i < lines.length; i++) {
-        const line = lines[i];
-        const matchResult = line.trim().match(IDTagRegex);
-        if (matchResult && matchResult.length == 3) {
-            const tagName = matchResult[1];
-            const tagValue = matchResult[2];
-            result[tagName] = tagValue;
-        }
+function convertTimeToMs({ // Sums whichever time parts are present and returns the total in milliseconds.
+    mins,
+    secs,
+    decimals
+}: {
+    mins?: number | string; // whole minutes; ignored when falsy (undefined, 0, '' or NaN)
+    secs?: number | string; // whole seconds; ignored when falsy
+    decimals?: string; // digits after the decimal point, passed as text so the digit count is known
+}) {
+    let result = 0;
+    if (mins) {
+        result += Number(mins) * 60 * 1000;
+    }
+    if (secs) {
+        result += Number(secs) * 1000;
+    }
+    if (decimals) {
+        const denom = Math.pow(10, decimals.length); // 10^(digit count): "5" is 5/10 s, "05" is 5/100 s
+        result += Number(decimals) / (denom / 1000); // i.e. decimals / denom * 1000; more than three digits gives fractional ms
    }
    return result;
-}
+}
+
+// Matches one single-character token '0'..'9' and yields its numeric value.
+const digit = Array.from({ length: 10 }, (_, i) => apply(str(i.toString()), (_) => i)).reduce(
+    (acc, cur) => alt_sc(cur, acc),
+    fail('no alternatives')
+);
+// One or more digits, kept as text so leading zeros survive (needed for fractional seconds).
+// At least one digit is required: a bare `rep_sc(digit)` also matches nothing at all, which
+// let inputs such as `[]`, `[:]` or `[.5]` pass as timestamps with the missing parts read as 0.
+const numStr = apply(seq(digit, rep_sc(digit)), ([first, rest]) => [first, ...rest].join(''));
+// The same digit run as a base-10 integer.
+const num = apply(numStr, (r) => parseInt(r, 10));
+// Builds a parser for any one of the 26 consecutive letters starting at `first`;
+// the parser yields the matched letter as a string.
+const letterRun = (first: string) => {
+    const base = first.charCodeAt(0);
+    return Array.from({ length: 26 }, (_, offset) => String.fromCharCode(base + offset))
+        .map((letter) => apply(str(letter), () => letter))
+        .reduce((rest, next) => alt_sc(next, rest), fail('no alternatives'));
+};
+// A single ASCII letter: lowercase alternatives are tried before uppercase ones.
+const alpha = alt_sc(letterRun('a'), letterRun('A'));
+
+// Zero or more ASCII letters joined into one string (used for tag names).
+const alphaStr = apply(rep(alpha), (letters) => letters.join(''));
+// Zero or more literal space characters.
+const spaces = rep_sc(str(' '));
+
+// Zero or more tokens of kind 'char', returned as a token list.
+const unicodeStr = rep_sc(tok('char'));
+
+/**
+ * Wraps a token-list parser so that whitespace-only tokens are removed from both ends of
+ * its result. The result array is trimmed in place and returned; inner tokens are untouched.
+ */
+function trimmed<K, T>(p: Parser<K, Token<T>[]>): Parser<K, Token<T>[]> {
+    const isBlank = (token: Token<T>) => token.text.trim() === '';
+    return apply(p, (tokens) => {
+        let start = 0;
+        while (start < tokens.length && isBlank(tokens[start])) {
+            start++;
+        }
+        let end = tokens.length;
+        while (end > start && isBlank(tokens[end - 1])) {
+            end--;
+        }
+        tokens.splice(end); // cut the tail first so `start` still points at the right element
+        tokens.splice(0, start);
+        return tokens;
+    });
+}
+
+/**
+ * Returns a parser that accepts exactly one token whose kind is any of `types`.
+ * With an empty list the resulting parser always fails with 'no alternatives'.
+ */
+function anythingTyped(types: string[]) {
+    const perKind = types.map((kind) => tok(kind));
+    return perKind.reduce((rest, next) => alt_sc(next, rest), fail('no alternatives'));
+}
+
+/**
+ * Builds a parser for a timestamp enclosed by the given opening and closing delimiter parsers.
+ * Accepted bodies are `mm:ss.fff`, `ss.fff`, `mm:ss` and `ss`; the parser yields milliseconds.
+ */
+function lrcTimestamp<K, T>(delim: [Parser<K, Token<T>>, Parser<K, Token<T>>]) {
+    const [open, close] = delim;
+    const minSecFrac = apply(seq(num, str(':'), num, str('.'), numStr), ([mins, , secs, , decimals]) =>
+        convertTimeToMs({ mins, secs, decimals })
+    );
+    const secFrac = apply(seq(num, str('.'), numStr), ([secs, , decimals]) => convertTimeToMs({ secs, decimals }));
+    const minSec = apply(seq(num, str(':'), num), ([mins, , secs]) => convertTimeToMs({ mins, secs }));
+    const secOnly = apply(num, (secs) => convertTimeToMs({ secs }));
+    // Order matters: the first alternative that matches wins, so longer forms come first.
+    const innerTS = alt_sc(minSecFrac, secFrac, minSec, secOnly);
+    return kmid(open, innerTS, close);
+}
+
+// `[time]` marks the start of a lyric line; `<time>` marks the start of a word inside it.
+const squareTS = lrcTimestamp([tok('['), tok(']')]);
+const angleTS = lrcTimestamp([tok('<'), tok('>')]);
+
+// Tag value: any run of tokens (brackets included) with surrounding whitespace removed, as text.
+const lrcTagValue = tokenParserToText(trimmed(rep(anythingTyped(['char', '[', ']', '<', '>']))));
+// ID tag of the form `[name: value]`, yielded as a single-entry object `{ name: value }`.
+const lrcTag = apply(seq(tok('['), alphaStr, str(':'), lrcTagValue, tok(']')), ([, key, , value]) => ({
+    [key]: value
+}));
+
+/**
+ * Adapts a parser that yields one token, or a list of tokens, into a parser that yields
+ * their text: a single token gives its own text, a list gives the concatenated texts.
+ */
+function tokenParserToText<K, T>(p: Parser<K, Token<T>> | Parser<K, Token<T>[]>): Parser<K, string> {
+    return apply(p, (parsed: Token<T> | Token<T>[]) => (Array.isArray(parsed) ? joinTokens(parsed) : parsed.text));
+}
+
+/** Concatenates the text of every token, in order, with no separator. */
+function joinTokens<T>(tokens: Token<T>[]) {
+    let text = '';
+    for (const token of tokens) {
+        text += token.text;
+    }
+    return text;
+}
+
+/**
+ * Builds the parser for a single tokenized LRC line. Alternatives are tried in this order:
+ *
+ * 1. a lyric line: `[time]` followed by text segments, each optionally led by a `<time>` word stamp;
+ * 2. an ID tag: `[name: value]`;
+ * 3. a comment: optional spaces, `#`, then arbitrary text.
+ *
+ * @param wordDiv separator placed between the non-empty text segments when building `text`
+ * @returns a parser yielding a `[kind, payload]` pair
+ */
+function lrcLine(
+    wordDiv = ''
+): Parser<unknown, ['script_item', ScriptItem] | ['lrc_tag', IDTag] | ['comment', string] | ['empty', string]> {
+    // Lyric text may contain square brackets, but angle brackets are reserved for word timestamps.
+    const segment = seq(opt_sc(angleTS), trimmed(rep_sc(anythingTyped(['char', '[', ']']))));
+    return alt_sc(
+        apply(seq(squareTS, rep_sc(segment)), ([start, segments]): ['script_item', ScriptItem] => {
+            // Segments that are empty after trimming carry no word and are skipped.
+            const spoken = segments.filter(([, tokens]) => joinTokens(tokens).trim().length > 0);
+            const text = spoken.map(([, tokens]) => joinTokens(tokens)).join(wordDiv);
+
+            const words = spoken.map(([wordBegin, tokens]) => {
+                const word: Partial<ScriptWordsItem> = { start: wordBegin };
+                const first = tokens[0];
+                const last = tokens[tokens.length - 1];
+                if (first) {
+                    // Lexer columns are 1-based; this converts to a 0-based offset in the source line.
+                    word.beginIndex = first.pos.columnBegin - 1;
+                }
+                if (last) {
+                    // NOTE(review): columnEnd is stored unadjusted - confirm the intended endIndex convention.
+                    word.endIndex = last.pos.columnEnd;
+                }
+                return word as ScriptWordsItem; // TODO: Complete this
+            });
+            return ['script_item', { start, text, words } as any as ScriptItem]; // TODO: Complete this
+        }),
+        apply(lrcTag, (r): ['lrc_tag', IDTag] => ['lrc_tag', r as IDTag]),
+        // Comment text is a flat token list joined via joinTokens; calling `.join('')` on a nested
+        // token array stringifies the token objects instead of their text. Brackets are allowed too,
+        // so a comment such as `# [chorus]` does not fail the whole line.
+        apply(
+            seq(spaces, str('#'), rep_sc(anythingTyped(['char', '[', ']', '<', '>']))),
+            (cmt): ['comment', string] => ['comment', joinTokens(cmt[2])]
+        )
+    );
+}
+
+/**
+ * Renders a token and every token after it for diagnostics, e.g. `` `a` -> `b` -> <EOF> ``.
+ * Passing `undefined` (no tokens) yields just `<EOF>`.
+ */
+export function dumpToken<T>(t: Token<T> | undefined): string {
+    let rendered = '';
+    for (let cur = t; cur !== undefined; cur = cur.next) {
+        rendered += '`' + cur.text + '` -> ';
+    }
+    return rendered + '<EOF>';
+}
+
+/**
+ * Parses LRC lyric text into a single object: ID tags become top-level fields and timed
+ * lyric lines are appended to `scripts` in input order. Blank lines and comments are ignored.
+ *
+ * @param input complete LRC text; LF, CRLF and CR line endings are all accepted
+ * @param options.wordDiv separator used to join the words of a line; defaults to `' '`
+ *   (the default now also applies when an options object is given without `wordDiv`)
+ * @param options.strict when true, a line that cannot be parsed throws; otherwise it is
+ *   reported with `console.error` and skipped
+ * @throws Error in strict mode when a line does not match the grammar
+ */
+export function parseLRC(
+    input: string,
+    { wordDiv = ' ', strict = false }: { wordDiv?: string; strict?: boolean } = {}
+): LrcJsonData {
+    const tokenizer = buildLexer([
+        [true, /^\[/gu, '['],
+        [true, /^\]/gu, ']'],
+        [true, /^</gu, '<'],
+        [true, /^>/gu, '>'],
+        [true, /^./gu, 'char']
+    ]);
+    // The grammar depends only on wordDiv, so build it once rather than once per line.
+    const lineParser = lrcLine(wordDiv);
+    const data = {} as LrcJsonData;
+
+    // Split on every newline style: a trailing '\r' is not matched by `.` and would make the lexer throw.
+    for (const rawLine of input.split(/\r\n|\r|\n/)) {
+        if (rawLine.trim().length === 0) {
+            continue;
+        }
+        const firstToken = tokenizer.parse(rawLine);
+        const res = expectEOF(lineParser.parse(firstToken));
+        if (!res.successful) {
+            const message = 'Failed to parse full line: ' + dumpToken(firstToken);
+            if (strict) {
+                throw new Error(message);
+            }
+            console.error(message);
+            continue;
+        }
+        const parsed = res.candidates[0].result;
+        switch (parsed[0]) {
+            case 'lrc_tag':
+                Object.assign(data, parsed[1]);
+                break;
+            case 'script_item':
+                data.scripts = data.scripts || [];
+                data.scripts.push(parsed[1]);
+                break;
+            default:
+                // Comments and other kinds carry nothing for the result.
+                break;
+        }
+    }
+    return data;
+}
--- a/src/test/lrcParser.test.ts
+++ b/src/test/lrcParser.test.ts
@ -1,19 +1,58 @@
 import { describe, expect, it } from 'vitest';
 import fs from 'fs';
-import { ExtractIDTags, splitLine } from '$lib/lyrics/parser';
+import { parseLRC } from '$lib/lyrics/parser';

 describe('LRC parser test', () => {
    const test01Buffer = fs.readFileSync('./src/test/resources/test-01.lrc');
    const test01Text = test01Buffer.toString('utf-8');
-    it('Line Split', () => {
-        const lyrics = test01Text;
-        const lines = splitLine(lyrics);
-        expect(lines[26]).toBe('[01:52.991]');
-    });
-    it('IDTag Extract', () => {
-        const lyrics = test01Text;
-        const lines = splitLine(lyrics);
-        const idTags = ExtractIDTags(lines);
-        expect(idTags['ar']).toBe('洛天依');
+    const test02Buffer = fs.readFileSync('./src/test/resources/test-02.lrc'); // second fixture: lyric lines carry inline <mm:ss.xx> word timestamps
+    const test02Text = test02Buffer.toString('utf-8');
+    // ID tags must surface as top-level fields, and the second timed line must keep its text and start time.
+    it('Parses test-01.lrc', () => {
+        const result = parseLRC(test01Text, { wordDiv: '', strict: true });
+        // Optional chaining makes a missing entry fail the assertion instead of throwing a TypeError.
+        const secondLine = result.scripts?.[1];
+
+        expect(result.ar).toBe('洛天依');
+        expect(result.ti).toBe('中华少女·终');
+        expect(result.al).toBe('中华少女');
+        expect(result['tool']).toBe('歌词滚动姬 https://lrc-maker.github.io');
+        expect(secondLine?.text).toBe('因果与恩怨牵杂等谁来诊断');
+        expect(secondLine?.start).toBe(49000 + 588);
+    });
+    // Word-timed fixture: checks ID tags, the joined line text, and the offset and start time of the second word.
+    it('Parses test-02.lrc', () => {
+        const result = parseLRC(test02Text, { wordDiv: ' ', strict: true });
+        // Optional chaining makes a missing entry fail the assertion instead of throwing a TypeError.
+        const firstLine = result.scripts?.[0];
+        const secondWord = firstLine?.words?.[1];
+
+        expect(result.ti).toBe('Somebody to Love');
+        expect(result.ar).toBe('Jefferson Airplane');
+        expect(firstLine?.text).toBe('When the truth is found to be lies');
+        expect(firstLine?.start).toBe(0);
+        expect(secondWord?.beginIndex).toBe('[00:00.00] <00:00.04> When <00:00.16> the'.indexOf('the'));
+        expect(secondWord?.start).toBe(160);
    });
+    // Each line breaks the grammar in one way, so strict parsing has to throw for all of them.
+    it('Rejects some invalid LRCs', () => {
+        const invalidLines = [
+            '[<00:00.00>] <00:00.04> When <00:00.16> the', // word stamp nested inside the line stamp
+            '[00:00.00] <00:00.04> <When> <00:00.16> the', // angle brackets around a word
+            '[00:00.00> <00:00.04> When <00:00.16> the', // mismatched delimiters
+            '<00:00.00> <00:00.04> When <00:00.16> the', // no square-bracket line stamp
+            '<1:00:00.00> <00:00.04> When <00:00.16> the' // three-part time in angle brackets
+        ];
+        invalidLines.forEach((line) => {
+            expect(() => parseLRC(line, { strict: true })).toThrow();
+        });
+    });
+    // Unusual but grammatical lines must parse. Strict mode is required here: in non-strict mode
+    // a parse failure is only logged, so `not.toThrow()` would pass even for rejected input.
+    it('Accepts some weird but parsable LRCs', () => {
+        const cases = [
+            '[ti: []]',
+            '[ar: [<]]',
+            '[ar: <ar>]',
+            '[ar: a b c]',
+            '[00:00.00] <00:00.04> When the <00:00.16> the',
+            '[00:00.00] [00:00.04] When [00:00.16] the',
+            '[00:00.0000000] <00:00.04> When <00:00.16> the',
+            '[00:00.00] <00:00.04> [When] <00:00.16> the'
+        ];
+
+        for (const c of cases) {
+            expect(() => parseLRC(c, { strict: true })).not.toThrow();
+        }
+    });
 });
--- a/src/test/resources/test-02.lrc
+++ b/src/test/resources/test-02.lrc
@ -0,0 +1,9 @@
+[ti: Somebody to Love]
+[ar: Jefferson Airplane]
+[al: Surrealistic Pillow]
+[lr: Lyricists of that song]
+[length: 2:58]
+
+[00:00.00] <00:00.04> When <00:00.16> the <00:00.82> truth <00:01.29> is <00:01.63> found <00:03.09> to <00:03.37> be <00:05.92> lies 
+[00:06.47] <00:07.67> And <00:07.94> all <00:08.36> the <00:08.63> joy <00:10.28> within <00:10.53> you <00:13.09> dies 
+[00:13.34] <00:14.32> Don't <00:14.73> you <00:15.14> want <00:15.57> somebody <00:16.09> to <00:16.46> love