feat: parsec based lyric parser

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
This commit is contained in:
eternal-flame-AD 2024-07-24 20:30:24 -05:00
parent e4677f4117
commit 2341d13721
No known key found for this signature in database
5 changed files with 260 additions and 28 deletions

BIN
bun.lockb

Binary file not shown.

View File

@ -48,6 +48,7 @@
"music-metadata-browser": "^2.5.10",
"node-cache": "^5.1.2",
"rollup-plugin-node-polyfills": "^0.2.1",
"typescript-parsec": "^0.3.4",
"uuid": "^9.0.1"
}
}

View File

@ -1,3 +1,20 @@
import {
alt_sc,
apply,
buildLexer,
expectEOF,
fail,
kmid,
opt_sc,
rep,
rep_sc,
seq,
str,
tok,
type Parser,
type Token
} from 'typescript-parsec';
export interface ScriptItem {
start: number;
text: string;
@ -28,22 +45,188 @@ interface IDTag {
[key: string]: string;
}
/**
 * Breaks raw lyric text into its non-blank lines.
 *
 * The text is cut on `\n` only. A line is kept when something remains after
 * trimming whitespace; kept lines are returned untrimmed and in input order.
 *
 * @param str - Full text to split.
 * @returns The lines that are not empty or whitespace-only.
 */
export function splitLine(str: string) {
  const kept: string[] = [];
  for (const line of str.split('\n')) {
    if (line.trim() !== '') {
      kept.push(line);
    }
  }
  return kept;
}
export function ExtractIDTags(lines: string[]) {
let result: IDTag = {};
const IDTagRegex = /^\[(\w*): (.*?)]$/;
let lastMatch = 0;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const matchResult = line.trim().match(IDTagRegex);
if (matchResult && matchResult.length == 3) {
const tagName = matchResult[1];
const tagValue = matchResult[2];
result[tagName] = tagValue;
}
/**
 * Adds up the pieces of a timestamp and returns the total in milliseconds.
 *
 * Every piece is optional; a missing or falsy piece contributes nothing.
 *
 * @param mins - Whole minutes, as a number or numeric string.
 * @param secs - Whole seconds, as a number or numeric string.
 * @param decimals - The digits written after the decimal point, kept as a
 *   string so that its length gives the scale ("5" is 500 ms, "05" is 50 ms,
 *   "588" is 588 ms).
 * @returns The combined duration in milliseconds.
 */
function convertTimeToMs({
  mins,
  secs,
  decimals
}: {
  mins?: number | string;
  secs?: number | string;
  decimals?: string;
}) {
  const minutePart = mins ? Number(mins) * 60 * 1000 : 0;
  const secondPart = secs ? Number(secs) * 1000 : 0;

  let fractionPart = 0;
  if (decimals) {
    // 10^digits is the denominator of the written fraction of a second.
    const scale = Math.pow(10, decimals.length);
    fractionPart = Number(decimals) / (scale / 1000);
  }

  return [minutePart, secondPart, fractionPart].reduce((total, part) => total + part, 0);
}
}
/**
 * Parser for one decimal digit token ("0" to "9"); yields the digit's numeric value.
 *
 * Built by folding one `str` parser per digit into a chain of `alt_sc`
 * alternatives that ends in a parser which always fails.
 */
const digit = [...'0123456789']
  .map((glyph, value) => apply(str(glyph), () => value))
  .reduce((rest, current) => alt_sc(current, rest), fail('no alternatives'));
/**
 * Parser for a run of digit tokens; yields the digits concatenated as text.
 * The text form keeps leading zeros.
 * NOTE(review): `rep_sc` also succeeds on zero repetitions, so this can yield ''.
 */
const numStr = apply(rep_sc(digit), (digits) => digits.reduce((text, value) => text + value, ''));
/**
 * Parser for a run of digit tokens; yields the base-10 integer value.
 * NOTE(review): because `numStr` may be empty, this can yield NaN.
 */
const num = apply(numStr, (digits) => Number.parseInt(digits, 10));
/**
 * Parser for one ASCII letter token (a-z or A-Z); yields the letter itself.
 *
 * Lower-case letters are tried before upper-case ones. Each case is a chain
 * of `alt_sc` alternatives ending in a parser which always fails.
 */
const alpha = (() => {
  // Builds the 26-letter alternative chain starting at `firstLetter`.
  const letterChain = (firstLetter: string) =>
    Array.from({ length: 26 }, (_, offset) => String.fromCharCode(firstLetter.charCodeAt(0) + offset))
      .map((letter) => apply(str(letter), () => letter))
      .reduce((rest, current) => alt_sc(current, rest), fail('no alternatives'));

  return alt_sc(letterChain('a'), letterChain('A'));
})();
// Run of ASCII letters joined into one string. This uses `rep` rather than `rep_sc`,
// so every possible run length (including the empty run) is offered as a candidate.
const alphaStr = apply(rep(alpha), (r) => r.join(''));
// Run of literal space tokens.
const spaces = rep_sc(str(' '));
// Run of tokens of kind 'char' (the kind name is assigned by the lexer built in parseLRC).
const unicodeStr = rep_sc(tok('char'));
/**
 * Wraps a token-list parser so that whitespace-only tokens are stripped from
 * both ends of its result.
 *
 * The result array is trimmed in place and then returned.
 *
 * @param p - Parser that yields an array of tokens.
 * @returns A parser yielding the same array without blank leading/trailing tokens.
 */
function trimmed<K, T>(p: Parser<K, Token<T>[]>): Parser<K, Token<T>[]> {
  return apply(p, (tokens) => {
    // Count the blank tokens at the front.
    let lead = 0;
    while (lead < tokens.length && tokens[lead].text.trim() === '') {
      lead += 1;
    }
    // Find where the blank tail starts, never crossing the blank head.
    let keepUntil = tokens.length;
    while (keepUntil > lead && tokens[keepUntil - 1].text.trim() === '') {
      keepUntil -= 1;
    }
    // Cut the tail first so the head count still refers to the same positions.
    tokens.splice(keepUntil);
    tokens.splice(0, lead);
    return tokens;
  });
}
/**
 * Builds a parser that accepts a single token whose kind is any of `types`.
 *
 * @param types - Token kinds to accept. With an empty list the resulting
 *   parser always fails with 'no alternatives'.
 * @returns A parser yielding the matched token.
 */
function anythingTyped(types: string[]) {
  const noMatch = fail('no alternatives');
  const kindParsers = types.map((kind) => tok(kind));
  // Each kind is placed in front of the chain built so far.
  return kindParsers.reduce((rest, current) => alt_sc(current, rest), noMatch);
}
/**
 * Builds a parser for a timestamp enclosed by the given delimiters.
 *
 * Accepted inner forms, tried in this order:
 * `mm:ss.fff`, `ss.fff`, `mm:ss`, `ss`.
 *
 * @param delim - Opening and closing delimiter parsers, in that order.
 * @returns A parser yielding the timestamp in milliseconds.
 */
function lrcTimestamp<K, T>(delim: [Parser<K, Token<T>>, Parser<K, Token<T>>]) {
  const [opening, closing] = delim;

  const minutesSecondsFraction = apply(seq(num, str(':'), num, str('.'), numStr), ([mins, , secs, , decimals]) =>
    convertTimeToMs({ mins, secs, decimals })
  );
  const secondsFraction = apply(seq(num, str('.'), numStr), ([secs, , decimals]) =>
    convertTimeToMs({ secs, decimals })
  );
  const minutesSeconds = apply(seq(num, str(':'), num), ([mins, , secs]) => convertTimeToMs({ mins, secs }));
  const secondsOnly = apply(num, (secs) => convertTimeToMs({ secs }));

  const innerTS = alt_sc(minutesSecondsFraction, secondsFraction, minutesSeconds, secondsOnly);
  return kmid(opening, innerTS, closing);
}
// Timestamp written between '[' and ']' tokens; lrcLine uses it as the start time of a lyric line.
const squareTS = lrcTimestamp([tok('['), tok(']')]);
// Timestamp written between '<' and '>' tokens; lrcLine uses it as the optional start time of a text segment.
const angleTS = lrcTimestamp([tok('<'), tok('>')]);
/**
 * Parser for an ID tag of the form `[name: value]`.
 *
 * The name is a run of ASCII letters. The value may contain any token kind,
 * brackets included, and has surrounding whitespace removed. Yields a
 * single-entry object `{ [name]: value }`.
 */
const lrcTag = (() => {
  // `rep` offers every possible length, so the closing `]` can still be matched afterwards.
  const valueTokens = rep(anythingTyped(['char', '[', ']', '<', '>']));
  const value = tokenParserToText(trimmed(valueTokens));

  return apply(seq(tok('['), alphaStr, str(':'), value, tok(']')), ([, tagName, , tagValue]) => ({
    [tagName]: tagValue
  }));
})();
/**
 * Wraps a parser that yields one token or a list of tokens so that it yields
 * their text instead.
 *
 * @param p - Parser yielding a single token or an array of tokens.
 * @returns A parser yielding the token's text, or the concatenated text of the array.
 */
function tokenParserToText<K, T>(p: Parser<K, Token<T>> | Parser<K, Token<T>[]>): Parser<K, string> {
  const toText = (parsed: Token<T> | Token<T>[]) => (Array.isArray(parsed) ? joinTokens(parsed) : parsed.text);
  return apply(p, toText);
}
/**
 * Concatenates the text of the given tokens, in order, with no separator.
 *
 * @param tokens - Tokens whose `text` should be joined.
 * @returns The combined text; '' for an empty list.
 */
function joinTokens<T>(tokens: Token<T>[]) {
  const pieces: string[] = [];
  for (const token of tokens) {
    pieces.push(token.text);
  }
  return pieces.join('');
}
/**
 * Builds the parser for one tokenized LRC line.
 *
 * Alternatives are tried in order:
 *  1. a lyric line: a `[time]` stamp followed by text segments, each optionally
 *     preceded by a `<time>` stamp. Yields `['script_item', item]`.
 *  2. an ID tag such as `[ar: Artist]`. Yields `['lrc_tag', tag]`.
 *  3. a comment: optional spaces, `#`, then free text. Yields `['comment', text]`.
 *
 * The declared `['empty', string]` variant is not produced by any alternative here.
 *
 * @param wordDiv - Separator placed between the non-blank text segments of a
 *   lyric line when they are joined into the line's text.
 * @returns A parser yielding a tagged tuple describing the line.
 */
function lrcLine(
  wordDiv = ''
): Parser<unknown, ['script_item', ScriptItem] | ['lrc_tag', IDTag] | ['comment', string] | ['empty', string]> {
  return alt_sc(
    apply(seq(squareTS, rep_sc(seq(opt_sc(angleTS), trimmed(rep_sc(anythingTyped(['char', '[', ']'])))))), (r) => {
      const start = r[0];
      // Segments that are blank once joined carry no lyric text; drop them for both text and words.
      const segments = r[1].filter((s) => joinTokens(s[1]).trim().length > 0);
      const text = segments.map((s) => joinTokens(s[1])).join(wordDiv);
      const words = segments.map((s) => {
        const wordBegin = s[0];
        const word = s[1];
        const ret: Partial<ScriptWordsItem> = { start: wordBegin };
        if (word[0]) {
          // Lexer columns are used as offsets into the raw line (timestamps included).
          ret.beginIndex = word[0].pos.columnBegin - 1;
        }
        if (word[word.length - 1]) {
          ret.endIndex = word[word.length - 1].pos.columnEnd;
        }
        return ret as ScriptWordsItem; // TODO: Complete this
      });
      return ['script_item', { start, text, words } as any as ScriptItem]; // TODO: Complete this
    }),
    apply(lrcTag, (r) => ['lrc_tag', r as IDTag]),
    // A comment is free text, so every token kind is accepted after '#'
    // (e.g. a commented-out "[00:01.00] ..." line). The tokens are joined by
    // their text: calling Array.prototype.join on the nested token arrays
    // would stringify the token objects instead.
    apply(
      seq(spaces, str('#'), rep_sc(anythingTyped(['char', '[', ']', '<', '>']))),
      (cmt) => ['comment', joinTokens(cmt[2])] as const
    )
  );
}
/**
 * Renders a token chain as text for diagnostics, e.g. `` `a` -> `b` -> <EOF> ``.
 *
 * The chain is walked with a loop rather than recursion, so a very long line
 * cannot overflow the call stack while an error message is being built.
 *
 * @param t - First token of the chain, or `undefined` for an empty chain.
 * @returns Each token's text wrapped in backticks and followed by ` -> `,
 *   terminated by `<EOF>`.
 */
export function dumpToken<T>(t: Token<T> | undefined): string {
  const parts: string[] = [];
  for (let current = t; current !== undefined; current = current.next) {
    parts.push('`' + current.text + '` -> ');
  }
  parts.push('<EOF>');
  return parts.join('');
}
/**
 * Parses LRC lyric text into its ID tags and timed script lines.
 *
 * The input is split into lines, blank lines are skipped, and each remaining
 * line is tokenized and parsed on its own. ID tags are merged into the result
 * as top-level keys; lyric lines are appended to `scripts` in input order.
 * Comment lines are recognised and ignored.
 *
 * @param input - Full LRC text. `\n`, `\r\n`, `\r`, U+2028 and U+2029 are all
 *   treated as line breaks, so no line terminator reaches the lexer (its
 *   catch-all `.` rule cannot match one).
 * @param options.wordDiv - Separator used when joining the text segments of a
 *   lyric line. Defaults to `' '`, also when other options are supplied.
 * @param options.strict - When true, a line that cannot be parsed completely
 *   throws. Otherwise it is reported with `console.error` and skipped.
 * @returns The collected tags and script items.
 * @throws Error in strict mode, for the first line that fails to parse.
 */
export function parseLRC(
  input: string,
  { wordDiv = ' ', strict = false }: { wordDiv?: string; strict?: boolean } = {}
): LrcJsonData {
  const tokenizer = buildLexer([
    [true, /^\[/gu, '['],
    [true, /^\]/gu, ']'],
    [true, /^</gu, '<'],
    [true, /^>/gu, '>'],
    [true, /^./gu, 'char']
  ]);
  // The line parser does not depend on the line, so build it once.
  const lineParser = lrcLine(wordDiv);
  const data = {} as LrcJsonData;

  for (const rawLine of input.split(/\r\n|[\n\r\u2028\u2029]/)) {
    if (rawLine.trim().length === 0) {
      continue;
    }
    const tokens = tokenizer.parse(rawLine);
    const res = expectEOF(lineParser.parse(tokens));
    if (!res.successful) {
      const message = 'Failed to parse full line: ' + dumpToken(tokens);
      if (strict) {
        throw new Error(message);
      }
      console.error(message);
      continue;
    }

    const parsed = res.candidates[0].result;
    switch (parsed[0]) {
      case 'lrc_tag':
        Object.assign(data, parsed[1]);
        break;
      case 'script_item':
        if (!data.scripts) {
          data.scripts = [];
        }
        data.scripts.push(parsed[1]);
        break;
      default:
        // Comments (and any other line kind) contribute nothing to the result.
        break;
    }
  }
  return data;
}

View File

@ -1,19 +1,58 @@
import { describe, expect, it } from 'vitest';
import fs from 'fs';
import { ExtractIDTags, splitLine } from '$lib/lyrics/parser';
import { parseLRC } from '$lib/lyrics/parser';
describe('LRC parser test', () => {
const test01Buffer = fs.readFileSync('./src/test/resources/test-01.lrc');
const test01Text = test01Buffer.toString('utf-8');
it('Line Split', () => {
const lyrics = test01Text;
const lines = splitLine(lyrics);
expect(lines[26]).toBe('[01:52.991]');
});
it('IDTag Extract', () => {
const lyrics = test01Text;
const lines = splitLine(lyrics);
const idTags = ExtractIDTags(lines);
expect(idTags['ar']).toBe('洛天依');
const test02Buffer = fs.readFileSync('./src/test/resources/test-02.lrc');
const test02Text = test02Buffer.toString('utf-8');
// Parses the first fixture in strict mode (any unparsable line throws) with no
// separator between text segments, then checks that ID tags surface as
// top-level keys and that timed lines land in `scripts`.
it('Parses test-01.lrc', () => {
const result = parseLRC(test01Text, { wordDiv: '', strict: true });
expect(result.ar).toBe("洛天依");
expect(result.ti).toBe("中华少女·终");
expect(result.al).toBe("中华少女");
expect(result["tool"]).toBe("歌词滚动姬 https://lrc-maker.github.io");
expect(result.scripts!![1].text).toBe("因果与恩怨牵杂等谁来诊断");
// 49 s + 588 ms — presumably the fixture line is stamped [00:49.588]; the fixture is not visible here.
expect(result.scripts!![1].start).toBe(49000 + 588);
})
// Parses the second fixture (lines carrying per-word <mm:ss.xx> stamps) in
// strict mode, joining the words of a line with a single space.
it('Parses test-02.lrc', () => {
const result = parseLRC(test02Text, { wordDiv: ' ', strict: true });
expect(result.ti).toBe("Somebody to Love");
expect(result.ar).toBe("Jefferson Airplane");
expect(result.scripts!![0].text).toBe("When the truth is found to be lies");
expect(result.scripts!![0].start).toBe(0);
// beginIndex is compared with the word's offset in the raw line, timestamps included.
expect(result.scripts!![0].words!![1].beginIndex).toBe("[00:00.00] <00:00.04> When <00:00.16> the".indexOf("the"));
// The second word is stamped <00:00.16>, i.e. 160 ms.
expect(result.scripts!![0].words!![1].start).toBe(160);
});
// Each case is a single malformed line; in strict mode parseLRC must throw for every one of them.
it('Rejects some invalid LRCs', () => {
const cases = [
"[<00:00.00>] <00:00.04> When <00:00.16> the",
"[00:00.00] <00:00.04> <When> <00:00.16> the",
"[00:00.00> <00:00.04> When <00:00.16> the",
"<00:00.00> <00:00.04> When <00:00.16> the",
"<1:00:00.00> <00:00.04> When <00:00.16> the",
]
for (const c of cases) {
// Only `strict` is passed here, so the word separator is whatever parseLRC falls back to.
expect(() => parseLRC(c, { strict: true })).toThrow();
}
})
// Unusual single-line inputs (brackets inside tag values, repeated or
// bracketed words, long fractions) that must not make parseLRC throw.
it('Accepts some weird but parsable LRCs', () => {
const cases = [
"[ti: []]",
"[ar: [<]]",
"[ar: <ar>]",
"[ar: a b c]",
"[00:00.00] <00:00.04> When the <00:00.16> the",
"[00:00.00] [00:00.04] When [00:00.16] the",
"[00:00.0000000] <00:00.04> When <00:00.16> the",
"[00:00.00] <00:00.04> [When] <00:00.16> the",
];
for (const c of cases) {
// NOTE(review): with strict: false a line that fails to parse is only logged and
// skipped, so this assertion does not prove the case was accepted — confirm
// whether strict: true was intended.
expect(() => parseLRC(c, { strict: false })).not.toThrow();
}
})
});

View File

@ -0,0 +1,9 @@
[ti: Somebody to Love]
[ar: Jefferson Airplane]
[al: Surrealistic Pillow]
[lr: Lyricists of that song]
[length: 2:58]
[00:00.00] <00:00.04> When <00:00.16> the <00:00.82> truth <00:01.29> is <00:01.63> found <00:03.09> to <00:03.37> be <00:05.92> lies
[00:06.47] <00:07.67> And <00:07.94> all <00:08.36> the <00:08.63> joy <00:10.28> within <00:10.53> you <00:13.09> dies
[00:13.34] <00:14.32> Don't <00:14.73> you <00:15.14> want <00:15.57> somebody <00:16.09> to <00:16.46> love