feat: parsec based lyric parser
Signed-off-by: eternal-flame-AD <yume@yumechi.jp>
This commit is contained in:
parent
e4677f4117
commit
2341d13721
@ -48,6 +48,7 @@
|
||||
"music-metadata-browser": "^2.5.10",
|
||||
"node-cache": "^5.1.2",
|
||||
"rollup-plugin-node-polyfills": "^0.2.1",
|
||||
"typescript-parsec": "^0.3.4",
|
||||
"uuid": "^9.0.1"
|
||||
}
|
||||
}
|
||||
|
@ -1,3 +1,20 @@
|
||||
import {
|
||||
alt_sc,
|
||||
apply,
|
||||
buildLexer,
|
||||
expectEOF,
|
||||
fail,
|
||||
kmid,
|
||||
opt_sc,
|
||||
rep,
|
||||
rep_sc,
|
||||
seq,
|
||||
str,
|
||||
tok,
|
||||
type Parser,
|
||||
type Token
|
||||
} from 'typescript-parsec';
|
||||
|
||||
export interface ScriptItem {
|
||||
start: number;
|
||||
text: string;
|
||||
@ -28,22 +45,188 @@ interface IDTag {
|
||||
[key: string]: string;
|
||||
}
|
||||
|
||||
export function splitLine(str: string) {
|
||||
return str.split('\n').filter((str) => str.trim() !== '');
|
||||
}
|
||||
|
||||
export function ExtractIDTags(lines: string[]) {
|
||||
let result: IDTag = {};
|
||||
const IDTagRegex = /^\[(\w*): (.*?)]$/;
|
||||
let lastMatch = 0;
|
||||
for (let i = 0; i < lines.length; i++) {
|
||||
const line = lines[i];
|
||||
const matchResult = line.trim().match(IDTagRegex);
|
||||
if (matchResult && matchResult.length == 3) {
|
||||
const tagName = matchResult[1];
|
||||
const tagValue = matchResult[2];
|
||||
result[tagName] = tagValue;
|
||||
}
|
||||
function convertTimeToMs({
|
||||
mins,
|
||||
secs,
|
||||
decimals
|
||||
}: {
|
||||
mins?: number | string;
|
||||
secs?: number | string;
|
||||
decimals?: string;
|
||||
}) {
|
||||
let result = 0;
|
||||
if (mins) {
|
||||
result += Number(mins) * 60 * 1000;
|
||||
}
|
||||
if (secs) {
|
||||
result += Number(secs) * 1000;
|
||||
}
|
||||
if (decimals) {
|
||||
const denom = Math.pow(10, decimals.length);
|
||||
result += Number(decimals) / (denom / 1000);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
const digit = Array.from({ length: 10 }, (_, i) => apply(str(i.toString()), (_) => i)).reduce(
|
||||
(acc, cur) => alt_sc(cur, acc),
|
||||
fail('no alternatives')
|
||||
);
|
||||
const numStr = apply(rep_sc(digit), (r) => r.join(''));
|
||||
const num = apply(numStr, (r) => parseInt(r));
|
||||
const alpha = alt_sc(
|
||||
Array.from({ length: 26 }, (_, i) =>
|
||||
apply(str(String.fromCharCode('a'.charCodeAt(0) + i)), (_) => String.fromCharCode('a'.charCodeAt(0) + i))
|
||||
).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives')),
|
||||
Array.from({ length: 26 }, (_, i) =>
|
||||
apply(str(String.fromCharCode('A'.charCodeAt(0) + i)), (_) => String.fromCharCode('A'.charCodeAt(0) + i))
|
||||
).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives'))
|
||||
);
|
||||
|
||||
const alphaStr = apply(rep(alpha), (r) => r.join(''));
|
||||
const spaces = rep_sc(str(' '));
|
||||
|
||||
const unicodeStr = rep_sc(tok('char'));
|
||||
|
||||
function trimmed<K, T>(p: Parser<K, Token<T>[]>): Parser<K, Token<T>[]> {
|
||||
return apply(p, (r) => {
|
||||
while (r.length > 0 && r[0].text.trim() === '') {
|
||||
r.shift();
|
||||
}
|
||||
while (r.length > 0 && r[r.length - 1].text.trim() === '') {
|
||||
r.pop();
|
||||
}
|
||||
return r;
|
||||
});
|
||||
}
|
||||
|
||||
function anythingTyped(types: string[]) {
|
||||
return types.map((t) => tok(t)).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives'));
|
||||
}
|
||||
|
||||
function lrcTimestamp<K, T>(delim: [Parser<K, Token<T>>, Parser<K, Token<T>>]) {
|
||||
const innerTS = alt_sc(
|
||||
apply(seq(num, str(':'), num, str('.'), numStr), (r) =>
|
||||
convertTimeToMs({ mins: r[0], secs: r[2], decimals: r[4] })
|
||||
),
|
||||
apply(seq(num, str('.'), numStr), (r) => convertTimeToMs({ secs: r[0], decimals: r[2] })),
|
||||
apply(seq(num, str(':'), num), (r) => convertTimeToMs({ mins: r[0], secs: r[2] })),
|
||||
apply(num, (r) => convertTimeToMs({ secs: r }))
|
||||
);
|
||||
return kmid(delim[0], innerTS, delim[1]);
|
||||
}
|
||||
|
||||
// Timestamp in square brackets, e.g. "[01:52.99]" — marks a line start.
const squareTS = lrcTimestamp([tok('['), tok(']')]);
// Timestamp in angle brackets, e.g. "<00:00.04>" — marks a per-word start.
const angleTS = lrcTimestamp([tok('<'), tok('>')]);
|
||||
|
||||
const lrcTag = apply(
|
||||
seq(
|
||||
tok('['),
|
||||
alphaStr,
|
||||
str(':'),
|
||||
tokenParserToText(trimmed(rep(anythingTyped(['char', '[', ']', '<', '>'])))),
|
||||
tok(']')
|
||||
),
|
||||
(r) => ({
|
||||
[r[1]]: r[3]
|
||||
})
|
||||
);
|
||||
|
||||
function tokenParserToText<K, T>(p: Parser<K, Token<T>> | Parser<K, Token<T>[]>): Parser<K, string> {
|
||||
return apply(p, (r: Token<T> | Token<T>[]) => {
|
||||
if (Array.isArray(r)) {
|
||||
return joinTokens(r);
|
||||
}
|
||||
return r.text;
|
||||
});
|
||||
}
|
||||
|
||||
function joinTokens<T>(tokens: Token<T>[]) {
|
||||
return tokens.map((t) => t.text).join('');
|
||||
}
|
||||
|
||||
/**
 * Parser for a single LRC line. Produces a tagged pair: a timed lyric line
 * ('script_item'), an ID tag ('lrc_tag'), or a '#'-prefixed comment
 * ('comment').
 *
 * NOTE(review): the declared return type also lists an 'empty' variant,
 * but no alternative below ever produces it — confirm whether it is dead.
 *
 * @param wordDiv separator inserted between word groups when rebuilding
 *                the full line text (default: '')
 */
function lrcLine(
  wordDiv = ''
): Parser<unknown, ['script_item', ScriptItem] | ['lrc_tag', IDTag] | ['comment', string] | ['empty', string]> {
  return alt_sc(
    // "[ts]" followed by repeated (optional "<ts>") + word-group tokens.
    apply(seq(squareTS, rep_sc(seq(opt_sc(angleTS), trimmed(rep_sc(anythingTyped(['char', '[', ']'])))))), (r) => {
      // Line start time in milliseconds, from the leading "[...]" timestamp.
      const start = r[0];

      // Full line text: word groups joined by wordDiv, blank groups dropped.
      const text = r[1]
        .map((s) => joinTokens(s[1]))
        .filter((s) => s.trim().length > 0)
        .join(wordDiv);

      // Per-word timing entries, one per non-blank word group.
      const words = r[1]
        .filter((s) => joinTokens(s[1]).trim().length > 0)
        .map((s) => {
          // Per-word "<...>" timestamp in ms; undefined when absent (opt_sc).
          const wordBegin = s[0];
          const word = s[1];
          let ret: Partial<ScriptWordsItem> = { start: wordBegin };
          // Token columns are 1-based; beginIndex is converted to 0-based.
          if (word[0]) {
            ret.beginIndex = word[0].pos.columnBegin - 1;
          }
          if (word[word.length - 1]) {
            ret.endIndex = word[word.length - 1].pos.columnEnd;
          }
          return ret as ScriptWordsItem; // TODO: Complete this
        });
      return ['script_item', { start, text, words } as any as ScriptItem]; // TODO: Complete this
    }),
    // "[name: value]" ID tag.
    apply(lrcTag, (r) => ['lrc_tag', r as IDTag]),
    // "# ..." comment line (leading spaces allowed).
    // NOTE(review): cmt[2] is an array of token arrays, so join('') relies
    // on default token stringification — verify the intended comment text.
    apply(seq(spaces, str('#'), rep_sc(unicodeStr)), (cmt) => ['comment', cmt[2].join('')] as const)
  );
}
|
||||
|
||||
export function dumpToken<T>(t: Token<T> | undefined): string {
|
||||
if (t === undefined) {
|
||||
return '<EOF>';
|
||||
}
|
||||
return '`' + t.text + '` -> ' + dumpToken(t.next);
|
||||
}
|
||||
|
||||
export function parseLRC(
|
||||
input: string,
|
||||
{ wordDiv, strict }: { wordDiv?: string; strict?: boolean } = { wordDiv: ' ' }
|
||||
): LrcJsonData {
|
||||
const tokenizer = buildLexer([
|
||||
[true, /^\[/gu, '['],
|
||||
[true, /^\]/gu, ']'],
|
||||
[true, /^</gu, '<'],
|
||||
[true, /^>/gu, '>'],
|
||||
[true, /^./gu, 'char']
|
||||
]);
|
||||
|
||||
const lines = input
|
||||
.split('\n')
|
||||
.filter((line) => line.trim().length > 0)
|
||||
.map((line) => tokenizer.parse(line));
|
||||
|
||||
return lines
|
||||
.map((line) => {
|
||||
const res = expectEOF(lrcLine(wordDiv).parse(line));
|
||||
if (!res.successful) {
|
||||
if (strict) {
|
||||
throw new Error('Failed to parse full line: ' + dumpToken(line));
|
||||
} else {
|
||||
console.error('Failed to parse full line: ' + dumpToken(line));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
return res.candidates[0].result;
|
||||
})
|
||||
.filter((r) => r !== null)
|
||||
.reduce((acc, cur) => {
|
||||
switch (cur[0]) {
|
||||
case 'lrc_tag':
|
||||
Object.assign(acc, cur[1]);
|
||||
return acc;
|
||||
case 'script_item':
|
||||
acc.scripts = acc.scripts || [];
|
||||
acc.scripts.push(cur[1]);
|
||||
return acc;
|
||||
default:
|
||||
return acc;
|
||||
}
|
||||
}, {} as LrcJsonData);
|
||||
}
|
||||
|
@ -1,19 +1,58 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import fs from 'fs';
|
||||
import { ExtractIDTags, splitLine } from '$lib/lyrics/parser';
|
||||
import { parseLRC } from '$lib/lyrics/parser';
|
||||
|
||||
describe('LRC parser test', () => {
|
||||
const test01Buffer = fs.readFileSync('./src/test/resources/test-01.lrc');
|
||||
const test01Text = test01Buffer.toString('utf-8');
|
||||
it('Line Split', () => {
|
||||
const lyrics = test01Text;
|
||||
const lines = splitLine(lyrics);
|
||||
expect(lines[26]).toBe('[01:52.991]');
|
||||
});
|
||||
it('IDTag Extract', () => {
|
||||
const lyrics = test01Text;
|
||||
const lines = splitLine(lyrics);
|
||||
const idTags = ExtractIDTags(lines);
|
||||
expect(idTags['ar']).toBe('洛天依');
|
||||
const test02Buffer = fs.readFileSync('./src/test/resources/test-02.lrc');
|
||||
const test02Text = test02Buffer.toString('utf-8');
|
||||
it('Parses test-01.lrc', () => {
|
||||
const result = parseLRC(test01Text, { wordDiv: '', strict: true });
|
||||
|
||||
expect(result.ar).toBe("洛天依");
|
||||
expect(result.ti).toBe("中华少女·终");
|
||||
expect(result.al).toBe("中华少女");
|
||||
expect(result["tool"]).toBe("歌词滚动姬 https://lrc-maker.github.io");
|
||||
expect(result.scripts!![1].text).toBe("因果与恩怨牵杂等谁来诊断");
|
||||
expect(result.scripts!![1].start).toBe(49000 + 588);
|
||||
})
|
||||
it('Parses test-02.lrc', () => {
|
||||
const result = parseLRC(test02Text, { wordDiv: ' ', strict: true });
|
||||
|
||||
expect(result.ti).toBe("Somebody to Love");
|
||||
expect(result.ar).toBe("Jefferson Airplane");
|
||||
expect(result.scripts!![0].text).toBe("When the truth is found to be lies");
|
||||
expect(result.scripts!![0].start).toBe(0);
|
||||
expect(result.scripts!![0].words!![1].beginIndex).toBe("[00:00.00] <00:00.04> When <00:00.16> the".indexOf("the"));
|
||||
expect(result.scripts!![0].words!![1].start).toBe(160);
|
||||
});
|
||||
it('Rejects some invalid LRCs', () => {
|
||||
const cases = [
|
||||
"[<00:00.00>] <00:00.04> When <00:00.16> the",
|
||||
"[00:00.00] <00:00.04> <When> <00:00.16> the",
|
||||
"[00:00.00> <00:00.04> When <00:00.16> the",
|
||||
"<00:00.00> <00:00.04> When <00:00.16> the",
|
||||
"<1:00:00.00> <00:00.04> When <00:00.16> the",
|
||||
]
|
||||
for (const c of cases) {
|
||||
expect(() => parseLRC(c, { strict: true })).toThrow();
|
||||
}
|
||||
})
|
||||
it('Accepts some weird but parsable LRCs', () => {
|
||||
const cases = [
|
||||
"[ti: []]",
|
||||
"[ar: [<]]",
|
||||
"[ar: <ar>]",
|
||||
"[ar: a b c]",
|
||||
"[00:00.00] <00:00.04> When the <00:00.16> the",
|
||||
"[00:00.00] [00:00.04] When [00:00.16] the",
|
||||
"[00:00.0000000] <00:00.04> When <00:00.16> the",
|
||||
"[00:00.00] <00:00.04> [When] <00:00.16> the",
|
||||
];
|
||||
|
||||
for (const c of cases) {
|
||||
expect(() => parseLRC(c, { strict: false })).not.toThrow();
|
||||
}
|
||||
})
|
||||
});
|
9
src/test/resources/test-02.lrc
Normal file
9
src/test/resources/test-02.lrc
Normal file
@ -0,0 +1,9 @@
|
||||
[ti: Somebody to Love]
|
||||
[ar: Jefferson Airplane]
|
||||
[al: Surrealistic Pillow]
|
||||
[lr: Lyricists of that song]
|
||||
[length: 2:58]
|
||||
|
||||
[00:00.00] <00:00.04> When <00:00.16> the <00:00.82> truth <00:01.29> is <00:01.63> found <00:03.09> to <00:03.37> be <00:05.92> lies
|
||||
[00:06.47] <00:07.67> And <00:07.94> all <00:08.36> the <00:08.63> joy <00:10.28> within <00:10.53> you <00:13.09> dies
|
||||
[00:13.34] <00:14.32> Don't <00:14.73> you <00:15.14> want <00:15.57> somebody <00:16.09> to <00:16.46> love
|
Loading…
Reference in New Issue
Block a user