feat: parsec based lyric parser

Signed-off-by: eternal-flame-AD <yume@yumechi.jp>

parent e4677f4117
commit 2341d13721
package.json
@@ -48,6 +48,7 @@
     "music-metadata-browser": "^2.5.10",
     "node-cache": "^5.1.2",
     "rollup-plugin-node-polyfills": "^0.2.1",
+    "typescript-parsec": "^0.3.4",
     "uuid": "^9.0.1"
   }
 }
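
The only dependency added is typescript-parsec, a typed parser-combinator library. As a quick orientation, here is a minimal sketch of its style (not part of this commit; the grammar and the names `lexer`/`mmss` are illustrative, and every import except `expectSingleResult` also appears in the new parser below):

```typescript
import { apply, buildLexer, expectEOF, expectSingleResult, seq, str, tok } from 'typescript-parsec';

// Tokenize into single digits and ':' (buildLexer takes [keep, regex, kind] rules).
const lexer = buildLexer([
  [true, /^\d/g, 'digit'],
  [true, /^:/g, ':']
]);

// 'm:ss' -> seconds: seq() runs parsers in order, apply() maps the result tuple.
const mmss = apply(
  seq(tok('digit'), str(':'), tok('digit'), tok('digit')),
  ([m, , s1, s2]) => Number(m.text) * 60 + Number(s1.text + s2.text)
);

// expectEOF() rejects parses with leftover tokens; expectSingleResult() unwraps.
expectSingleResult(expectEOF(mmss.parse(lexer.parse('1:05')))); // 65
```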

src/lib/lyrics/parser.ts
@@ -1,3 +1,20 @@
+import {
+  alt_sc,
+  apply,
+  buildLexer,
+  expectEOF,
+  fail,
+  kmid,
+  opt_sc,
+  rep,
+  rep_sc,
+  seq,
+  str,
+  tok,
+  type Parser,
+  type Token
+} from 'typescript-parsec';
+
 export interface ScriptItem {
   start: number;
   text: string;
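
A quick reference for the combinators imported above (paraphrased from typescript-parsec's documentation; not part of the commit):

```typescript
// alt_sc(a, b)      - ordered choice: return the first alternative that succeeds
// apply(p, f)       - run p, transform its result with f
// buildLexer        - build a tokenizer from [keep, regex, kind] rules
// expectEOF         - reject parse results that leave tokens unconsumed
// fail(msg)         - always fails; used below as a seed for reduce() chains
// kmid(l, m, r)     - parse all three, keep only the middle result
// opt_sc(p)         - optional p, yielding undefined when absent
// rep(p), rep_sc(p) - zero or more p; rep keeps backtracking candidates,
//                     rep_sc greedily keeps only the longest match
// seq(...ps)        - all parsers in order, yielding a tuple of results
// str(s), tok(k)    - match one token by exact text / by token kind
```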
@@ -28,22 +45,188 @@ interface IDTag {
   [key: string]: string;
 }
 
-export function splitLine(str: string) {
-  return str.split('\n').filter((str) => str.trim() !== '');
-}
-
-export function ExtractIDTags(lines: string[]) {
-  let result: IDTag = {};
-  const IDTagRegex = /^\[(\w*): (.*?)]$/;
-  let lastMatch = 0;
-  for (let i = 0; i < lines.length; i++) {
-    const line = lines[i];
-    const matchResult = line.trim().match(IDTagRegex);
-    if (matchResult && matchResult.length == 3) {
-      const tagName = matchResult[1];
-      const tagValue = matchResult[2];
-      result[tagName] = tagValue;
-    }
-  }
-  return result;
-}
+function convertTimeToMs({
+  mins,
+  secs,
+  decimals
+}: {
+  mins?: number | string;
+  secs?: number | string;
+  decimals?: string;
+}) {
+  let result = 0;
+  if (mins) {
+    result += Number(mins) * 60 * 1000;
+  }
+  if (secs) {
+    result += Number(secs) * 1000;
+  }
+  if (decimals) {
+    const denom = Math.pow(10, decimals.length);
+    result += Number(decimals) / (denom / 1000);
+  }
+  return result;
+}
+
+const digit = Array.from({ length: 10 }, (_, i) => apply(str(i.toString()), (_) => i)).reduce(
+  (acc, cur) => alt_sc(cur, acc),
+  fail('no alternatives')
+);
+const numStr = apply(rep_sc(digit), (r) => r.join(''));
+const num = apply(numStr, (r) => parseInt(r));
+const alpha = alt_sc(
+  Array.from({ length: 26 }, (_, i) =>
+    apply(str(String.fromCharCode('a'.charCodeAt(0) + i)), (_) => String.fromCharCode('a'.charCodeAt(0) + i))
+  ).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives')),
+  Array.from({ length: 26 }, (_, i) =>
+    apply(str(String.fromCharCode('A'.charCodeAt(0) + i)), (_) => String.fromCharCode('A'.charCodeAt(0) + i))
+  ).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives'))
+);
+
+const alphaStr = apply(rep(alpha), (r) => r.join(''));
+const spaces = rep_sc(str(' '));
+
+const unicodeStr = rep_sc(tok('char'));
+
+function trimmed<K, T>(p: Parser<K, Token<T>[]>): Parser<K, Token<T>[]> {
+  return apply(p, (r) => {
+    while (r.length > 0 && r[0].text.trim() === '') {
+      r.shift();
+    }
+    while (r.length > 0 && r[r.length - 1].text.trim() === '') {
+      r.pop();
+    }
+    return r;
+  });
+}
+
+function anythingTyped(types: string[]) {
+  return types.map((t) => tok(t)).reduce((acc, cur) => alt_sc(cur, acc), fail('no alternatives'));
+}
+
+function lrcTimestamp<K, T>(delim: [Parser<K, Token<T>>, Parser<K, Token<T>>]) {
+  const innerTS = alt_sc(
+    apply(seq(num, str(':'), num, str('.'), numStr), (r) =>
+      convertTimeToMs({ mins: r[0], secs: r[2], decimals: r[4] })
+    ),
+    apply(seq(num, str('.'), numStr), (r) => convertTimeToMs({ secs: r[0], decimals: r[2] })),
+    apply(seq(num, str(':'), num), (r) => convertTimeToMs({ mins: r[0], secs: r[2] })),
+    apply(num, (r) => convertTimeToMs({ secs: r }))
+  );
+  return kmid(delim[0], innerTS, delim[1]);
+}
+
+const squareTS = lrcTimestamp([tok('['), tok(']')]);
+const angleTS = lrcTimestamp([tok('<'), tok('>')]);
+
+const lrcTag = apply(
+  seq(
+    tok('['),
+    alphaStr,
+    str(':'),
+    tokenParserToText(trimmed(rep(anythingTyped(['char', '[', ']', '<', '>'])))),
+    tok(']')
+  ),
+  (r) => ({
+    [r[1]]: r[3]
+  })
+);
+
+function tokenParserToText<K, T>(p: Parser<K, Token<T>> | Parser<K, Token<T>[]>): Parser<K, string> {
+  return apply(p, (r: Token<T> | Token<T>[]) => {
+    if (Array.isArray(r)) {
+      return joinTokens(r);
+    }
+    return r.text;
+  });
+}
+
+function joinTokens<T>(tokens: Token<T>[]) {
+  return tokens.map((t) => t.text).join('');
+}
+
+function lrcLine(
+  wordDiv = ''
+): Parser<unknown, ['script_item', ScriptItem] | ['lrc_tag', IDTag] | ['comment', string] | ['empty', string]> {
+  return alt_sc(
+    apply(seq(squareTS, rep_sc(seq(opt_sc(angleTS), trimmed(rep_sc(anythingTyped(['char', '[', ']'])))))), (r) => {
+      const start = r[0];
+
+      const text = r[1]
+        .map((s) => joinTokens(s[1]))
+        .filter((s) => s.trim().length > 0)
+        .join(wordDiv);
+
+      const words = r[1]
+        .filter((s) => joinTokens(s[1]).trim().length > 0)
+        .map((s) => {
+          const wordBegin = s[0];
+          const word = s[1];
+          let ret: Partial<ScriptWordsItem> = { start: wordBegin };
+          if (word[0]) {
+            ret.beginIndex = word[0].pos.columnBegin - 1;
+          }
+          if (word[word.length - 1]) {
+            ret.endIndex = word[word.length - 1].pos.columnEnd;
+          }
+          return ret as ScriptWordsItem; // TODO: Complete this
+        });
+      return ['script_item', { start, text, words } as any as ScriptItem]; // TODO: Complete this
+    }),
+    apply(lrcTag, (r) => ['lrc_tag', r as IDTag]),
+    apply(seq(spaces, str('#'), rep_sc(unicodeStr)), (cmt) => ['comment', cmt[2].join('')] as const)
+  );
+}
+
+export function dumpToken<T>(t: Token<T> | undefined): string {
+  if (t === undefined) {
+    return '<EOF>';
+  }
+  return '`' + t.text + '` -> ' + dumpToken(t.next);
+}
+
+export function parseLRC(
+  input: string,
+  { wordDiv, strict }: { wordDiv?: string; strict?: boolean } = { wordDiv: ' ' }
+): LrcJsonData {
+  const tokenizer = buildLexer([
+    [true, /^\[/gu, '['],
+    [true, /^\]/gu, ']'],
+    [true, /^</gu, '<'],
+    [true, /^>/gu, '>'],
+    [true, /^./gu, 'char']
+  ]);
+
+  const lines = input
+    .split('\n')
+    .filter((line) => line.trim().length > 0)
+    .map((line) => tokenizer.parse(line));
+
+  return lines
+    .map((line) => {
+      const res = expectEOF(lrcLine(wordDiv).parse(line));
+      if (!res.successful) {
+        if (strict) {
+          throw new Error('Failed to parse full line: ' + dumpToken(line));
+        } else {
+          console.error('Failed to parse full line: ' + dumpToken(line));
+        }
+        return null;
+      }
+      return res.candidates[0].result;
+    })
+    .filter((r) => r !== null)
+    .reduce((acc, cur) => {
+      switch (cur[0]) {
+        case 'lrc_tag':
+          Object.assign(acc, cur[1]);
+          return acc;
+        case 'script_item':
+          acc.scripts = acc.scripts || [];
+          acc.scripts.push(cur[1]);
+          return acc;
+        default:
+          return acc;
+      }
+    }, {} as LrcJsonData);
+}
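
Taken together, the new entry point is consumed like this; a sketch mirroring the tests further down. The lyric line is reconstructed from test-01's expected values (the file itself is not shown in this diff), so treat the input as illustrative:

```typescript
import { parseLRC } from '$lib/lyrics/parser';

// '[00:49.588]' is back-derived from the test's expected start of 49000 + 588 ms.
const lrc = parseLRC('[ar: 洛天依]\n[00:49.588]因果与恩怨牵杂等谁来诊断', {
  wordDiv: '', // join word fragments without a separator (CJK lyrics)
  strict: true // throw instead of console.error on unparsable lines
});

lrc.ar;                 // '洛天依'  (ID tags are Object.assign-ed onto the result)
lrc.scripts?.[0].start; // 49588    (49 s -> 49000 ms, plus '588' scaled to 588 ms)
lrc.scripts?.[0].text;  // '因果与恩怨牵杂等谁来诊断'
```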

@@ -1,19 +1,58 @@
 import { describe, expect, it } from 'vitest';
 import fs from 'fs';
-import { ExtractIDTags, splitLine } from '$lib/lyrics/parser';
+import { parseLRC } from '$lib/lyrics/parser';
 
 describe('LRC parser test', () => {
   const test01Buffer = fs.readFileSync('./src/test/resources/test-01.lrc');
   const test01Text = test01Buffer.toString('utf-8');
-  it('Line Split', () => {
-    const lyrics = test01Text;
-    const lines = splitLine(lyrics);
-    expect(lines[26]).toBe('[01:52.991]');
-  });
-  it('IDTag Extract', () => {
-    const lyrics = test01Text;
-    const lines = splitLine(lyrics);
-    const idTags = ExtractIDTags(lines);
-    expect(idTags['ar']).toBe('洛天依');
+  const test02Buffer = fs.readFileSync('./src/test/resources/test-02.lrc');
+  const test02Text = test02Buffer.toString('utf-8');
+  it('Parses test-01.lrc', () => {
+    const result = parseLRC(test01Text, { wordDiv: '', strict: true });
+
+    expect(result.ar).toBe("洛天依");
+    expect(result.ti).toBe("中华少女·终");
+    expect(result.al).toBe("中华少女");
+    expect(result["tool"]).toBe("歌词滚动姬 https://lrc-maker.github.io");
+    expect(result.scripts!![1].text).toBe("因果与恩怨牵杂等谁来诊断");
+    expect(result.scripts!![1].start).toBe(49000 + 588);
+  })
+  it('Parses test-02.lrc', () => {
+    const result = parseLRC(test02Text, { wordDiv: ' ', strict: true });
+
+    expect(result.ti).toBe("Somebody to Love");
+    expect(result.ar).toBe("Jefferson Airplane");
+    expect(result.scripts!![0].text).toBe("When the truth is found to be lies");
+    expect(result.scripts!![0].start).toBe(0);
+    expect(result.scripts!![0].words!![1].beginIndex).toBe("[00:00.00] <00:00.04> When <00:00.16> the".indexOf("the"));
+    expect(result.scripts!![0].words!![1].start).toBe(160);
   });
+  it('Rejects some invalid LRCs', () => {
+    const cases = [
+      "[<00:00.00>] <00:00.04> When <00:00.16> the",
+      "[00:00.00] <00:00.04> <When> <00:00.16> the",
+      "[00:00.00> <00:00.04> When <00:00.16> the",
+      "<00:00.00> <00:00.04> When <00:00.16> the",
+      "<1:00:00.00> <00:00.04> When <00:00.16> the",
+    ]
+    for (const c of cases) {
+      expect(() => parseLRC(c, { strict: true })).toThrow();
+    }
+  })
+  it('Accepts some weird but parsable LRCs', () => {
+    const cases = [
+      "[ti: []]",
+      "[ar: [<]]",
+      "[ar: <ar>]",
+      "[ar: a b c]",
+      "[00:00.00] <00:00.04> When the <00:00.16> the",
+      "[00:00.00] [00:00.04] When [00:00.16] the",
+      "[00:00.0000000] <00:00.04> When <00:00.16> the",
+      "[00:00.00] <00:00.04> [When] <00:00.16> the",
+    ];
+
+    for (const c of cases) {
+      expect(() => parseLRC(c, { strict: false })).not.toThrow();
+    }
+  })
 });
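
A note on the timestamp arithmetic these tests rely on: convertTimeToMs scales the fractional field by its own digit count, which is why "[00:00.0000000]" lands in the "weird but parsable" bucket rather than the invalid one. The same formula, restated with an illustrative helper name (fracMs is not in the commit):

```typescript
// secs * 1000 + decimals / (10^len / 1000), as in the committed convertTimeToMs
const fracMs = (secs: number, decimals: string) =>
  secs * 1000 + Number(decimals) / (Math.pow(10, decimals.length) / 1000);

fracMs(0, '16');      // 160   -> '<00:00.16>' gives words[1].start above
fracMs(49, '588');    // 49588 -> matches the test-01 expectation of 49000 + 588
fracMs(0, '0000000'); // 0     -> extra digits change nothing
```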

src/test/resources/test-02.lrc (normal file, +9 lines)
@@ -0,0 +1,9 @@
+[ti: Somebody to Love]
+[ar: Jefferson Airplane]
+[al: Surrealistic Pillow]
+[lr: Lyricists of that song]
+[length: 2:58]
+
+[00:00.00] <00:00.04> When <00:00.16> the <00:00.82> truth <00:01.29> is <00:01.63> found <00:03.09> to <00:03.37> be <00:05.92> lies
+[00:06.47] <00:07.67> And <00:07.94> all <00:08.36> the <00:08.63> joy <00:10.28> within <00:10.53> you <00:13.09> dies
+[00:13.34] <00:14.32> Don't <00:14.73> you <00:15.14> want <00:15.57> somebody <00:16.09> to <00:16.46> love
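
test-02.lrc exercises the enhanced LRC word-timing extension: each line keeps its leading `[mm:ss.xx]` line timestamp, while the `<mm:ss.xx>` marks before each word are consumed by the `angleTS` branch of `lrcLine` to produce the per-word `start`/`beginIndex`/`endIndex` values checked in the tests above.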