From 5af2236109be7640b6eb4b86e479c77ca09093f6 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 15 Mar 2025 21:25:26 +0800 Subject: [PATCH 01/79] temp: remove the scheduleSnapshotTick job --- lib/db/snapshotSchedule.ts | 11 +++++++++++ lib/mq/init.ts | 6 +++--- src/worker.ts | 6 +++--- 3 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 lib/db/snapshotSchedule.ts diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts new file mode 100644 index 0000000..c719eb6 --- /dev/null +++ b/lib/db/snapshotSchedule.ts @@ -0,0 +1,11 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; + +export async function getUnsnapshotedSongs(client: Client) { + const queryResult = await client.queryObject<{ aid: bigint }>(` + SELECT DISTINCT s.aid + FROM songs s + LEFT JOIN video_snapshot v ON s.aid = v.aid + WHERE v.aid IS NULL; + `); + return queryResult.rows.map((row) => Number(row.aid)); +} diff --git a/lib/mq/init.ts b/lib/mq/init.ts index 95693ab..03a0aad 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -1,4 +1,4 @@ -import { MINUTE } from "$std/datetime/constants.ts"; +import { MINUTE, SECOND } from "$std/datetime/constants.ts"; import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "lib/mq/index.ts"; import logger from "lib/log/logger.ts"; @@ -15,8 +15,8 @@ export async function initMQ() { every: 3 * MINUTE, immediately: true, }); - await SnapshotQueue.upsertJobScheduler("scheduleSnapshotTick", { - every: 3 * MINUTE, + await SnapshotQueue.upsertJobScheduler("snapshotTick", { + every: 1 * SECOND, immediately: true, }); diff --git a/src/worker.ts b/src/worker.ts index c79e943..1b59785 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -56,15 +56,15 @@ const snapshotWorker = new Worker( "snapshot", async (job: Job) => { switch (job.name) { - case "scheduleSnapshotTick": - await snapshotTickWorker(job); - break; case "snapshotMilestoneVideo": await takeSnapshotForMilestoneVideoWorker(job); break; case "snapshotVideo": await takeSnapshotForVideoWorker(job); break; + case "snapshotTick": + await snapshotTickWorker(job); + break; default: break; } From 7104a95af93438eabf116dde42d57bb81f8cff8f Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 15 Mar 2025 21:27:19 +0800 Subject: [PATCH 02/79] ref: rename table all_data, bili_user to bilibili_metadata, bilibili_user --- lib/db/allData.ts | 14 +++++++------- lib/mq/task/collectSongs.ts | 6 +++--- lib/mq/task/getVideoDetails.ts | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/db/allData.ts b/lib/db/allData.ts index 8e30780..0c6a42d 100644 --- a/lib/db/allData.ts +++ b/lib/db/allData.ts @@ -3,19 +3,19 @@ import { AllDataType, BiliUserType } from "lib/db/schema.d.ts"; import { modelVersion } from "lib/ml/filter_inference.ts"; export async function videoExistsInAllData(client: Client, aid: number) { - return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM all_data WHERE aid = $1)`, [aid]) + return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM bilibili_metadata WHERE aid = $1)`, [aid]) .then((result) => result.rows[0].exists); } export async function userExistsInBiliUsers(client: Client, uid: number) { - return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM bili_user WHERE uid = $1)`, [ + return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM bilibili_user WHERE uid = $1)`, [ uid, ]); } export async function getUnlabelledVideos(client: Client) { const queryResult = 
await client.queryObject<{ aid: number }>( - `SELECT a.aid FROM all_data a LEFT JOIN labelling_result l ON a.aid = l.aid WHERE l.aid IS NULL`, + `SELECT a.aid FROM bilibili_metadata a LEFT JOIN labelling_result l ON a.aid = l.aid WHERE l.aid IS NULL`, ); return queryResult.rows.map((row) => row.aid); } @@ -29,14 +29,14 @@ export async function insertVideoLabel(client: Client, aid: number, label: numbe export async function getVideoInfoFromAllData(client: Client, aid: number) { const queryResult = await client.queryObject( - `SELECT * FROM all_data WHERE aid = $1`, + `SELECT * FROM bilibili_metadata WHERE aid = $1`, [aid], ); const row = queryResult.rows[0]; let authorInfo = ""; if (row.uid && await userExistsInBiliUsers(client, row.uid)) { const q = await client.queryObject( - `SELECT * FROM bili_user WHERE uid = $1`, + `SELECT * FROM bilibili_user WHERE uid = $1`, [row.uid], ); const userRow = q.rows[0]; @@ -56,8 +56,8 @@ export async function getUnArchivedBiliUsers(client: Client) { const queryResult = await client.queryObject<{ uid: number }>( ` SELECT ad.uid - FROM all_data ad - LEFT JOIN bili_user bu ON ad.uid = bu.uid + FROM bilibili_metadata ad + LEFT JOIN bilibili_user bu ON ad.uid = bu.uid WHERE bu.uid IS NULL; `, [], diff --git a/lib/mq/task/collectSongs.ts b/lib/mq/task/collectSongs.ts index 04e033d..7a7daad 100644 --- a/lib/mq/task/collectSongs.ts +++ b/lib/mq/task/collectSongs.ts @@ -18,9 +18,9 @@ export async function insertIntoSongs(client: Client, aid: number) { INSERT INTO songs (aid, bvid, published_at, duration) VALUES ( $1, - (SELECT bvid FROM all_data WHERE aid = $1), - (SELECT published_at FROM all_data WHERE aid = $1), - (SELECT duration FROM all_data WHERE aid = $1) + (SELECT bvid FROM bilibili_metadata WHERE aid = $1), + (SELECT published_at FROM bilibili_metadata WHERE aid = $1), + (SELECT duration FROM bilibili_metadata WHERE aid = $1) ) ON CONFLICT DO NOTHING `, diff --git a/lib/mq/task/getVideoDetails.ts b/lib/mq/task/getVideoDetails.ts index ead8dd0..1f4287b 100644 --- a/lib/mq/task/getVideoDetails.ts +++ b/lib/mq/task/getVideoDetails.ts @@ -24,19 +24,19 @@ export async function insertVideoInfo(client: Client, aid: number) { const published_at = formatTimestampToPsql(data.View.pubdate); const duration = data.View.duration; await client.queryObject( - `INSERT INTO all_data (aid, bvid, description, uid, tags, title, published_at, duration) + `INSERT INTO bilibili_metadata (aid, bvid, description, uid, tags, title, published_at, duration) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, [aid, bvid, desc, uid, tags, title, published_at, duration], ); const userExists = await userExistsInBiliUsers(client, aid); if (!userExists) { await client.queryObject( - `INSERT INTO bili_user (uid, username, "desc", fans) VALUES ($1, $2, $3, $4)`, + `INSERT INTO bilibili_user (uid, username, "desc", fans) VALUES ($1, $2, $3, $4)`, [uid, data.View.owner.name, data.Card.card.sign, data.Card.follower], ); } else { await client.queryObject( - `UPDATE bili_user SET fans = $1 WHERE uid = $2`, + `UPDATE bilibili_user SET fans = $1 WHERE uid = $2`, [data.Card.follower, uid], ); } From a6c8fd7f3f0b2f50ecab3f705af68e10cf2389b7 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 16 Mar 2025 01:23:10 +0800 Subject: [PATCH 03/79] ref: code structure related to AI --- lib/db/allData.ts | 4 +- lib/db/snapshot.ts | 3 +- lib/db/snapshotSchedule.ts | 19 ++++--- lib/ml/akari.ts | 106 +++++++++++++++++++++++++++++++++++ lib/ml/benchmark.ts | 9 ++- lib/ml/filter_inference.ts | 99 
-------------------------------- lib/ml/manager.ts | 37 ++++++++++++ lib/ml/quant_benchmark.ts | 8 ++- lib/mq/exec/classifyVideo.ts | 4 +- src/filterWorker.ts | 4 +- 10 files changed, 176 insertions(+), 117 deletions(-) create mode 100644 lib/ml/akari.ts delete mode 100644 lib/ml/filter_inference.ts create mode 100644 lib/ml/manager.ts diff --git a/lib/db/allData.ts b/lib/db/allData.ts index 8e30780..26840fb 100644 --- a/lib/db/allData.ts +++ b/lib/db/allData.ts @@ -1,6 +1,6 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { AllDataType, BiliUserType } from "lib/db/schema.d.ts"; -import { modelVersion } from "lib/ml/filter_inference.ts"; +import Akari from "lib/ml/akari.ts"; export async function videoExistsInAllData(client: Client, aid: number) { return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM all_data WHERE aid = $1)`, [aid]) @@ -23,7 +23,7 @@ export async function getUnlabelledVideos(client: Client) { export async function insertVideoLabel(client: Client, aid: number, label: number) { return await client.queryObject( `INSERT INTO labelling_result (aid, label, model_version) VALUES ($1, $2, $3) ON CONFLICT (aid, model_version) DO NOTHING`, - [aid, label, modelVersion], + [aid, label, Akari.getModelVersion()], ); } diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index 663a628..c3f515b 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -28,7 +28,8 @@ export async function getSongsNearMilestone(client: Client) { max_views_per_aid WHERE (max_views >= 90000 AND max_views < 100000) OR - (max_views >= 900000 AND max_views < 1000000) + (max_views >= 900000 AND max_views < 1000000) OR + (max_views >= 9900000 AND max_views < 10000000) ) -- 获取符合条件的完整行数据 SELECT diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index c719eb6..111ffa1 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -1,11 +1,12 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -export async function getUnsnapshotedSongs(client: Client) { - const queryResult = await client.queryObject<{ aid: bigint }>(` - SELECT DISTINCT s.aid - FROM songs s - LEFT JOIN video_snapshot v ON s.aid = v.aid - WHERE v.aid IS NULL; - `); - return queryResult.rows.map((row) => Number(row.aid)); -} +/* + Returns true if the specified `aid` has at least one record with "pending" or "processing" status. 
+*/ +export async function videoHasActiveSchedule(client: Client, aid: number) { + const res = await client.queryObject<{ status: string }>( + `SELECT status FROM snapshot_schedule WHERE aid = $1 AND (status = 'pending' OR status = 'processing')`, + [aid], + ); + return res.rows.length > 0; +} \ No newline at end of file diff --git a/lib/ml/akari.ts b/lib/ml/akari.ts new file mode 100644 index 0000000..386bb56 --- /dev/null +++ b/lib/ml/akari.ts @@ -0,0 +1,106 @@ +import { AIManager } from "lib/ml/manager.ts"; +import * as ort from "onnxruntime"; +import logger from "lib/log/logger.ts"; +import { WorkerError } from "lib/mq/schema.ts"; +import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; + +const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024"; +const onnxClassifierPath = "./model/video_classifier_v3_17.onnx"; +const onnxEmbeddingPath = "./model/model.onnx"; + +class AkariProto extends AIManager { + private tokenizer: PreTrainedTokenizer | null = null; + private readonly modelVersion = "3.17"; + + constructor() { + super(); + this.models = { + "classifier": onnxClassifierPath, + "embedding": onnxEmbeddingPath, + } + } + + public override async init(): Promise { + super.init(); + await this.initJinaTokenizer(); + } + + private tokenizerInitialized(): boolean { + return this.tokenizer !== null; + } + + private getTokenizer(): PreTrainedTokenizer { + if (!this.tokenizerInitialized()) { + throw new Error("Tokenizer is not initialized. Call init() first."); + } + return this.tokenizer!; + } + + private async initJinaTokenizer(): Promise { + if (this.tokenizerInitialized()) { + return; + } + try { + this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerModel); + logger.log("Tokenizer initialized", "ml"); + } catch (error) { + throw new WorkerError(error as Error, "ml", "fn:initTokenizer"); + } + } + + private async getJinaEmbeddings1024(texts: string[]): Promise { + const tokenizer = this.getTokenizer(); + const session = this.getModelSession("embedding"); + + const { input_ids } = await tokenizer(texts, { + add_special_tokens: false, + return_tensors: "js", + }); + + const cumsum = (arr: number[]): number[] => + arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []); + + const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string[]) => x.length))]; + const flattened_input_ids = input_ids.flat(); + + const inputs = { + input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [ + flattened_input_ids.length, + ]), + offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]), + }; + + const { embeddings } = await session.run(inputs); + return Array.from(embeddings.data as Float32Array); + } + + private async runClassification(embeddings: number[]): Promise { + const session = this.getModelSession("classifier"); + const inputTensor = new ort.Tensor( + Float32Array.from(embeddings), + [1, 3, 1024], + ); + + const { logits } = await session.run({ channel_features: inputTensor }); + return this.softmax(logits.data as Float32Array); + } + + public async classifyVideo(title: string, description: string, tags: string, aid: number): Promise { + const embeddings = await this.getJinaEmbeddings1024([ + title, + description, + tags, + ]); + const probabilities = await this.runClassification(embeddings); + logger.log(`Prediction result for aid: ${aid}: [${probabilities.map((p) => p.toFixed(5))}]`, "ml"); + return 
probabilities.indexOf(Math.max(...probabilities)); + } + + public getModelVersion(): string { + return this.modelVersion; + } +} + +const Akari = new AkariProto(); +export default Akari; + diff --git a/lib/ml/benchmark.ts b/lib/ml/benchmark.ts index 0cfc193..3911c31 100644 --- a/lib/ml/benchmark.ts +++ b/lib/ml/benchmark.ts @@ -1,6 +1,13 @@ import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; import * as ort from "onnxruntime"; -import { softmax } from "lib/ml/filter_inference.ts"; + + +function softmax(logits: Float32Array): number[] { + const maxLogit = Math.max(...logits); + const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); + const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0); + return Array.from(exponents.map((exp) => exp / sumOfExponents)); +} // 配置参数 const sentenceTransformerModelName = "alikia2x/jina-embedding-v3-m2v-1024"; diff --git a/lib/ml/filter_inference.ts b/lib/ml/filter_inference.ts deleted file mode 100644 index 019061f..0000000 --- a/lib/ml/filter_inference.ts +++ /dev/null @@ -1,99 +0,0 @@ -import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; -import * as ort from "onnxruntime"; -import logger from "lib/log/logger.ts"; -import { WorkerError } from "lib/mq/schema.ts"; - -const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024"; -const onnxClassifierPath = "./model/video_classifier_v3_17.onnx"; -const onnxEmbeddingOriginalPath = "./model/model.onnx"; -export const modelVersion = "3.17"; - -let sessionClassifier: ort.InferenceSession | null = null; -let sessionEmbedding: ort.InferenceSession | null = null; -let tokenizer: PreTrainedTokenizer | null = null; - -export async function initializeModels() { - if (tokenizer && sessionClassifier && sessionEmbedding) { - return; - } - - try { - tokenizer = await AutoTokenizer.from_pretrained(tokenizerModel); - - const [classifierSession, embeddingSession] = await Promise.all([ - ort.InferenceSession.create(onnxClassifierPath), - ort.InferenceSession.create(onnxEmbeddingOriginalPath), - ]); - - sessionClassifier = classifierSession; - sessionEmbedding = embeddingSession; - logger.log("Filter models initialized", "ml"); - } catch (error) { - throw new WorkerError(error as Error, "ml", "fn:initializeModels"); - } -} - -export function softmax(logits: Float32Array): number[] { - const maxLogit = Math.max(...logits); - const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); - const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0); - return Array.from(exponents.map((exp) => exp / sumOfExponents)); -} - -async function getONNXEmbeddings(texts: string[], session: ort.InferenceSession): Promise { - if (!tokenizer) { - throw new Error("Tokenizer is not initialized. 
Call initializeModels() first."); - } - const { input_ids } = await tokenizer(texts, { - add_special_tokens: false, - return_tensor: false, - }); - - const cumsum = (arr: number[]): number[] => - arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []); - - const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string) => x.length))]; - const flattened_input_ids = input_ids.flat(); - - const inputs = { - input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [ - flattened_input_ids.length, - ]), - offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]), - }; - - const { embeddings } = await session.run(inputs); - return Array.from(embeddings.data as Float32Array); -} - -async function runClassification(embeddings: number[]): Promise { - if (!sessionClassifier) { - throw new Error("Classifier session is not initialized. Call initializeModels() first."); - } - const inputTensor = new ort.Tensor( - Float32Array.from(embeddings), - [1, 3, 1024], - ); - - const { logits } = await sessionClassifier.run({ channel_features: inputTensor }); - return softmax(logits.data as Float32Array); -} - -export async function classifyVideo( - title: string, - description: string, - tags: string, - aid: number, -): Promise { - if (!sessionEmbedding) { - throw new Error("Embedding session is not initialized. Call initializeModels() first."); - } - const embeddings = await getONNXEmbeddings([ - title, - description, - tags, - ], sessionEmbedding); - const probabilities = await runClassification(embeddings); - logger.log(`Prediction result for aid: ${aid}: [${probabilities.map((p) => p.toFixed(5))}]`, "ml"); - return probabilities.indexOf(Math.max(...probabilities)); -} diff --git a/lib/ml/manager.ts b/lib/ml/manager.ts new file mode 100644 index 0000000..268985d --- /dev/null +++ b/lib/ml/manager.ts @@ -0,0 +1,37 @@ +import * as ort from "onnxruntime"; +import logger from "lib/log/logger.ts"; +import { WorkerError } from "lib/mq/schema.ts"; + +export class AIManager { + public sessions: { [key: string]: ort.InferenceSession } = {}; + public models: { [key: string]: string } = {}; + + constructor() { + } + + public async init() { + const modelKeys = Object.keys(this.models); + for (const key of modelKeys) { + try { + this.sessions[key] = await ort.InferenceSession.create(this.models[key]); + logger.log(`Model ${key} initialized`, "ml"); + } catch (error) { + throw new WorkerError(error as Error, "ml", "fn:init"); + } + } + } + + public getModelSession(key: string): ort.InferenceSession { + if (!this.sessions[key]) { + throw new WorkerError(new Error(`Model ${key} not found / not initialized.`), "ml", "fn:getModelSession"); + } + return this.sessions[key]; + } + + public softmax(logits: Float32Array): number[] { + const maxLogit = Math.max(...logits); + const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); + const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0); + return Array.from(exponents.map((exp) => exp / sumOfExponents)); + } +} diff --git a/lib/ml/quant_benchmark.ts b/lib/ml/quant_benchmark.ts index bcc5044..aab6308 100644 --- a/lib/ml/quant_benchmark.ts +++ b/lib/ml/quant_benchmark.ts @@ -1,6 +1,12 @@ import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; import * as ort from "onnxruntime"; -import { softmax } from "lib/ml/filter_inference.ts"; + +function softmax(logits: Float32Array): number[] { + const maxLogit = Math.max(...logits); 
+ const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); + const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0); + return Array.from(exponents.map((exp) => exp / sumOfExponents)); +} // 配置参数 const sentenceTransformerModelName = "alikia2x/jina-embedding-v3-m2v-1024"; diff --git a/lib/mq/exec/classifyVideo.ts b/lib/mq/exec/classifyVideo.ts index 3541892..6649931 100644 --- a/lib/mq/exec/classifyVideo.ts +++ b/lib/mq/exec/classifyVideo.ts @@ -1,7 +1,7 @@ import { Job } from "bullmq"; import { db } from "lib/db/init.ts"; import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel } from "lib/db/allData.ts"; -import { classifyVideo } from "lib/ml/filter_inference.ts"; +import Akari from "lib/ml/akari.ts"; import { ClassifyVideoQueue } from "lib/mq/index.ts"; import logger from "lib/log/logger.ts"; import { lockManager } from "lib/mq/lockManager.ts"; @@ -19,7 +19,7 @@ export const classifyVideoWorker = async (job: Job) => { const title = videoInfo.title?.trim() || "untitled"; const description = videoInfo.description?.trim() || "N/A"; const tags = videoInfo.tags?.trim() || "empty"; - const label = await classifyVideo(title, description, tags, aid); + const label = await Akari.classifyVideo(title, description, tags, aid); if (label == -1) { logger.warn(`Failed to classify video ${aid}`, "ml"); } diff --git a/src/filterWorker.ts b/src/filterWorker.ts index 8eb43d4..cb42048 100644 --- a/src/filterWorker.ts +++ b/src/filterWorker.ts @@ -4,7 +4,7 @@ import logger from "lib/log/logger.ts"; import { classifyVideosWorker, classifyVideoWorker } from "lib/mq/exec/classifyVideo.ts"; import { WorkerError } from "lib/mq/schema.ts"; import { lockManager } from "lib/mq/lockManager.ts"; -import { initializeModels } from "lib/ml/filter_inference.ts"; +import Akari from "lib/ml/akari.ts"; Deno.addSignalListener("SIGINT", async () => { logger.log("SIGINT Received: Shutting down workers...", "mq"); @@ -18,7 +18,7 @@ Deno.addSignalListener("SIGTERM", async () => { Deno.exit(); }); -await initializeModels(); +Akari.init(); const filterWorker = new Worker( "classifyVideo", From 0ff1c78dcc22e0782a65af85d6b9d74c53097c40 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 16 Mar 2025 14:00:49 +0800 Subject: [PATCH 04/79] fix: incorrect timestamp unit when inserting to database --- lib/mq/task/getVideoDetails.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mq/task/getVideoDetails.ts b/lib/mq/task/getVideoDetails.ts index 1f4287b..cff4890 100644 --- a/lib/mq/task/getVideoDetails.ts +++ b/lib/mq/task/getVideoDetails.ts @@ -21,7 +21,7 @@ export async function insertVideoInfo(client: Client, aid: number) { .filter((tag) => tag.tag_type in ["old_channel", "topic"]) .map((tag) => tag.tag_name).join(","); const title = data.View.title; - const published_at = formatTimestampToPsql(data.View.pubdate); + const published_at = formatTimestampToPsql(data.View.pubdate * 1000); const duration = data.View.duration; await client.queryObject( `INSERT INTO bilibili_metadata (aid, bvid, description, uid, tags, title, published_at, duration) From a9ac8de5472087a914cb32c2a09261b7dae060ae Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 16 Mar 2025 14:23:11 +0800 Subject: [PATCH 05/79] fix: unhandled timezone mismatch when inserting to database --- lib/mq/task/getVideoDetails.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/mq/task/getVideoDetails.ts b/lib/mq/task/getVideoDetails.ts index cff4890..51a1876 100644 --- a/lib/mq/task/getVideoDetails.ts +++ 
b/lib/mq/task/getVideoDetails.ts @@ -4,6 +4,7 @@ import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; import logger from "lib/log/logger.ts"; import { ClassifyVideoQueue } from "lib/mq/index.ts"; import { userExistsInBiliUsers, videoExistsInAllData } from "lib/db/allData.ts"; +import { HOUR, SECOND } from "$std/datetime/constants.ts"; export async function insertVideoInfo(client: Client, aid: number) { const videoExists = await videoExistsInAllData(client, aid); @@ -21,7 +22,7 @@ export async function insertVideoInfo(client: Client, aid: number) { .filter((tag) => tag.tag_type in ["old_channel", "topic"]) .map((tag) => tag.tag_name).join(","); const title = data.View.title; - const published_at = formatTimestampToPsql(data.View.pubdate * 1000); + const published_at = formatTimestampToPsql(data.View.pubdate * SECOND + 8 * HOUR); const duration = data.View.duration; await client.queryObject( `INSERT INTO bilibili_metadata (aid, bvid, description, uid, tags, title, published_at, duration) From b07d0c18f95e920314b27dd64e89ddfec7324658 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 17 Mar 2025 00:25:31 +0800 Subject: [PATCH 06/79] update: preparation for snapshotSchedule --- lib/db/snapshot.ts | 2 +- lib/db/snapshotSchedule.ts | 89 +++++++++++++ lib/ml/akari.ts | 151 ++++++++++----------- lib/ml/manager.ts | 2 +- lib/ml/mantis.ts | 25 ++++ lib/mq/exec/snapshotTick.ts | 220 ++----------------------------- lib/mq/init.ts | 4 + pred/inference.py | 16 +-- src/worker.ts | 8 +- test/db/snapshotSchedule.test.ts | 18 +++ test/ml/akari.json | 22 ++++ test/ml/akari.test.ts | 46 +++++++ 12 files changed, 305 insertions(+), 298 deletions(-) create mode 100644 lib/ml/mantis.ts create mode 100644 test/db/snapshotSchedule.test.ts create mode 100644 test/ml/akari.json create mode 100644 test/ml/akari.test.ts diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index c3f515b..81fe9a8 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -3,7 +3,7 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { VideoSnapshotType } from "lib/db/schema.d.ts"; import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; -export async function getSongsNearMilestone(client: Client) { +export async function getVideosNearMilestone(client: Client) { const queryResult = await client.queryObject(` WITH max_views_per_aid AS ( -- 找出每个 aid 的最大 views 值,并确保 aid 存在于 songs 表中 diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 111ffa1..3b77fce 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -1,3 +1,4 @@ +import { DAY, HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; /* @@ -9,4 +10,92 @@ export async function videoHasActiveSchedule(client: Client, aid: number) { [aid], ); return res.rows.length > 0; +} + +interface Snapshot { + created_at: Date; + views: number; +} + +export async function findClosestSnapshot( + client: Client, + aid: number, + targetTime: Date +): Promise { + const query = ` + SELECT created_at, views FROM video_snapshot + WHERE aid = $1 + ORDER BY ABS(EXTRACT(EPOCH FROM (created_at - $2::timestamptz))) ASC + LIMIT 1 + `; + const result = await client.queryObject<{ created_at: string; views: number }>( + query, + [aid, targetTime.toISOString()] + ); + if (result.rows.length === 0) return null; + const row = result.rows[0]; + return { + created_at: new Date(row.created_at), + views: row.views, + }; +} + +export 
async function getShortTermTimeFeaturesForVideo( + client: Client, + aid: number, + initialTimestampMiliseconds: number +): Promise { + const initialTime = new Date(initialTimestampMiliseconds); + const timeWindows = [ + [ 5 * MINUTE, 0 * MINUTE], + [ 15 * MINUTE, 0 * MINUTE], + [ 40 * MINUTE, 0 * MINUTE], + [ 1 * HOUR, 0 * HOUR], + [ 2 * HOUR, 1 * HOUR], + [ 3 * HOUR, 2 * HOUR], + [ 3 * HOUR, 0 * HOUR], + [ 6 * HOUR, 0 * HOUR], + [18 * HOUR, 12 * HOUR], + [ 1 * DAY, 0 * DAY], + [ 3 * DAY, 0 * DAY], + [ 7 * DAY, 0 * DAY] + ]; + + const results: number[] = []; + + for (const [windowStart, windowEnd] of timeWindows) { + const targetTimeStart = new Date(initialTime.getTime() - windowStart); + const targetTimeEnd = new Date(initialTime.getTime() - windowEnd); + + const startRecord = await findClosestSnapshot(client, aid, targetTimeStart); + const endRecord = await findClosestSnapshot(client, aid, targetTimeEnd); + + if (!startRecord || !endRecord) { + results.push(NaN); + continue; + } + + const timeDiffSeconds = + (endRecord.created_at.getTime() - startRecord.created_at.getTime()) / 1000; + const windowDuration = windowStart - windowEnd; + + let scale = 0; + if (windowDuration > 0) { + scale = timeDiffSeconds / windowDuration; + } + + const viewsDiff = endRecord.views - startRecord.views; + const adjustedViews = Math.max(viewsDiff, 1); + + let result: number; + if (scale > 0) { + result = Math.log2(adjustedViews / scale + 1); + } else { + result = Math.log2(adjustedViews + 1); + } + + results.push(result); + } + + return results; } \ No newline at end of file diff --git a/lib/ml/akari.ts b/lib/ml/akari.ts index 386bb56..d5ce9b2 100644 --- a/lib/ml/akari.ts +++ b/lib/ml/akari.ts @@ -5,102 +5,103 @@ import { WorkerError } from "lib/mq/schema.ts"; import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024"; -const onnxClassifierPath = "./model/video_classifier_v3_17.onnx"; -const onnxEmbeddingPath = "./model/model.onnx"; +const onnxClassifierPath = "./model/akari/3.17.onnx"; +const onnxEmbeddingPath = "./model/embedding/model.onnx"; class AkariProto extends AIManager { - private tokenizer: PreTrainedTokenizer | null = null; - private readonly modelVersion = "3.17"; + private tokenizer: PreTrainedTokenizer | null = null; + private readonly modelVersion = "3.17"; constructor() { super(); - this.models = { - "classifier": onnxClassifierPath, - "embedding": onnxEmbeddingPath, - } + this.models = { + "classifier": onnxClassifierPath, + "embedding": onnxEmbeddingPath, + }; } - public override async init(): Promise { - super.init(); - await this.initJinaTokenizer(); - } + public override async init(): Promise { + await super.init(); + await this.initJinaTokenizer(); + } - private tokenizerInitialized(): boolean { - return this.tokenizer !== null; - } + private tokenizerInitialized(): boolean { + return this.tokenizer !== null; + } - private getTokenizer(): PreTrainedTokenizer { - if (!this.tokenizerInitialized()) { - throw new Error("Tokenizer is not initialized. Call init() first."); - } - return this.tokenizer!; - } + private getTokenizer(): PreTrainedTokenizer { + if (!this.tokenizerInitialized()) { + throw new Error("Tokenizer is not initialized. 
Call init() first."); + } + return this.tokenizer!; + } - private async initJinaTokenizer(): Promise { - if (this.tokenizerInitialized()) { - return; - } - try { - this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerModel); - logger.log("Tokenizer initialized", "ml"); - } catch (error) { - throw new WorkerError(error as Error, "ml", "fn:initTokenizer"); - } - } + private async initJinaTokenizer(): Promise { + if (this.tokenizerInitialized()) { + return; + } + try { + this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerModel); + logger.log("Tokenizer initialized", "ml"); + } catch (error) { + throw new WorkerError(error as Error, "ml", "fn:initTokenizer"); + } + } - private async getJinaEmbeddings1024(texts: string[]): Promise { - const tokenizer = this.getTokenizer(); - const session = this.getModelSession("embedding"); + private async getJinaEmbeddings1024(texts: string[]): Promise { + const tokenizer = this.getTokenizer(); + const session = this.getModelSession("embedding"); - const { input_ids } = await tokenizer(texts, { - add_special_tokens: false, - return_tensors: "js", - }); + const { input_ids } = await tokenizer(texts, { + add_special_tokens: false, + return_tensor: false, + }); - const cumsum = (arr: number[]): number[] => - arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []); + const cumsum = (arr: number[]): number[] => + arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []); - const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string[]) => x.length))]; - const flattened_input_ids = input_ids.flat(); + const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string) => x.length))]; + const flattened_input_ids = input_ids.flat(); - const inputs = { - input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [ - flattened_input_ids.length, - ]), - offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]), - }; + const inputs = { + input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [ + flattened_input_ids.length, + ]), + offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]), + }; - const { embeddings } = await session.run(inputs); - return Array.from(embeddings.data as Float32Array); - } + const { embeddings } = await session.run(inputs); + return Array.from(embeddings.data as Float32Array); + } - private async runClassification(embeddings: number[]): Promise { - const session = this.getModelSession("classifier"); - const inputTensor = new ort.Tensor( - Float32Array.from(embeddings), - [1, 3, 1024], - ); + private async runClassification(embeddings: number[]): Promise { + const session = this.getModelSession("classifier"); + const inputTensor = new ort.Tensor( + Float32Array.from(embeddings), + [1, 3, 1024], + ); - const { logits } = await session.run({ channel_features: inputTensor }); - return this.softmax(logits.data as Float32Array); - } + const { logits } = await session.run({ channel_features: inputTensor }); + return this.softmax(logits.data as Float32Array); + } - public async classifyVideo(title: string, description: string, tags: string, aid: number): Promise { - const embeddings = await this.getJinaEmbeddings1024([ - title, - description, - tags, - ]); - const probabilities = await this.runClassification(embeddings); - logger.log(`Prediction result for aid: ${aid}: [${probabilities.map((p) => 
p.toFixed(5))}]`, "ml"); - return probabilities.indexOf(Math.max(...probabilities)); - } + public async classifyVideo(title: string, description: string, tags: string, aid?: number): Promise { + const embeddings = await this.getJinaEmbeddings1024([ + title, + description, + tags, + ]); + const probabilities = await this.runClassification(embeddings); + if (aid) { + logger.log(`Prediction result for aid: ${aid}: [${probabilities.map((p) => p.toFixed(5))}]`, "ml"); + } + return probabilities.indexOf(Math.max(...probabilities)); + } - public getModelVersion(): string { - return this.modelVersion; - } + public getModelVersion(): string { + return this.modelVersion; + } } const Akari = new AkariProto(); export default Akari; - diff --git a/lib/ml/manager.ts b/lib/ml/manager.ts index 268985d..8f15513 100644 --- a/lib/ml/manager.ts +++ b/lib/ml/manager.ts @@ -22,7 +22,7 @@ export class AIManager { } public getModelSession(key: string): ort.InferenceSession { - if (!this.sessions[key]) { + if (this.sessions[key] === undefined) { throw new WorkerError(new Error(`Model ${key} not found / not initialized.`), "ml", "fn:getModelSession"); } return this.sessions[key]; diff --git a/lib/ml/mantis.ts b/lib/ml/mantis.ts new file mode 100644 index 0000000..59bc09a --- /dev/null +++ b/lib/ml/mantis.ts @@ -0,0 +1,25 @@ +import { AIManager } from "lib/ml/manager.ts"; +import * as ort from "onnxruntime"; +import logger from "lib/log/logger.ts"; +import { WorkerError } from "lib/mq/schema.ts"; + +const modelPath = "./model/model.onnx"; + +class MantisProto extends AIManager { + + constructor() { + super(); + this.models = { + "predictor": modelPath, + } + } + + public override async init(): Promise { + await super.init(); + } + + +} + +const Mantis = new MantisProto(); +export default Mantis; diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 12443ff..65564d0 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -1,229 +1,31 @@ import { Job } from "bullmq"; -import { MINUTE, SECOND } from "$std/datetime/constants.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { db } from "lib/db/init.ts"; -import { - getShortTermEtaPrediction, - getSongsNearMilestone, - getUnsnapshotedSongs, - songEligibleForMilestoneSnapshot, -} from "lib/db/snapshot.ts"; -import { SnapshotQueue } from "lib/mq/index.ts"; -import { insertVideoStats } from "lib/mq/task/getVideoStats.ts"; -import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; -import { redis } from "lib/db/redis.ts"; -import { NetSchedulerError } from "lib/mq/scheduler.ts"; -import logger from "lib/log/logger.ts"; -import { formatSeconds } from "lib/utils/formatSeconds.ts"; -import { truncate } from "lib/utils/truncate.ts"; - -async function snapshotScheduled(aid: number) { - try { - return await redis.exists(`cvsa:snapshot:${aid}`); - } catch { - logger.error(`Failed to check scheduled status for ${aid}`, "mq"); - return false; - } -} - -async function setSnapshotScheduled(aid: number, value: boolean, exp: number) { - try { - if (value) { - await redis.set(`cvsa:snapshot:${aid}`, 1, "EX", exp); - } else { - await redis.del(`cvsa:snapshot:${aid}`); - } - } catch { - logger.error(`Failed to set scheduled status to ${value} for ${aid}`, "mq"); - } -} - -interface SongNearMilestone { - aid: number; - id: number; - created_at: string; - views: number; - coins: number; - likes: number; - favorites: number; - shares: number; - danmakus: number; - replies: number; -} - -async 
function processMilestoneSnapshots(client: Client, vidoesNearMilestone: SongNearMilestone[]) { - let i = 0; - for (const snapshot of vidoesNearMilestone) { - if (await snapshotScheduled(snapshot.aid)) { - logger.silly( - `Video ${snapshot.aid} is already scheduled for snapshot`, - "mq", - "fn:processMilestoneSnapshots", - ); - continue; - } - if (await songEligibleForMilestoneSnapshot(client, snapshot.aid) === false) { - logger.silly( - `Video ${snapshot.aid} is not eligible for milestone snapshot`, - "mq", - "fn:processMilestoneSnapshots", - ); - continue; - } - const factor = Math.floor(i / 8); - const delayTime = factor * SECOND * 2; - await SnapshotQueue.add("snapshotMilestoneVideo", { - aid: snapshot.aid, - currentViews: snapshot.views, - snapshotedAt: snapshot.created_at, - }, { delay: delayTime, priority: 1 }); - await setSnapshotScheduled(snapshot.aid, true, 20 * 60); - i++; - } -} - -async function processUnsnapshotedVideos(unsnapshotedVideos: number[]) { - let i = 0; - for (const aid of unsnapshotedVideos) { - if (await snapshotScheduled(aid)) { - logger.silly(`Video ${aid} is already scheduled for snapshot`, "mq", "fn:processUnsnapshotedVideos"); - continue; - } - const factor = Math.floor(i / 5); - const delayTime = factor * SECOND * 4; - await SnapshotQueue.add("snapshotVideo", { - aid, - }, { delay: delayTime, priority: 3 }); - await setSnapshotScheduled(aid, true, 6 * 60 * 60); - i++; - } -} +import { getVideosNearMilestone } from "lib/db/snapshot.ts"; +import { videoHasActiveSchedule } from "lib/db/snapshotSchedule.ts"; export const snapshotTickWorker = async (_job: Job) => { const client = await db.connect(); try { - const vidoesNearMilestone = await getSongsNearMilestone(client); - await processMilestoneSnapshots(client, vidoesNearMilestone); - - const unsnapshotedVideos = await getUnsnapshotedSongs(client); - await processUnsnapshotedVideos(unsnapshotedVideos); + // TODO: implement } finally { client.release(); } }; -export const takeSnapshotForMilestoneVideoWorker = async (job: Job) => { +export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); - await setSnapshotScheduled(job.data.aid, true, 20 * 60); try { - const aid: number = job.data.aid; - const currentViews: number = job.data.currentViews; - const lastSnapshoted: string = job.data.snapshotedAt; - const stat = await insertVideoStats(client, aid, "snapshotMilestoneVideo"); - if (typeof stat === "number") { - if (stat === -404 || stat === 62002 || stat == 62012) { - await setSnapshotScheduled(aid, true, 6 * 60 * 60); - } else { - await setSnapshotScheduled(aid, false, 0); - } - return; + const videos = await getVideosNearMilestone(client); + for (const video of videos) { + if (await videoHasActiveSchedule(client, video.aid)) continue; } - const nextMilestone = currentViews >= 100000 ? 
1000000 : 100000; - if (stat.views >= nextMilestone) { - await setSnapshotScheduled(aid, false, 0); - return; - } - let eta = await getShortTermEtaPrediction(client, aid); - if (eta === null) { - const DELTA = 0.001; - const intervalSeconds = (Date.now() - parseTimestampFromPsql(lastSnapshoted)) / SECOND; - const viewsIncrement = stat.views - currentViews; - const incrementSpeed = viewsIncrement / (intervalSeconds + DELTA); - const viewsToIncrease = nextMilestone - stat.views; - eta = viewsToIncrease / (incrementSpeed + DELTA); - } - const scheduledNextSnapshotDelay = eta * SECOND / 3; - const maxInterval = 20 * MINUTE; - const minInterval = 1 * SECOND; - const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); - await SnapshotQueue.add("snapshotMilestoneVideo", { - aid, - currentViews: stat.views, - snapshotedAt: stat.time, - }, { delay, priority: 1 }); - await job.updateData({ - ...job.data, - updatedViews: stat.views, - updatedTime: new Date(stat.time).toISOString(), - etaInMins: eta / 60, - }); - logger.log( - `Scheduled next milestone snapshot for ${aid} in ${ - formatSeconds(delay / 1000) - }, current views: ${stat.views}`, - "mq", - ); - } catch (e) { - if (e instanceof NetSchedulerError && e.code === "NO_AVAILABLE_PROXY") { - logger.warn( - `No available proxy for aid ${job.data.aid}.`, - "mq", - "fn:takeSnapshotForMilestoneVideoWorker", - ); - await SnapshotQueue.add("snapshotMilestoneVideo", { - aid: job.data.aid, - currentViews: job.data.currentViews, - snapshotedAt: job.data.snapshotedAt, - }, { delay: 5 * SECOND, priority: 1 }); - return; - } - throw e; + } catch (_e) { + // } finally { client.release(); } }; -export const takeSnapshotForVideoWorker = async (job: Job) => { - const client = await db.connect(); - await setSnapshotScheduled(job.data.aid, true, 6 * 60 * 60); - try { - const { aid } = job.data; - const stat = await insertVideoStats(client, aid, "getVideoInfo"); - if (typeof stat === "number") { - if (stat === -404 || stat === 62002 || stat == 62012) { - await setSnapshotScheduled(aid, true, 6 * 60 * 60); - } else { - await setSnapshotScheduled(aid, false, 0); - } - return; - } - logger.log(`Taken snapshot for ${aid}`, "mq"); - if (stat == null) { - setSnapshotScheduled(aid, false, 0); - return; - } - await job.updateData({ - ...job.data, - updatedViews: stat.views, - updatedTime: new Date(stat.time).toISOString(), - }); - const nearMilestone = (stat.views >= 90000 && stat.views < 100000) || - (stat.views >= 900000 && stat.views < 1000000); - if (nearMilestone) { - await SnapshotQueue.add("snapshotMilestoneVideo", { - aid, - currentViews: stat.views, - snapshotedAt: stat.time, - }, { delay: 0, priority: 1 }); - } - await setSnapshotScheduled(aid, false, 0); - } catch (e) { - if (e instanceof NetSchedulerError && e.code === "NO_AVAILABLE_PROXY") { - await setSnapshotScheduled(job.data.aid, false, 0); - return; - } - throw e; - } finally { - client.release(); - } +export const takeSnapshotForVideoWorker = async (_job: Job) => { + // TODO: implement }; diff --git a/lib/mq/init.ts b/lib/mq/init.ts index 03a0aad..688dd4a 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -19,6 +19,10 @@ export async function initMQ() { every: 1 * SECOND, immediately: true, }); + await SnapshotQueue.upsertJobScheduler("collectMilestoneSnapshots", { + every: 5 * MINUTE, + immediately: true, + }); logger.log("Message queue initialized."); } diff --git a/pred/inference.py b/pred/inference.py index 9a3d678..cadb90f 100644 --- a/pred/inference.py +++ b/pred/inference.py 
@@ -4,20 +4,20 @@ from model import CompactPredictor import torch def main(): - model = CompactPredictor(16).to('cpu', dtype=torch.float32) - model.load_state_dict(torch.load('./pred/checkpoints/model_20250315_0530.pt')) + model = CompactPredictor(10).to('cpu', dtype=torch.float32) + model.load_state_dict(torch.load('./pred/checkpoints/long_term.pt')) model.eval() # inference - initial = 999269 + initial = 997029 last = initial - start_time = '2025-03-15 01:03:21' - for i in range(1, 48): + start_time = '2025-03-17 00:13:17' + for i in range(1, 120): hour = i / 0.5 sec = hour * 3600 time_d = np.log2(sec) data = [time_d, np.log2(initial+1), # time_delta, current_views - 2.801318, 3.455128, 3.903391, 3.995577, 4.641488, 5.75131, 6.723868, 6.105322, 8.141023, 9.576701, 10.665067, # grows_feat - 0.043993, 0.72057, 28.000902 # time_feat + 6.111542, 8.404707, 10.071566, 11.55888, 12.457823,# grows_feat + 0.009225, 0.001318, 28.001814# time_feat ] np_arr = np.array([data]) tensor = torch.from_numpy(np_arr).to('cpu', dtype=torch.float32) @@ -25,7 +25,7 @@ def main(): num = output.detach().numpy()[0][0] views_pred = int(np.exp2(num)) + initial current_time = datetime.datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') + datetime.timedelta(hours=hour) - print(current_time.strftime('%m-%d %H:%M'), views_pred, views_pred - last) + print(current_time.strftime('%m-%d %H:%M:%S'), views_pred, views_pred - last) last = views_pred if __name__ == '__main__': diff --git a/src/worker.ts b/src/worker.ts index 1b59785..9523a42 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -5,7 +5,7 @@ import logger from "lib/log/logger.ts"; import { lockManager } from "lib/mq/lockManager.ts"; import { WorkerError } from "lib/mq/schema.ts"; import { getVideoInfoWorker } from "lib/mq/exec/getLatestVideos.ts"; -import { snapshotTickWorker, takeSnapshotForMilestoneVideoWorker, takeSnapshotForVideoWorker } from "lib/mq/exec/snapshotTick.ts"; +import { snapshotTickWorker, collectMilestoneSnapshotsWorker, takeSnapshotForVideoWorker } from "lib/mq/exec/snapshotTick.ts"; Deno.addSignalListener("SIGINT", async () => { logger.log("SIGINT Received: Shutting down workers...", "mq"); @@ -56,15 +56,15 @@ const snapshotWorker = new Worker( "snapshot", async (job: Job) => { switch (job.name) { - case "snapshotMilestoneVideo": - await takeSnapshotForMilestoneVideoWorker(job); - break; case "snapshotVideo": await takeSnapshotForVideoWorker(job); break; case "snapshotTick": await snapshotTickWorker(job); break; + case "collectMilestoneSnapshots": + await collectMilestoneSnapshotsWorker(job); + break; default: break; } diff --git a/test/db/snapshotSchedule.test.ts b/test/db/snapshotSchedule.test.ts new file mode 100644 index 0000000..a5e1d6a --- /dev/null +++ b/test/db/snapshotSchedule.test.ts @@ -0,0 +1,18 @@ +import { assertEquals, assertInstanceOf, assertNotEquals } from "@std/assert"; +import { findClosestSnapshot } from "lib/db/snapshotSchedule.ts"; +import { postgresConfig } from "lib/db/pgConfig.ts"; +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; + +Deno.test("Snapshot Schedule - getShortTermTimeFeaturesForVideo", async () => { + const client = new Client(postgresConfig); + try { + const result = await findClosestSnapshot(client, 247308539, new Date(1741983383000)); + assertNotEquals(result, null); + const created_at = result!.created_at; + const views = result!.views; + assertInstanceOf(created_at, Date); + assertEquals(typeof views, "number"); + } finally { + client.end(); + } +}); diff --git a/test/ml/akari.json 
b/test/ml/akari.json new file mode 100644 index 0000000..7345078 --- /dev/null +++ b/test/ml/akari.json @@ -0,0 +1,22 @@ +{ + "test1": [ + { + "title": "【洛天依】《一花依世界》(2024重调版)|“抬头仰望,夜空多安详”【原创PV付】", + "desc": "本家:BV1Vs411H7JH\n作曲:LS\n作词:杏花包子\n调教:鬼面P\n混音:虎皮猫P\n演唱:洛天依\n曲绘:山下鸭鸭窝\n映像:阿妍\n——————————————————————\n本稿为同人二创,非本家重制", + "tags": "发现《一花依世界》, Vsinger创作激励计划, 洛天依, VOCALOID CHINA, 翻唱, 原创PV付, ACE虚拟歌姬, 中文VOCALOID, 国风电子, 一花依世界, ACE Studio, Vsinger创作激励计划2024冬季物语", + "label": 2 + }, + { + "title": "【鏡音レン】アカシア【VOCALOID Cover】", + "desc": "鏡音リン・レン 13th Anniversary\n\nMusic:BUMP OF CHICKEN https://youtu.be/BoZ0Zwab6Oc\nust:Maplestyle sm37853236\nOff Vocal: https://youtu.be/YMzrUzq1uX0\nSinger:鏡音レン\n\n氷雨ハルカ\nYoutube :https://t.co/8zuv6g7Acm\nniconico:https://t.co/C6DRfdYAp0\ntwitter :https://twitter.com/hisame_haruka\n\n転載禁止\nPlease do not reprint without my permission.", + "tags": "鏡音レン", + "label": 0 + }, + { + "title": "【洛天依原创曲】谪星【姆斯塔之谕】", + "desc": "谪星\n\n策划/世界观:听雨\n作词:听雨\n作曲/编曲:太白\n混音:虎皮猫\n人设:以木\n曲绘:Ar极光\n调校:哈士奇p\n视频:苏卿白", + "tags": "2025虚拟歌手贺岁纪, 洛天依, 原创歌曲, VOCALOID, 虚拟歌手, 原创音乐, 姆斯塔, 中文VOCALOID", + "label": 1 + } + ] +} diff --git a/test/ml/akari.test.ts b/test/ml/akari.test.ts new file mode 100644 index 0000000..958f34d --- /dev/null +++ b/test/ml/akari.test.ts @@ -0,0 +1,46 @@ +import Akari from "lib/ml/akari.ts"; +import { assertEquals, assertGreaterOrEqual } from "jsr:@std/assert"; +import { join } from "$std/path/join.ts"; +import { SECOND } from "$std/datetime/constants.ts"; + +Deno.test("Akari AI - normal cases accuracy test", async () => { + const path = import.meta.dirname!; + const dataPath = join(path, "akari.json"); + const rawData = await Deno.readTextFile(dataPath); + const data = JSON.parse(rawData); + await Akari.init(); + for (const testCase of data.test1) { + const result = await Akari.classifyVideo( + testCase.title, + testCase.desc, + testCase.tags + ); + assertEquals(result, testCase.label); + } +}); + +Deno.test("Akari AI - performance test", async () => { + const path = import.meta.dirname!; + const dataPath = join(path, "akari.json"); + const rawData = await Deno.readTextFile(dataPath); + const data = JSON.parse(rawData); + await Akari.init(); + const N = 200; + const testCase = data.test1[0]; + const title = testCase.title; + const desc = testCase.desc; + const tags = testCase.tags; + const time = performance.now(); + for (let i = 0; i < N; i++){ + await Akari.classifyVideo( + title, + desc, + tags + ); + } + const end = performance.now(); + const elapsed = (end - time) / SECOND; + const throughput = N / elapsed; + assertGreaterOrEqual(throughput, 100); + console.log(`Akari AI throughput: ${throughput.toFixed(1)} samples / sec`) +}); \ No newline at end of file From cd8aa826e125ac54efee09b8d76471f6d3d05144 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 17 Mar 2025 00:33:28 +0800 Subject: [PATCH 07/79] fix: prevent videos from being crawled for too long --- lib/db/snapshot.ts | 19 ++++++++++++++++++- lib/mq/exec/snapshotTick.ts | 20 +++++++------------- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index 663a628..5921059 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -71,7 +71,7 @@ export async function getSongSnapshotCount(client: Client, aid: number) { } export async function getShortTermEtaPrediction(client: Client, aid: number) { - const queryResult = await client.queryObject<{eta: number}>( + const queryResult = await client.queryObject<{ eta: number }>( ` WITH old_snapshot AS ( SELECT 
created_at, views @@ -120,6 +120,23 @@ export async function getShortTermEtaPrediction(client: Client, aid: number) { return queryResult.rows[0].eta; } +export async function getIntervalFromLastSnapshotToNow(client: Client, aid: number) { + const queryResult = await client.queryObject<{ interval: number }>( + ` + SELECT EXTRACT(EPOCH FROM (NOW() - created_at)) AS interval + FROM video_snapshot + WHERE aid = $1 + ORDER BY created_at DESC + LIMIT 1; + `, + [aid], + ); + if (queryResult.rows.length === 0) { + return null; + } + return queryResult.rows[0].interval; +} + export async function songEligibleForMilestoneSnapshot(client: Client, aid: number) { const count = await getSongSnapshotCount(client, aid); if (count < 2) { diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 12443ff..bbc7205 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -1,8 +1,9 @@ import { Job } from "bullmq"; -import { MINUTE, SECOND } from "$std/datetime/constants.ts"; +import { HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { db } from "lib/db/init.ts"; import { +getIntervalFromLastSnapshotToNow, getShortTermEtaPrediction, getSongsNearMilestone, getUnsnapshotedSongs, @@ -55,19 +56,12 @@ async function processMilestoneSnapshots(client: Client, vidoesNearMilestone: So let i = 0; for (const snapshot of vidoesNearMilestone) { if (await snapshotScheduled(snapshot.aid)) { - logger.silly( - `Video ${snapshot.aid} is already scheduled for snapshot`, - "mq", - "fn:processMilestoneSnapshots", - ); continue; } - if (await songEligibleForMilestoneSnapshot(client, snapshot.aid) === false) { - logger.silly( - `Video ${snapshot.aid} is not eligible for milestone snapshot`, - "mq", - "fn:processMilestoneSnapshots", - ); + const timeFromLastSnapshot = await getIntervalFromLastSnapshotToNow(client, snapshot.aid); + const lastSnapshotLessThan8Hrs = timeFromLastSnapshot && timeFromLastSnapshot * SECOND < 8 * HOUR; + const notEligible = await songEligibleForMilestoneSnapshot(client, snapshot.aid); + if (notEligible && lastSnapshotLessThan8Hrs) { continue; } const factor = Math.floor(i / 8); @@ -143,7 +137,7 @@ export const takeSnapshotForMilestoneVideoWorker = async (job: Job) => { eta = viewsToIncrease / (incrementSpeed + DELTA); } const scheduledNextSnapshotDelay = eta * SECOND / 3; - const maxInterval = 20 * MINUTE; + const maxInterval = 60 * MINUTE; const minInterval = 1 * SECOND; const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); await SnapshotQueue.add("snapshotMilestoneVideo", { From 2e8ed7ce7014f16bbaac49f20ed43d9320096eef Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 20 Mar 2025 01:57:33 +0800 Subject: [PATCH 08/79] add: ETA estimation for short-term snapshot --- lib/db/snapshotSchedule.ts | 116 +++++++++++------------------------- lib/mq/exec/snapshotTick.ts | 44 +++++++++++++- 2 files changed, 77 insertions(+), 83 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 3b77fce..583d06a 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -1,101 +1,53 @@ -import { DAY, HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -/* +/* Returns true if the specified `aid` has at least one record with "pending" or "processing" status. 
*/ export async function videoHasActiveSchedule(client: Client, aid: number) { - const res = await client.queryObject<{ status: string }>( - `SELECT status FROM snapshot_schedule WHERE aid = $1 AND (status = 'pending' OR status = 'processing')`, - [aid], - ); - return res.rows.length > 0; + const res = await client.queryObject<{ status: string }>( + `SELECT status FROM snapshot_schedule WHERE aid = $1 AND (status = 'pending' OR status = 'processing')`, + [aid], + ); + return res.rows.length > 0; } interface Snapshot { - created_at: Date; - views: number; + created_at: number; + views: number; } export async function findClosestSnapshot( - client: Client, - aid: number, - targetTime: Date + client: Client, + aid: number, + targetTime: Date, ): Promise { - const query = ` + const query = ` SELECT created_at, views FROM video_snapshot WHERE aid = $1 ORDER BY ABS(EXTRACT(EPOCH FROM (created_at - $2::timestamptz))) ASC LIMIT 1 `; - const result = await client.queryObject<{ created_at: string; views: number }>( - query, - [aid, targetTime.toISOString()] - ); - if (result.rows.length === 0) return null; - const row = result.rows[0]; - return { - created_at: new Date(row.created_at), - views: row.views, - }; + const result = await client.queryObject<{ created_at: string; views: number }>( + query, + [aid, targetTime.toISOString()], + ); + if (result.rows.length === 0) return null; + const row = result.rows[0]; + return { + created_at: new Date(row.created_at).getTime(), + views: row.views, + }; } -export async function getShortTermTimeFeaturesForVideo( - client: Client, - aid: number, - initialTimestampMiliseconds: number -): Promise { - const initialTime = new Date(initialTimestampMiliseconds); - const timeWindows = [ - [ 5 * MINUTE, 0 * MINUTE], - [ 15 * MINUTE, 0 * MINUTE], - [ 40 * MINUTE, 0 * MINUTE], - [ 1 * HOUR, 0 * HOUR], - [ 2 * HOUR, 1 * HOUR], - [ 3 * HOUR, 2 * HOUR], - [ 3 * HOUR, 0 * HOUR], - [ 6 * HOUR, 0 * HOUR], - [18 * HOUR, 12 * HOUR], - [ 1 * DAY, 0 * DAY], - [ 3 * DAY, 0 * DAY], - [ 7 * DAY, 0 * DAY] - ]; - - const results: number[] = []; - - for (const [windowStart, windowEnd] of timeWindows) { - const targetTimeStart = new Date(initialTime.getTime() - windowStart); - const targetTimeEnd = new Date(initialTime.getTime() - windowEnd); - - const startRecord = await findClosestSnapshot(client, aid, targetTimeStart); - const endRecord = await findClosestSnapshot(client, aid, targetTimeEnd); - - if (!startRecord || !endRecord) { - results.push(NaN); - continue; - } - - const timeDiffSeconds = - (endRecord.created_at.getTime() - startRecord.created_at.getTime()) / 1000; - const windowDuration = windowStart - windowEnd; - - let scale = 0; - if (windowDuration > 0) { - scale = timeDiffSeconds / windowDuration; - } - - const viewsDiff = endRecord.views - startRecord.views; - const adjustedViews = Math.max(viewsDiff, 1); - - let result: number; - if (scale > 0) { - result = Math.log2(adjustedViews / scale + 1); - } else { - result = Math.log2(adjustedViews + 1); - } - - results.push(result); - } - - return results; -} \ No newline at end of file +export async function getLatestSnapshot(client: Client, aid: number): Promise{ + const res = await client.queryObject<{ created_at: string; views: number }>( + `SELECT created_at, views FROM video_snapshot WHERE aid = $1 ORDER BY created_at DESC LIMIT 1`, + [aid], + ); + if (res.rows.length === 0) return null; + const row = res.rows[0]; + return { + created_at: new Date(row.created_at).getTime(), + views: row.views, + } +} diff --git 
a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 65564d0..9fcc604 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -1,7 +1,9 @@ import { Job } from "bullmq"; import { db } from "lib/db/init.ts"; import { getVideosNearMilestone } from "lib/db/snapshot.ts"; -import { videoHasActiveSchedule } from "lib/db/snapshotSchedule.ts"; +import { findClosestSnapshot, getLatestSnapshot, videoHasActiveSchedule } from "lib/db/snapshotSchedule.ts"; +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { HOUR, MINUTE } from "$std/datetime/constants.ts"; export const snapshotTickWorker = async (_job: Job) => { const client = await db.connect(); @@ -12,12 +14,52 @@ export const snapshotTickWorker = async (_job: Job) => { } }; +export const closetMilestone = (views: number) => { + if (views < 100000) return 100000; + if (views < 1000000) return 1000000; + return 10000000; +}; + +const log = (value: number, base: number = 10) => Math.log(value) / Math.log(base); + +const getAdjustedShortTermETA = async (client: Client, aid: number) => { + const latestSnapshot = await getLatestSnapshot(client, aid); + // Immediately dispatch a snapshot if there is no snapshot yet + if (!latestSnapshot) return 0; + + const currentTimestamp = Date.now(); + const timeIntervals = [20 * MINUTE, 1 * HOUR, 3 * HOUR, 6 * HOUR]; + const DELTA = 0.00001; + let minETAHours = Infinity; + + for (const timeInterval of timeIntervals) { + const date = new Date(currentTimestamp - timeInterval); + const snapshot = await findClosestSnapshot(client, aid, date); + if (!snapshot) continue; + const hoursDiff = (currentTimestamp - snapshot.created_at) / HOUR; + const viewsDiff = snapshot.views - latestSnapshot.views; + const speed = viewsDiff / (hoursDiff + DELTA); + const target = closetMilestone(latestSnapshot.views); + const viewsToIncrease = target - latestSnapshot.views; + const eta = viewsToIncrease / (speed + DELTA); + const factor = log(2.97 / log(viewsToIncrease + 1), 1.14); + const adjustedETA = eta / factor; + if (adjustedETA < minETAHours) { + minETAHours = adjustedETA; + } + } + return minETAHours; +}; + export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); try { const videos = await getVideosNearMilestone(client); for (const video of videos) { if (await videoHasActiveSchedule(client, video.aid)) continue; + const eta = await getAdjustedShortTermETA(client, video.aid); + if (eta > 72) continue; + // TODO: dispatch snapshot job } } catch (_e) { // From 00b52c01f79699ed51e4691c4a28c077a4ac1bc2 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Fri, 21 Mar 2025 20:51:34 +0800 Subject: [PATCH 09/79] fix: unexpected column `bvid` when inserting to `songs` table --- lib/mq/task/collectSongs.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/mq/task/collectSongs.ts b/lib/mq/task/collectSongs.ts index 7a7daad..9c49823 100644 --- a/lib/mq/task/collectSongs.ts +++ b/lib/mq/task/collectSongs.ts @@ -15,10 +15,9 @@ export async function collectSongs(client: Client) { export async function insertIntoSongs(client: Client, aid: number) { await client.queryObject( ` - INSERT INTO songs (aid, bvid, published_at, duration) + INSERT INTO songs (aid, published_at, duration) VALUES ( $1, - (SELECT bvid FROM bilibili_metadata WHERE aid = $1), (SELECT published_at FROM bilibili_metadata WHERE aid = $1), (SELECT duration FROM bilibili_metadata WHERE aid = $1) ) From 8158ce10c02b1c21879ad55970ca0b7fd0fbdfc8 Mon Sep 17 00:00:00 
2001 From: alikia2x Date: Fri, 21 Mar 2025 21:06:01 +0800 Subject: [PATCH 10/79] fix: inserting videos into `songs` table regardless of classified label --- lib/mq/exec/classifyVideo.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mq/exec/classifyVideo.ts b/lib/mq/exec/classifyVideo.ts index 3541892..53aadd9 100644 --- a/lib/mq/exec/classifyVideo.ts +++ b/lib/mq/exec/classifyVideo.ts @@ -26,7 +26,7 @@ export const classifyVideoWorker = async (job: Job) => { await insertVideoLabel(client, aid, label); const exists = await aidExistsInSongs(client, aid); - if (!exists) { + if (!exists && label !== 0) { await insertIntoSongs(client, aid); } From fabb77d98d8459d91592cf6ada0f22ce47fb0ffb Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 22 Mar 2025 00:28:47 +0800 Subject: [PATCH 11/79] fix: inefficient SQL query for getting songs close to milestone --- lib/db/snapshot.ts | 53 +++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index 5921059..9f8cee4 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -5,40 +5,31 @@ import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; export async function getSongsNearMilestone(client: Client) { const queryResult = await client.queryObject(` - WITH max_views_per_aid AS ( - -- 找出每个 aid 的最大 views 值,并确保 aid 存在于 songs 表中 - SELECT - vs.aid, - MAX(vs.views) AS max_views - FROM + WITH filtered_snapshots AS ( + SELECT + vs.* + FROM video_snapshot vs - INNER JOIN - songs s - ON - vs.aid = s.aid - GROUP BY - vs.aid + WHERE + (vs.views >= 90000 AND vs.views < 100000) OR + (vs.views >= 900000 AND vs.views < 1000000) ), - filtered_max_views AS ( - -- 筛选出满足条件的最大 views - SELECT - aid, - max_views - FROM - max_views_per_aid - WHERE - (max_views >= 90000 AND max_views < 100000) OR - (max_views >= 900000 AND max_views < 1000000) + ranked_snapshots AS ( + SELECT + fs.*, + ROW_NUMBER() OVER (PARTITION BY fs.aid ORDER BY fs.created_at DESC) as rn, + MAX(fs.views) OVER (PARTITION BY fs.aid) as max_views_per_aid + FROM + filtered_snapshots fs + INNER JOIN + songs s ON fs.aid = s.aid ) - -- 获取符合条件的完整行数据 - SELECT - vs.* - FROM - video_snapshot vs - INNER JOIN - filtered_max_views fmv - ON - vs.aid = fmv.aid AND vs.views = fmv.max_views + SELECT + rs.id, rs.created_at, rs.views, rs.coins, rs.likes, rs.favorites, rs.shares, rs.danmakus, rs.aid, rs.replies + FROM + ranked_snapshots rs + WHERE + rs.rn = 1; `); return queryResult.rows.map((row) => { return { From 1895d601d96480cae1880fe9a7bcb639cc2ca027 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 22 Mar 2025 00:40:00 +0800 Subject: [PATCH 12/79] update: dynamic delay factor for snapshotMilestoneVideo --- lib/mq/exec/snapshotTick.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index bbc7205..7d4d980 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -106,6 +106,8 @@ export const snapshotTickWorker = async (_job: Job) => { } }; +const log = (a: number, b: number = 10) => Math.log(a) / Math.log(b); + export const takeSnapshotForMilestoneVideoWorker = async (job: Job) => { const client = await db.connect(); await setSnapshotScheduled(job.data.aid, true, 20 * 60); @@ -128,6 +130,7 @@ export const takeSnapshotForMilestoneVideoWorker = async (job: Job) => { return; } let eta = await getShortTermEtaPrediction(client, aid); + let factor = 3; if (eta === null) { const DELTA = 
0.001; const intervalSeconds = (Date.now() - parseTimestampFromPsql(lastSnapshoted)) / SECOND; @@ -135,8 +138,9 @@ export const takeSnapshotForMilestoneVideoWorker = async (job: Job) => { const incrementSpeed = viewsIncrement / (intervalSeconds + DELTA); const viewsToIncrease = nextMilestone - stat.views; eta = viewsToIncrease / (incrementSpeed + DELTA); + factor = log(2.97 / log(viewsToIncrease + 1), 1.14); } - const scheduledNextSnapshotDelay = eta * SECOND / 3; + const scheduledNextSnapshotDelay = eta * SECOND / factor; const maxInterval = 60 * MINUTE; const minInterval = 1 * SECOND; const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); From 559c63b43410cf0074c1518e767e0ff35e0531f5 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 22 Mar 2025 00:42:37 +0800 Subject: [PATCH 13/79] update: more beautiful time interval formatting --- lib/utils/formatSeconds.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/utils/formatSeconds.ts b/lib/utils/formatSeconds.ts index ffabb22..694f94c 100644 --- a/lib/utils/formatSeconds.ts +++ b/lib/utils/formatSeconds.ts @@ -3,7 +3,7 @@ export const formatSeconds = (seconds: number) => { return `${(seconds).toFixed(1)}s`; } if (seconds < 3600) { - return `${Math.floor(seconds / 60)}m${seconds % 60}s`; + return `${Math.floor(seconds / 60)}m${(seconds % 60).toFixed(1)}s`; } return `${Math.floor(seconds / 3600)}h ${((seconds % 3600) / 60).toFixed(2)}m`; }; From e5534cda24f38b78d5058de5d320b4091bd56c0d Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 22 Mar 2025 00:58:36 +0800 Subject: [PATCH 14/79] fix: incorrect filter condition that causes empty tags --- lib/mq/task/getVideoDetails.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mq/task/getVideoDetails.ts b/lib/mq/task/getVideoDetails.ts index 51a1876..ea5f903 100644 --- a/lib/mq/task/getVideoDetails.ts +++ b/lib/mq/task/getVideoDetails.ts @@ -19,7 +19,7 @@ export async function insertVideoInfo(client: Client, aid: number) { const desc = data.View.desc; const uid = data.View.owner.mid; const tags = data.Tags - .filter((tag) => tag.tag_type in ["old_channel", "topic"]) + .filter((tag) => !["old_channel", "topic"].indexOf(tag.tag_type)) .map((tag) => tag.tag_name).join(","); const title = data.View.title; const published_at = formatTimestampToPsql(data.View.pubdate * SECOND + 8 * HOUR); From e38dc9627593a4785d457f2b9eee946d7a60628c Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 22 Mar 2025 20:51:28 +0800 Subject: [PATCH 15/79] update: insertion of snapshot schedule --- .zed/settings.json | 58 ++++++++--------- lib/db/allData.ts | 5 +- lib/db/schema.d.ts | 10 +++ lib/db/snapshotSchedule.ts | 121 +++++++++++++++++++++++++++++++++-- lib/ml/benchmark.ts | 1 - lib/ml/manager.ts | 8 +-- lib/ml/mantis.ts | 15 ++--- lib/mq/exec/snapshotTick.ts | 38 +++++++++-- lib/mq/scheduler.ts | 12 +++- lib/mq/task/getVideoStats.ts | 47 +++++++------- lib/net/bilibili.d.ts | 4 +- lib/utils/formatSeconds.ts | 2 +- src/worker.ts | 8 ++- test/ml/akari.json | 40 ++++++------ test/ml/akari.test.ts | 74 ++++++++++----------- 15 files changed, 301 insertions(+), 142 deletions(-) diff --git a/.zed/settings.json b/.zed/settings.json index 97f9ab8..a58d028 100644 --- a/.zed/settings.json +++ b/.zed/settings.json @@ -3,33 +3,33 @@ // For a full list of overridable settings, and general information on folder-specific settings, // see the documentation: https://zed.dev/docs/configuring-zed#settings-files { - "lsp": { - "deno": { - "settings": { - "deno": { - "enable": 
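In the "fix: incorrect filter condition" commit above, the replacement predicate !["old_channel", "topic"].indexOf(tag.tag_type) is truthy only when indexOf returns 0, so it admits "old_channel" tags but still drops "topic" tags (index 1). If the intent is to keep tags of either type, Array.prototype.includes states that membership test directly; a sketch using the same fields as the hunk above:

	const wantedTypes = ["old_channel", "topic"];
	const tags = data.Tags
		.filter((tag) => wantedTypes.includes(tag.tag_type))
		.map((tag) => tag.tag_name)
		.join(",");
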
true - } - } - } - }, - "languages": { - "TypeScript": { - "language_servers": [ - "deno", - "!typescript-language-server", - "!vtsls", - "!eslint" - ], - "formatter": "language_server" - }, - "TSX": { - "language_servers": [ - "deno", - "!typescript-language-server", - "!vtsls", - "!eslint" - ], - "formatter": "language_server" - } - } + "lsp": { + "deno": { + "settings": { + "deno": { + "enable": true + } + } + } + }, + "languages": { + "TypeScript": { + "language_servers": [ + "deno", + "!typescript-language-server", + "!vtsls", + "!eslint" + ], + "formatter": "language_server" + }, + "TSX": { + "language_servers": [ + "deno", + "!typescript-language-server", + "!vtsls", + "!eslint" + ], + "formatter": "language_server" + } + } } diff --git a/lib/db/allData.ts b/lib/db/allData.ts index ddcb804..00fe22e 100644 --- a/lib/db/allData.ts +++ b/lib/db/allData.ts @@ -3,7 +3,10 @@ import { AllDataType, BiliUserType } from "lib/db/schema.d.ts"; import Akari from "lib/ml/akari.ts"; export async function videoExistsInAllData(client: Client, aid: number) { - return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM bilibili_metadata WHERE aid = $1)`, [aid]) + return await client.queryObject<{ exists: boolean }>( + `SELECT EXISTS(SELECT 1 FROM bilibili_metadata WHERE aid = $1)`, + [aid], + ) .then((result) => result.rows[0].exists); } diff --git a/lib/db/schema.d.ts b/lib/db/schema.d.ts index d93f736..983389c 100644 --- a/lib/db/schema.d.ts +++ b/lib/db/schema.d.ts @@ -31,3 +31,13 @@ export interface VideoSnapshotType { aid: bigint; replies: number; } + +export interface SnapshotScheduleType { + id: number; + aid: number; + type?: string; + created_at: string; + started_at?: string; + finished_at?: string; + status: string; +} diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 583d06a..8fd54fc 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -1,4 +1,7 @@ +import { DAY, MINUTE } from "$std/datetime/constants.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; +import { SnapshotScheduleType } from "./schema.d.ts"; /* Returns true if the specified `aid` has at least one record with "pending" or "processing" status. @@ -22,11 +25,12 @@ export async function findClosestSnapshot( targetTime: Date, ): Promise { const query = ` - SELECT created_at, views FROM video_snapshot + SELECT created_at, views + FROM video_snapshot WHERE aid = $1 - ORDER BY ABS(EXTRACT(EPOCH FROM (created_at - $2::timestamptz))) ASC + ORDER BY ABS(EXTRACT(EPOCH FROM (created_at - $2::timestamptz))) LIMIT 1 - `; + `; const result = await client.queryObject<{ created_at: string; views: number }>( query, [aid, targetTime.toISOString()], @@ -39,7 +43,7 @@ export async function findClosestSnapshot( }; } -export async function getLatestSnapshot(client: Client, aid: number): Promise{ +export async function getLatestSnapshot(client: Client, aid: number): Promise { const res = await client.queryObject<{ created_at: string; views: number }>( `SELECT created_at, views FROM video_snapshot WHERE aid = $1 ORDER BY created_at DESC LIMIT 1`, [aid], @@ -49,5 +53,112 @@ export async function getLatestSnapshot(client: Client, aid: number): Promise(query, [startTimeString, endTimeString]); + return res.rows[0].count; +} + +/* + * Creates a new snapshot schedule record. + * @param client The database client. + * @param aid The aid of the video. 
+ * @param targetTime Scheduled time for snapshot. (Timestamp in milliseconds) + */ +export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number) { + const ajustedTime = await adjustSnapshotTime(client, new Date(targetTime)); + return client.queryObject( + `INSERT INTO snapshot_schedule (aid, type, started_at) VALUES ($1, $2, $3)`, + [aid, type, ajustedTime.toISOString()], + ); +} + +/** + * Adjust the trigger time of the snapshot to ensure it does not exceed the frequency limit + * @param client PostgreSQL client + * @param expectedStartTime The expected snapshot time + * @returns The adjusted actual snapshot time + */ +export async function adjustSnapshotTime( + client: Client, + expectedStartTime: Date, +): Promise { + const findWindowQuery = ` + WITH windows AS ( + SELECT generate_series( + $1::timestamp, -- Start time: current time truncated to the nearest 5-minute window + $2::timestamp, -- End time: 24 hours after the target time window starts + INTERVAL '5 MINUTES' + ) AS window_start + ) + SELECT w.window_start + FROM windows w + LEFT JOIN snapshot_schedule s ON s.started_at >= w.window_start + AND s.started_at < w.window_start + INTERVAL '5 MINUTES' + AND s.status = 'pending' + GROUP BY w.window_start + HAVING COUNT(s.*) < 2000 + ORDER BY w.window_start + LIMIT 1; + `; + for (let i = 0; i < 7; i++) { + const now = new Date(new Date().getTime() + 5 * MINUTE); + const nowTruncated = truncateTo5MinInterval(now); + const currentWindowStart = truncateTo5MinInterval(expectedStartTime); + const end = new Date(currentWindowStart.getTime() + 1 * DAY); + + const windowResult = await client.queryObject<{ window_start: Date }>( + findWindowQuery, + [nowTruncated, end], + ); + + const windowStart = windowResult.rows[0]?.window_start; + if (!windowStart) { + continue; + } + + return windowStart; + } + return expectedStartTime; +} + +/** + * Truncate the timestamp to the nearest 5-minute interval + * @param timestamp The timestamp + * @returns The truncated time + */ +function truncateTo5MinInterval(timestamp: Date): Date { + const minutes = timestamp.getMinutes() - (timestamp.getMinutes() % 5); + return new Date( + timestamp.getFullYear(), + timestamp.getMonth(), + timestamp.getDate(), + timestamp.getHours(), + minutes, + 0, + 0, + ); +} + +export async function getSnapshotsInNextSecond(client: Client) { + const res = await client.queryObject( + `SELECT * FROM cvsa.public.snapshot_schedule WHERE started_at <= NOW() + INTERVAL '1 second'`, + [], + ); + return res.rows; } diff --git a/lib/ml/benchmark.ts b/lib/ml/benchmark.ts index 3911c31..3fc76ac 100644 --- a/lib/ml/benchmark.ts +++ b/lib/ml/benchmark.ts @@ -1,7 +1,6 @@ import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; import * as ort from "onnxruntime"; - function softmax(logits: Float32Array): number[] { const maxLogit = Math.max(...logits); const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); diff --git a/lib/ml/manager.ts b/lib/ml/manager.ts index 8f15513..8230fcf 100644 --- a/lib/ml/manager.ts +++ b/lib/ml/manager.ts @@ -21,10 +21,10 @@ export class AIManager { } } - public getModelSession(key: string): ort.InferenceSession { - if (this.sessions[key] === undefined) { - throw new WorkerError(new Error(`Model ${key} not found / not initialized.`), "ml", "fn:getModelSession"); - } + public getModelSession(key: string): ort.InferenceSession { + if (this.sessions[key] === undefined) { + throw new WorkerError(new Error(`Model ${key} not found / not 
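The scheduling helpers above amount to simple admission control: snapshot triggers are bucketed into 5-minute windows, and adjustSnapshotTime walks forward from the requested time until it finds a window that still has spare capacity (2000 pending rows in this revision). A rough usage sketch, assuming the exports above and the MINUTE constant already used in this file; the caller never needs to know which window was finally picked:

	// Ask for a snapshot roughly 30 minutes from now. adjustSnapshotTime, invoked
	// inside scheduleSnapshot, may defer it to a later 5-minute window if the
	// requested one is already full.
	await scheduleSnapshot(client, aid, "milestone", Date.now() + 30 * MINUTE);
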
initialized.`), "ml", "fn:getModelSession"); + } return this.sessions[key]; } diff --git a/lib/ml/mantis.ts b/lib/ml/mantis.ts index 59bc09a..6960be9 100644 --- a/lib/ml/mantis.ts +++ b/lib/ml/mantis.ts @@ -6,19 +6,16 @@ import { WorkerError } from "lib/mq/schema.ts"; const modelPath = "./model/model.onnx"; class MantisProto extends AIManager { - constructor() { super(); - this.models = { - "predictor": modelPath, - } + this.models = { + "predictor": modelPath, + }; } - public override async init(): Promise { - await super.init(); - } - - + public override async init(): Promise { + await super.init(); + } } const Mantis = new MantisProto(); diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index e72d11f..86de99b 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -1,21 +1,37 @@ import { Job } from "bullmq"; import { db } from "lib/db/init.ts"; import { getVideosNearMilestone } from "lib/db/snapshot.ts"; -import { findClosestSnapshot, getLatestSnapshot, videoHasActiveSchedule } from "lib/db/snapshotSchedule.ts"; +import { + findClosestSnapshot, + getLatestSnapshot, + getSnapshotsInNextSecond, + scheduleSnapshot, + videoHasActiveSchedule, +} from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { HOUR, MINUTE } from "$std/datetime/constants.ts"; +import logger from "lib/log/logger.ts"; +import { SnapshotQueue } from "lib/mq/index.ts"; + +const priorityMap: { [key: string]: number } = { + "milestone": 1, +}; export const snapshotTickWorker = async (_job: Job) => { const client = await db.connect(); try { - // TODO: implement + const schedules = await getSnapshotsInNextSecond(client); + for (const schedule of schedules) { + let priority = 3; + if (schedule.type && priorityMap[schedule.type]) + priority = priorityMap[schedule.type]; + await SnapshotQueue.add("snapshotVideo", { aid: schedule.aid, priority }); + } } finally { client.release(); } }; -const log = (a: number, b: number = 10) => Math.log(a) / Math.log(b); - export const closetMilestone = (views: number) => { if (views < 100000) return 100000; if (views < 1000000) return 1000000; @@ -24,6 +40,12 @@ export const closetMilestone = (views: number) => { const log = (value: number, base: number = 10) => Math.log(value) / Math.log(base); +/* + * Returns the minimum ETA in hours for the next snapshot + * @param client - Postgres client + * @param aid - aid of the video + * @returns ETA in hours + */ const getAdjustedShortTermETA = async (client: Client, aid: number) => { const latestSnapshot = await getLatestSnapshot(client, aid); // Immediately dispatch a snapshot if there is no snapshot yet @@ -61,10 +83,12 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { if (await videoHasActiveSchedule(client, video.aid)) continue; const eta = await getAdjustedShortTermETA(client, video.aid); if (eta > 72) continue; - // TODO: dispatch snapshot job + const now = Date.now(); + const targetTime = now + eta * HOUR; + await scheduleSnapshot(client, video.aid, "milestone", targetTime); } - } catch (_e) { - // + } catch (e) { + logger.error(e as Error, "mq", "fn:collectMilestoneSnapshotsWorker"); } finally { client.release(); } diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index 00c3a4e..94c9361 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -288,7 +288,11 @@ class NetScheduler { const fileId = randomUUID(); await Deno.writeFile(`./logs/files/${fileId}.stdout`, output.stdout); await 
Deno.writeFile(`./logs/files/${fileId}.stderr`, output.stderr); - logger.log(`Returned non-200 status code. Raw ouput saved to ./logs/files/${fileId}.stdout/stderr`, "net", "fn:alicloudFcRequest") + logger.log( + `Returned non-200 status code. Raw ouput saved to ./logs/files/${fileId}.stdout/stderr`, + "net", + "fn:alicloudFcRequest", + ); throw new NetSchedulerError( `Error proxying ${url} to ali-fc region ${region}, code: ${rawData.statusCode}.`, "ALICLOUD_PROXY_ERR", @@ -301,7 +305,11 @@ class NetScheduler { const fileId = randomUUID(); rawOutput && await Deno.writeFile(`./logs/files/${fileId}.stdout`, rawOutput); rawErr && await Deno.writeFile(`./logs/files/${fileId}.stderr`, rawErr); - logger.log(`Error occurred. Raw ouput saved to ./logs/files/${fileId}.stdout/stderr`, "net", "fn:alicloudFcRequest") + logger.log( + `Error occurred. Raw ouput saved to ./logs/files/${fileId}.stdout/stderr`, + "net", + "fn:alicloudFcRequest", + ); } logger.error(e as Error, "net", "fn:alicloudFcRequest"); throw new NetSchedulerError(`Unhandled error: Cannot proxy ${url} to ali-fc.`, "ALICLOUD_PROXY_ERR", e); diff --git a/lib/mq/task/getVideoStats.ts b/lib/mq/task/getVideoStats.ts index 274d1bb..6f85035 100644 --- a/lib/mq/task/getVideoStats.ts +++ b/lib/mq/task/getVideoStats.ts @@ -3,30 +3,33 @@ import { getVideoInfo } from "lib/net/getVideoInfo.ts"; export async function insertVideoStats(client: Client, aid: number, task: string) { const data = await getVideoInfo(aid, task); - const time = new Date().getTime(); - if (typeof data == 'number') { + const time = new Date().getTime(); + if (typeof data == "number") { return data; } - const views = data.stat.view; - const danmakus = data.stat.danmaku; - const replies = data.stat.reply; - const likes = data.stat.like; - const coins = data.stat.coin; - const shares = data.stat.share; - const favorites = data.stat.favorite; - await client.queryObject(` + const views = data.stat.view; + const danmakus = data.stat.danmaku; + const replies = data.stat.reply; + const likes = data.stat.like; + const coins = data.stat.coin; + const shares = data.stat.share; + const favorites = data.stat.favorite; + await client.queryObject( + ` INSERT INTO video_snapshot (aid, views, danmakus, replies, likes, coins, shares, favorites) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - `, [aid, views, danmakus, replies, likes, coins, shares, favorites]); - return { - aid, - views, - danmakus, - replies, - likes, - coins, - shares, - favorites, - time - } + `, + [aid, views, danmakus, replies, likes, coins, shares, favorites], + ); + return { + aid, + views, + danmakus, + replies, + likes, + coins, + shares, + favorites, + time, + }; } diff --git a/lib/net/bilibili.d.ts b/lib/net/bilibili.d.ts index 209b566..6a66ecc 100644 --- a/lib/net/bilibili.d.ts +++ b/lib/net/bilibili.d.ts @@ -26,8 +26,8 @@ interface VideoInfoData { mid: number; name: string; face: string; - }, - stat: VideoStats, + }; + stat: VideoStats; } interface VideoDetailsData { diff --git a/lib/utils/formatSeconds.ts b/lib/utils/formatSeconds.ts index 694f94c..491dfd6 100644 --- a/lib/utils/formatSeconds.ts +++ b/lib/utils/formatSeconds.ts @@ -1,6 +1,6 @@ export const formatSeconds = (seconds: number) => { if (seconds < 60) { - return `${(seconds).toFixed(1)}s`; + return `${seconds.toFixed(1)}s`; } if (seconds < 3600) { return `${Math.floor(seconds / 60)}m${(seconds % 60).toFixed(1)}s`; diff --git a/src/worker.ts b/src/worker.ts index 9523a42..da14706 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -5,7 +5,11 @@ import logger 
from "lib/log/logger.ts"; import { lockManager } from "lib/mq/lockManager.ts"; import { WorkerError } from "lib/mq/schema.ts"; import { getVideoInfoWorker } from "lib/mq/exec/getLatestVideos.ts"; -import { snapshotTickWorker, collectMilestoneSnapshotsWorker, takeSnapshotForVideoWorker } from "lib/mq/exec/snapshotTick.ts"; +import { + collectMilestoneSnapshotsWorker, + snapshotTickWorker, + takeSnapshotForVideoWorker, +} from "lib/mq/exec/snapshotTick.ts"; Deno.addSignalListener("SIGINT", async () => { logger.log("SIGINT Received: Shutting down workers...", "mq"); @@ -75,4 +79,4 @@ const snapshotWorker = new Worker( snapshotWorker.on("error", (err) => { const e = err as WorkerError; logger.error(e.rawError, e.service, e.codePath); -}) +}); diff --git a/test/ml/akari.json b/test/ml/akari.json index 7345078..9de1219 100644 --- a/test/ml/akari.json +++ b/test/ml/akari.json @@ -1,22 +1,22 @@ { - "test1": [ - { - "title": "【洛天依】《一花依世界》(2024重调版)|“抬头仰望,夜空多安详”【原创PV付】", - "desc": "本家:BV1Vs411H7JH\n作曲:LS\n作词:杏花包子\n调教:鬼面P\n混音:虎皮猫P\n演唱:洛天依\n曲绘:山下鸭鸭窝\n映像:阿妍\n——————————————————————\n本稿为同人二创,非本家重制", - "tags": "发现《一花依世界》, Vsinger创作激励计划, 洛天依, VOCALOID CHINA, 翻唱, 原创PV付, ACE虚拟歌姬, 中文VOCALOID, 国风电子, 一花依世界, ACE Studio, Vsinger创作激励计划2024冬季物语", - "label": 2 - }, - { - "title": "【鏡音レン】アカシア【VOCALOID Cover】", - "desc": "鏡音リン・レン 13th Anniversary\n\nMusic:BUMP OF CHICKEN https://youtu.be/BoZ0Zwab6Oc\nust:Maplestyle sm37853236\nOff Vocal: https://youtu.be/YMzrUzq1uX0\nSinger:鏡音レン\n\n氷雨ハルカ\nYoutube :https://t.co/8zuv6g7Acm\nniconico:https://t.co/C6DRfdYAp0\ntwitter :https://twitter.com/hisame_haruka\n\n転載禁止\nPlease do not reprint without my permission.", - "tags": "鏡音レン", - "label": 0 - }, - { - "title": "【洛天依原创曲】谪星【姆斯塔之谕】", - "desc": "谪星\n\n策划/世界观:听雨\n作词:听雨\n作曲/编曲:太白\n混音:虎皮猫\n人设:以木\n曲绘:Ar极光\n调校:哈士奇p\n视频:苏卿白", - "tags": "2025虚拟歌手贺岁纪, 洛天依, 原创歌曲, VOCALOID, 虚拟歌手, 原创音乐, 姆斯塔, 中文VOCALOID", - "label": 1 - } - ] + "test1": [ + { + "title": "【洛天依】《一花依世界》(2024重调版)|“抬头仰望,夜空多安详”【原创PV付】", + "desc": "本家:BV1Vs411H7JH\n作曲:LS\n作词:杏花包子\n调教:鬼面P\n混音:虎皮猫P\n演唱:洛天依\n曲绘:山下鸭鸭窝\n映像:阿妍\n——————————————————————\n本稿为同人二创,非本家重制", + "tags": "发现《一花依世界》, Vsinger创作激励计划, 洛天依, VOCALOID CHINA, 翻唱, 原创PV付, ACE虚拟歌姬, 中文VOCALOID, 国风电子, 一花依世界, ACE Studio, Vsinger创作激励计划2024冬季物语", + "label": 2 + }, + { + "title": "【鏡音レン】アカシア【VOCALOID Cover】", + "desc": "鏡音リン・レン 13th Anniversary\n\nMusic:BUMP OF CHICKEN https://youtu.be/BoZ0Zwab6Oc\nust:Maplestyle sm37853236\nOff Vocal: https://youtu.be/YMzrUzq1uX0\nSinger:鏡音レン\n\n氷雨ハルカ\nYoutube :https://t.co/8zuv6g7Acm\nniconico:https://t.co/C6DRfdYAp0\ntwitter :https://twitter.com/hisame_haruka\n\n転載禁止\nPlease do not reprint without my permission.", + "tags": "鏡音レン", + "label": 0 + }, + { + "title": "【洛天依原创曲】谪星【姆斯塔之谕】", + "desc": "谪星\n\n策划/世界观:听雨\n作词:听雨\n作曲/编曲:太白\n混音:虎皮猫\n人设:以木\n曲绘:Ar极光\n调校:哈士奇p\n视频:苏卿白", + "tags": "2025虚拟歌手贺岁纪, 洛天依, 原创歌曲, VOCALOID, 虚拟歌手, 原创音乐, 姆斯塔, 中文VOCALOID", + "label": 1 + } + ] } diff --git a/test/ml/akari.test.ts b/test/ml/akari.test.ts index 958f34d..f254a01 100644 --- a/test/ml/akari.test.ts +++ b/test/ml/akari.test.ts @@ -4,43 +4,43 @@ import { join } from "$std/path/join.ts"; import { SECOND } from "$std/datetime/constants.ts"; Deno.test("Akari AI - normal cases accuracy test", async () => { - const path = import.meta.dirname!; - const dataPath = join(path, "akari.json"); - const rawData = await Deno.readTextFile(dataPath); - const data = JSON.parse(rawData); - await Akari.init(); - for (const testCase of data.test1) { - const result = await Akari.classifyVideo( - testCase.title, - testCase.desc, - 
testCase.tags - ); - assertEquals(result, testCase.label); - } + const path = import.meta.dirname!; + const dataPath = join(path, "akari.json"); + const rawData = await Deno.readTextFile(dataPath); + const data = JSON.parse(rawData); + await Akari.init(); + for (const testCase of data.test1) { + const result = await Akari.classifyVideo( + testCase.title, + testCase.desc, + testCase.tags, + ); + assertEquals(result, testCase.label); + } }); Deno.test("Akari AI - performance test", async () => { - const path = import.meta.dirname!; - const dataPath = join(path, "akari.json"); - const rawData = await Deno.readTextFile(dataPath); - const data = JSON.parse(rawData); - await Akari.init(); - const N = 200; - const testCase = data.test1[0]; - const title = testCase.title; - const desc = testCase.desc; - const tags = testCase.tags; - const time = performance.now(); - for (let i = 0; i < N; i++){ - await Akari.classifyVideo( - title, - desc, - tags - ); - } - const end = performance.now(); - const elapsed = (end - time) / SECOND; - const throughput = N / elapsed; - assertGreaterOrEqual(throughput, 100); - console.log(`Akari AI throughput: ${throughput.toFixed(1)} samples / sec`) -}); \ No newline at end of file + const path = import.meta.dirname!; + const dataPath = join(path, "akari.json"); + const rawData = await Deno.readTextFile(dataPath); + const data = JSON.parse(rawData); + await Akari.init(); + const N = 200; + const testCase = data.test1[0]; + const title = testCase.title; + const desc = testCase.desc; + const tags = testCase.tags; + const time = performance.now(); + for (let i = 0; i < N; i++) { + await Akari.classifyVideo( + title, + desc, + tags, + ); + } + const end = performance.now(); + const elapsed = (end - time) / SECOND; + const throughput = N / elapsed; + assertGreaterOrEqual(throughput, 100); + console.log(`Akari AI throughput: ${throughput.toFixed(1)} samples / sec`); +}); From b201bfd64db2ee696ca24754bf51a30f47d80235 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 22 Mar 2025 23:49:48 +0800 Subject: [PATCH 16/79] fix: add type assertions to suppress errors --- src/filterWorker.ts | 4 ++-- src/worker.ts | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/filterWorker.ts b/src/filterWorker.ts index cb42048..b14ef07 100644 --- a/src/filterWorker.ts +++ b/src/filterWorker.ts @@ -1,4 +1,4 @@ -import { Job, Worker } from "bullmq"; +import { ConnectionOptions, Job, Worker } from "bullmq"; import { redis } from "lib/db/redis.ts"; import logger from "lib/log/logger.ts"; import { classifyVideosWorker, classifyVideoWorker } from "lib/mq/exec/classifyVideo.ts"; @@ -32,7 +32,7 @@ const filterWorker = new Worker( break; } }, - { connection: redis, concurrency: 2, removeOnComplete: { count: 1000 } }, + { connection: redis as ConnectionOptions, concurrency: 2, removeOnComplete: { count: 1000 } }, ); filterWorker.on("active", () => { diff --git a/src/worker.ts b/src/worker.ts index da14706..9998569 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -1,4 +1,4 @@ -import { Job, Worker } from "bullmq"; +import { ConnectionOptions, Job, Worker } from "bullmq"; import { collectSongsWorker, getLatestVideosWorker } from "lib/mq/executors.ts"; import { redis } from "lib/db/redis.ts"; import logger from "lib/log/logger.ts"; @@ -40,7 +40,7 @@ const latestVideoWorker = new Worker( break; } }, - { connection: redis, concurrency: 6, removeOnComplete: { count: 1440 }, removeOnFail: { count: 0 } }, + { connection: redis as ConnectionOptions, concurrency: 6, removeOnComplete: { 
count: 1440 }, removeOnFail: { count: 0 } }, ); latestVideoWorker.on("active", () => { @@ -73,7 +73,7 @@ const snapshotWorker = new Worker( break; } }, - { connection: redis, concurrency: 10, removeOnComplete: { count: 2000 } }, + { connection: redis as ConnectionOptions, concurrency: 10, removeOnComplete: { count: 2000 } }, ); snapshotWorker.on("error", (err) => { From 2c12310e8c077cb4a4658450ef11f160b136fb73 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 17:44:59 +0800 Subject: [PATCH 17/79] feat: snapshot based on persistent schedule --- lib/db/allData.ts | 7 ++ lib/db/schema.d.ts | 12 +++ lib/db/snapshot.ts | 198 ++--------------------------------- lib/db/snapshotSchedule.ts | 27 ++++- lib/mq/exec/snapshotTick.ts | 65 ++++++++++-- lib/mq/scheduler.ts | 46 +++----- lib/mq/task/getVideoStats.ts | 32 ++++-- lib/net/getVideoInfo.ts | 13 +++ src/worker.ts | 7 +- 9 files changed, 170 insertions(+), 237 deletions(-) diff --git a/lib/db/allData.ts b/lib/db/allData.ts index 00fe22e..6e9c509 100644 --- a/lib/db/allData.ts +++ b/lib/db/allData.ts @@ -68,3 +68,10 @@ export async function getUnArchivedBiliUsers(client: Client) { const rows = queryResult.rows; return rows.map((row) => row.uid); } + +export async function setBiliVideoStatus(client: Client, aid: number, status: number) { + return await client.queryObject( + `UPDATE bilibili_metadata SET status = $1 WHERE aid = $2`, + [status, aid], + ); +} diff --git a/lib/db/schema.d.ts b/lib/db/schema.d.ts index 983389c..d030308 100644 --- a/lib/db/schema.d.ts +++ b/lib/db/schema.d.ts @@ -32,6 +32,18 @@ export interface VideoSnapshotType { replies: number; } +export interface LatestSnapshotType { + aid: number; + time: number; + views: number; + danmakus: number; + replies: number; + likes: number; + coins: number; + shares: number; + favorites: number; +} + export interface SnapshotScheduleType { id: number; aid: number; diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index d838899..be7ec26 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -1,35 +1,17 @@ -import { DAY, SECOND } from "$std/datetime/constants.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { VideoSnapshotType } from "lib/db/schema.d.ts"; -import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; +import { LatestSnapshotType } from "lib/db/schema.d.ts"; export async function getVideosNearMilestone(client: Client) { - const queryResult = await client.queryObject(` - WITH filtered_snapshots AS ( - SELECT - vs.* - FROM - video_snapshot vs - WHERE - (vs.views >= 90000 AND vs.views < 100000) OR - (vs.views >= 900000 AND vs.views < 1000000) - ), - ranked_snapshots AS ( - SELECT - fs.*, - ROW_NUMBER() OVER (PARTITION BY fs.aid ORDER BY fs.created_at DESC) as rn, - MAX(fs.views) OVER (PARTITION BY fs.aid) as max_views_per_aid - FROM - filtered_snapshots fs - INNER JOIN - songs s ON fs.aid = s.aid - ) - SELECT - rs.id, rs.created_at, rs.views, rs.coins, rs.likes, rs.favorites, rs.shares, rs.danmakus, rs.aid, rs.replies - FROM - ranked_snapshots rs - WHERE - rs.rn = 1; + const queryResult = await client.queryObject(` + SELECT ls.* + FROM latest_video_snapshot ls + INNER JOIN + songs s ON ls.aid = s.aid + WHERE + s.deleted = false AND + (views >= 90000 AND views < 100000) OR + (views >= 900000 AND views < 1000000) OR + (views >= 9900000 AND views < 10000000) `); return queryResult.rows.map((row) => { return { @@ -38,161 +20,3 @@ export async function getVideosNearMilestone(client: Client) { }; }); } - -export 
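One detail worth flagging in the rewritten getVideosNearMilestone query above: SQL gives AND higher precedence than OR, so as written the s.deleted = false filter binds only to the first views range. If deleted songs are meant to be excluded from every milestone range, the ranges presumably need explicit grouping:

	const query = `
		SELECT ls.*
		FROM latest_video_snapshot ls
		INNER JOIN songs s ON ls.aid = s.aid
		WHERE s.deleted = false AND (
			(views >= 90000 AND views < 100000) OR
			(views >= 900000 AND views < 1000000) OR
			(views >= 9900000 AND views < 10000000)
		)
	`;
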
async function getUnsnapshotedSongs(client: Client) { - const queryResult = await client.queryObject<{ aid: bigint }>(` - SELECT DISTINCT s.aid - FROM songs s - LEFT JOIN video_snapshot v ON s.aid = v.aid - WHERE v.aid IS NULL; - `); - return queryResult.rows.map((row) => Number(row.aid)); -} - -export async function getSongSnapshotCount(client: Client, aid: number) { - const queryResult = await client.queryObject<{ count: number }>( - ` - SELECT COUNT(*) AS count - FROM video_snapshot - WHERE aid = $1; - `, - [aid], - ); - return queryResult.rows[0].count; -} - -export async function getShortTermEtaPrediction(client: Client, aid: number) { - const queryResult = await client.queryObject<{ eta: number }>( - ` - WITH old_snapshot AS ( - SELECT created_at, views - FROM video_snapshot - WHERE aid = $1 AND - NOW() - created_at > '20 min' - ORDER BY created_at DESC - LIMIT 1 - ), - new_snapshot AS ( - SELECT created_at, views - FROM video_snapshot - WHERE aid = $1 - ORDER BY created_at DESC - LIMIT 1 - ) - SELECT - CASE - WHEN n.views > 100000 - THEN - (1000000 - n.views) -- Views remaining - / - ( - (n.views - o.views) -- Views delta - / - (EXTRACT(EPOCH FROM (n.created_at - o.created_at)) + 0.001) -- Time delta in seconds - + 0.001 - ) -- Increment per second - ELSE - (100000 - n.views) -- Views remaining - / - ( - (n.views - o.views) -- Views delta - / - (EXTRACT(EPOCH FROM (n.created_at - o.created_at)) + 0.001) -- Time delta in seconds - + 0.001 - ) -- Increment per second - END AS eta - FROM old_snapshot o, new_snapshot n; - `, - [aid], - ); - if (queryResult.rows.length === 0) { - return null; - } - return queryResult.rows[0].eta; -} - -export async function getIntervalFromLastSnapshotToNow(client: Client, aid: number) { - const queryResult = await client.queryObject<{ interval: number }>( - ` - SELECT EXTRACT(EPOCH FROM (NOW() - created_at)) AS interval - FROM video_snapshot - WHERE aid = $1 - ORDER BY created_at DESC - LIMIT 1; - `, - [aid], - ); - if (queryResult.rows.length === 0) { - return null; - } - return queryResult.rows[0].interval; -} - -export async function songEligibleForMilestoneSnapshot(client: Client, aid: number) { - const count = await getSongSnapshotCount(client, aid); - if (count < 2) { - return true; - } - const queryResult = await client.queryObject< - { views1: number; created_at1: string; views2: number; created_at2: string } - >( - ` - WITH latest_snapshot AS ( - SELECT - aid, - views, - created_at - FROM video_snapshot - WHERE aid = $1 - ORDER BY created_at DESC - LIMIT 1 - ), - pairs AS ( - SELECT - a.views AS views1, - a.created_at AS created_at1, - b.views AS views2, - b.created_at AS created_at2, - (b.created_at - a.created_at) AS interval - FROM video_snapshot a - JOIN latest_snapshot b - ON a.aid = b.aid - AND a.created_at < b.created_at - ) - SELECT - views1, - created_at1, - views2, - created_at2 - FROM ( - SELECT - *, - ROW_NUMBER() OVER ( - ORDER BY - CASE WHEN interval <= INTERVAL '3 days' THEN 0 ELSE 1 END, - CASE WHEN interval <= INTERVAL '3 days' THEN -interval ELSE interval END - ) AS rn - FROM pairs - ) ranked - WHERE rn = 1; - `, - [aid], - ); - if (queryResult.rows.length === 0) { - return true; - } - const recentViewsData = queryResult.rows[0]; - const time1 = parseTimestampFromPsql(recentViewsData.created_at1); - const time2 = parseTimestampFromPsql(recentViewsData.created_at2); - const intervalSec = (time2 - time1) / SECOND; - const views1 = recentViewsData.views1; - const views2 = recentViewsData.views2; - const viewsDiff = views2 - 
views1; - if (viewsDiff == 0) { - return false; - } - const nextMilestone = views2 >= 100000 ? 1000000 : 100000; - const expectedViewsDiff = nextMilestone - views2; - const expectedIntervalSec = expectedViewsDiff / viewsDiff * intervalSec; - return expectedIntervalSec <= 3 * DAY; -} diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 8fd54fc..bd4e805 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -14,6 +14,14 @@ export async function videoHasActiveSchedule(client: Client, aid: number) { return res.rows.length > 0; } +export async function videoHasProcessingSchedule(client: Client, aid: number) { + const res = await client.queryObject<{ status: string }>( + `SELECT status FROM snapshot_schedule WHERE aid = $1 AND status = 'processing'`, + [aid], + ); + return res.rows.length > 0; +} + interface Snapshot { created_at: number; views: number; @@ -156,9 +164,20 @@ function truncateTo5MinInterval(timestamp: Date): Date { } export async function getSnapshotsInNextSecond(client: Client) { - const res = await client.queryObject( - `SELECT * FROM cvsa.public.snapshot_schedule WHERE started_at <= NOW() + INTERVAL '1 second'`, - [], - ); + const query = ` + SELECT * + FROM snapshot_schedule + WHERE started_at + BETWEEN NOW() - INTERVAL '5 seconds' + AND NOW() + INTERVAL '1 seconds' + `; + const res = await client.queryObject(query, []); return res.rows; } + +export async function setSnapshotStatus(client: Client, id: number, status: string) { + return client.queryObject( + `UPDATE snapshot_schedule SET status = $2 WHERE id = $1`, + [id, status], + ); +} diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 86de99b..453ac65 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -6,27 +6,44 @@ import { getLatestSnapshot, getSnapshotsInNextSecond, scheduleSnapshot, + setSnapshotStatus, videoHasActiveSchedule, + videoHasProcessingSchedule, } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { HOUR, MINUTE } from "$std/datetime/constants.ts"; +import { HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; import logger from "lib/log/logger.ts"; import { SnapshotQueue } from "lib/mq/index.ts"; +import { insertVideoSnapshot } from "../task/getVideoStats.ts"; +import { NetSchedulerError } from "../scheduler.ts"; +import { setBiliVideoStatus } from "../../db/allData.ts"; const priorityMap: { [key: string]: number } = { "milestone": 1, }; +const snapshotTypeToTaskMap: { [key: string]: string } = { + "milestone": "snapshotMilestoneVideo", + "normal": "snapshotVideo", +}; + export const snapshotTickWorker = async (_job: Job) => { const client = await db.connect(); try { const schedules = await getSnapshotsInNextSecond(client); for (const schedule of schedules) { let priority = 3; - if (schedule.type && priorityMap[schedule.type]) + if (schedule.type && priorityMap[schedule.type]) { priority = priorityMap[schedule.type]; - await SnapshotQueue.add("snapshotVideo", { aid: schedule.aid, priority }); + } + await SnapshotQueue.add("snapshotVideo", { + aid: schedule.aid, + id: schedule.id, + type: schedule.type ?? 
"normal", + }, { priority }); } + } catch (e) { + logger.error(e as Error); } finally { client.release(); } @@ -52,7 +69,7 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { if (!latestSnapshot) return 0; const currentTimestamp = Date.now(); - const timeIntervals = [20 * MINUTE, 1 * HOUR, 3 * HOUR, 6 * HOUR]; + const timeIntervals = [3 * MINUTE, 20 * MINUTE, 1 * HOUR, 3 * HOUR, 6 * HOUR]; const DELTA = 0.00001; let minETAHours = Infinity; @@ -94,6 +111,42 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { } }; -export const takeSnapshotForVideoWorker = async (_job: Job) => { - // TODO: implement +export const takeSnapshotForVideoWorker = async (job: Job) => { + const id = job.data.id; + const aid = job.data.aid; + const task = snapshotTypeToTaskMap[job.data.type] ?? "snapshotVideo"; + const client = await db.connect(); + try { + if (await videoHasProcessingSchedule(client, aid)) { + return `ALREADY_PROCESSING`; + } + await setSnapshotStatus(client, id, "processing"); + const stat = await insertVideoSnapshot(client, aid, task); + if (typeof stat === "number") { + await setBiliVideoStatus(client, aid, stat); + await setSnapshotStatus(client, id, "completed"); + return `BILI_STATUS_${stat}`; + } + const eta = await getAdjustedShortTermETA(client, aid); + if (eta > 72) return "ETA_TOO_LONG"; + const now = Date.now(); + const targetTime = now + eta * HOUR; + await setSnapshotStatus(client, id, "completed"); + await scheduleSnapshot(client, aid, "milestone", targetTime); + } catch (e) { + if (e instanceof NetSchedulerError && e.code === "NO_PROXY_AVAILABLE") { + logger.warn( + `No available proxy for aid ${job.data.aid}.`, + "mq", + "fn:takeSnapshotForVideoWorker", + ); + await setSnapshotStatus(client, id, "completed"); + await scheduleSnapshot(client, aid, "milestone", Date.now() + 5 * SECOND); + return; + } + logger.error(e as Error, "mq", "fn:takeSnapshotForVideoWorker"); + await setSnapshotStatus(client, id, "failed"); + } finally { + client.release(); + } }; diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index 94c9361..a711a11 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -21,7 +21,7 @@ interface ProxiesMap { } type NetSchedulerErrorCode = - | "NO_AVAILABLE_PROXY" + | "NO_PROXY_AVAILABLE" | "PROXY_RATE_LIMITED" | "PROXY_NOT_FOUND" | "FETCH_ERROR" @@ -143,10 +143,10 @@ class NetScheduler { * @param {string} method - The HTTP method to use for the request. Default is "GET". * @returns {Promise} - A promise that resolves to the response body. 
* @throws {NetSchedulerError} - The error will be thrown in following cases: - * - No available proxy currently: with error code NO_AVAILABLE_PROXY - * - Proxy is under rate limit: with error code PROXY_RATE_LIMITED - * - The native `fetch` function threw an error: with error code FETCH_ERROR - * - The proxy type is not supported: with error code NOT_IMPLEMENTED + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + * - The proxy type is not supported: with error code `NOT_IMPLEMENTED` */ async request(url: string, task: string, method: string = "GET"): Promise { // find a available proxy @@ -156,7 +156,7 @@ class NetScheduler { return await this.proxyRequest(url, proxyName, task, method); } } - throw new NetSchedulerError("No available proxy currently.", "NO_AVAILABLE_PROXY"); + throw new NetSchedulerError("No proxy is available currently.", "NO_PROXY_AVAILABLE"); } /* @@ -168,10 +168,11 @@ class NetScheduler { * @param {boolean} force - If true, the request will be made even if the proxy is rate limited. Default is false. * @returns {Promise} - A promise that resolves to the response body. * @throws {NetSchedulerError} - The error will be thrown in following cases: - * - Proxy not found: with error code PROXY_NOT_FOUND - * - Proxy is under rate limit: with error code PROXY_RATE_LIMITED - * - The native `fetch` function threw an error: with error code FETCH_ERROR - * - The proxy type is not supported: with error code NOT_IMPLEMENTED + * - Proxy not found: with error code `PROXY_NOT_FOUND` + * - Proxy is under rate limit: with error code `PROXY_RATE_LIMITED` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + * - The proxy type is not supported: with error code `NOT_IMPLEMENTED` */ async proxyRequest( url: string, @@ -255,8 +256,6 @@ class NetScheduler { } private async alicloudFcRequest(url: string, region: string): Promise { - let rawOutput: null | Uint8Array = null; - let rawErr: null | Uint8Array = null; try { const decoder = new TextDecoder(); const output = await new Deno.Command("aliyun", { @@ -280,19 +279,9 @@ class NetScheduler { `CVSA-${region}`, ], }).output(); - rawOutput = output.stdout; - rawErr = output.stderr; const out = decoder.decode(output.stdout); const rawData = JSON.parse(out); if (rawData.statusCode !== 200) { - const fileId = randomUUID(); - await Deno.writeFile(`./logs/files/${fileId}.stdout`, output.stdout); - await Deno.writeFile(`./logs/files/${fileId}.stderr`, output.stderr); - logger.log( - `Returned non-200 status code. Raw ouput saved to ./logs/files/${fileId}.stdout/stderr`, - "net", - "fn:alicloudFcRequest", - ); throw new NetSchedulerError( `Error proxying ${url} to ali-fc region ${region}, code: ${rawData.statusCode}.`, "ALICLOUD_PROXY_ERR", @@ -301,16 +290,6 @@ class NetScheduler { return JSON.parse(JSON.parse(rawData.body)) as R; } } catch (e) { - if (rawOutput !== null || rawErr !== null) { - const fileId = randomUUID(); - rawOutput && await Deno.writeFile(`./logs/files/${fileId}.stdout`, rawOutput); - rawErr && await Deno.writeFile(`./logs/files/${fileId}.stderr`, rawErr); - logger.log( - `Error occurred. 
Raw ouput saved to ./logs/files/${fileId}.stdout/stderr`, - "net", - "fn:alicloudFcRequest", - ); - } logger.error(e as Error, "net", "fn:alicloudFcRequest"); throw new NetSchedulerError(`Unhandled error: Cannot proxy ${url} to ali-fc.`, "ALICLOUD_PROXY_ERR", e); } @@ -369,7 +348,8 @@ Execution order for setup: - Call after addProxy and addTask. Configures rate limiters specifically for tasks and their associated proxies. - Depends on tasks and proxies being defined to apply limiters correctly. 4. setProviderLimiter(providerName, config): - - Call after addProxy and addTask. Sets rate limiters at the provider level, affecting all proxies used by tasks of that provider. + - Call after addProxy and addTask. + - It sets rate limiters at the provider level, affecting all proxies used by tasks of that provider. - Depends on tasks and proxies being defined to identify which proxies to apply provider-level limiters to. In summary: addProxy -> addTask -> (setTaskLimiter and/or setProviderLimiter). diff --git a/lib/mq/task/getVideoStats.ts b/lib/mq/task/getVideoStats.ts index 6f85035..8e3530a 100644 --- a/lib/mq/task/getVideoStats.ts +++ b/lib/mq/task/getVideoStats.ts @@ -1,12 +1,27 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { getVideoInfo } from "lib/net/getVideoInfo.ts"; +import { LatestSnapshotType } from "lib/db/schema.d.ts"; -export async function insertVideoStats(client: Client, aid: number, task: string) { +/* + * Fetch video stats from bilibili API and insert into database + * @returns {Promise} + * A number indicating the status code when receiving non-0 status code from bilibili, + * otherwise an VideoSnapshot object containing the video stats + * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + */ +export async function insertVideoSnapshot( + client: Client, + aid: number, + task: string, +): Promise { const data = await getVideoInfo(aid, task); - const time = new Date().getTime(); if (typeof data == "number") { return data; } + const time = new Date().getTime(); const views = data.stat.view; const danmakus = data.stat.danmaku; const replies = data.stat.reply; @@ -14,14 +29,17 @@ export async function insertVideoStats(client: Client, aid: number, task: string const coins = data.stat.coin; const shares = data.stat.share; const favorites = data.stat.favorite; - await client.queryObject( - ` + + const query: string = ` INSERT INTO video_snapshot (aid, views, danmakus, replies, likes, coins, shares, favorites) VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - `, + `; + await client.queryObject( + query, [aid, views, danmakus, replies, likes, coins, shares, favorites], ); - return { + + const snapshot: LatestSnapshotType = { aid, views, danmakus, @@ -32,4 +50,6 @@ export async function insertVideoStats(client: Client, aid: number, task: string favorites, time, }; + + return snapshot; } diff --git a/lib/net/getVideoInfo.ts b/lib/net/getVideoInfo.ts index c35bf56..897fc62 100644 --- a/lib/net/getVideoInfo.ts +++ b/lib/net/getVideoInfo.ts @@ -2,6 +2,19 @@ import netScheduler from "lib/mq/scheduler.ts"; import { VideoInfoData, VideoInfoResponse } from "lib/net/bilibili.d.ts"; import logger from "lib/log/logger.ts"; +/* + * Fetch video metadata from bilibili API + * @param {number} aid - The video's aid + * @param 
{string} task - The task name used in scheduler. It can be one of the following: + * - snapshotVideo + * - getVideoInfo + * - snapshotMilestoneVideo + * @returns {Promise} VideoInfoData or the error code returned by bilibili API + * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + */ export async function getVideoInfo(aid: number, task: string): Promise { const url = `https://api.bilibili.com/x/web-interface/view?aid=${aid}`; const data = await netScheduler.request(url, task); diff --git a/src/worker.ts b/src/worker.ts index 9998569..ba2b510 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -40,7 +40,12 @@ const latestVideoWorker = new Worker( break; } }, - { connection: redis as ConnectionOptions, concurrency: 6, removeOnComplete: { count: 1440 }, removeOnFail: { count: 0 } }, + { + connection: redis as ConnectionOptions, + concurrency: 6, + removeOnComplete: { count: 1440 }, + removeOnFail: { count: 0 }, + }, ); latestVideoWorker.on("active", () => { From 18fc9752bb06fbdb67cfa28b6b37eeab723f0161 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 18:46:25 +0800 Subject: [PATCH 18/79] fix: several bugs of snapshot scheduling --- lib/db/snapshotSchedule.ts | 49 +++++++++++++++++++++++-------------- lib/mq/exec/snapshotTick.ts | 30 ++++++++++++++++------- lib/mq/init.ts | 5 ++++ src/worker.ts | 2 ++ 4 files changed, 59 insertions(+), 27 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index bd4e805..697a680 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -2,6 +2,7 @@ import { DAY, MINUTE } from "$std/datetime/constants.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; import { SnapshotScheduleType } from "./schema.d.ts"; +import logger from "../log/logger.ts"; /* Returns true if the specified `aid` has at least one record with "pending" or "processing" status. @@ -51,6 +52,14 @@ export async function findClosestSnapshot( }; } +export async function hasAtLeast2Snapshots(client: Client, aid: number) { + const res = await client.queryObject<{ count: number }>( + `SELECT COUNT(*) FROM video_snapshot WHERE aid = $1`, + [aid], + ); + return res.rows[0].count >= 2; +} + export async function getLatestSnapshot(client: Client, aid: number): Promise { const res = await client.queryObject<{ created_at: string; views: number }>( `SELECT created_at, views FROM video_snapshot WHERE aid = $1 ORDER BY created_at DESC LIMIT 1`, @@ -89,10 +98,11 @@ export async function getSnapshotScheduleCountWithinRange(client: Client, start: * @param targetTime Scheduled time for snapshot. 
(Timestamp in milliseconds) */ export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number) { - const ajustedTime = await adjustSnapshotTime(client, new Date(targetTime)); + const adjustedTime = (await adjustSnapshotTime(client, new Date(targetTime))); + logger.log(`Scheduled snapshot for ${aid} at ${adjustedTime.toISOString()}`, "mq", "fn:scheduleSnapshot"); return client.queryObject( `INSERT INTO snapshot_schedule (aid, type, started_at) VALUES ($1, $2, $3)`, - [aid, type, ajustedTime.toISOString()], + [aid, type, adjustedTime.toISOString()], ); } @@ -124,25 +134,28 @@ export async function adjustSnapshotTime( ORDER BY w.window_start LIMIT 1; `; - for (let i = 0; i < 7; i++) { - const now = new Date(new Date().getTime() + 5 * MINUTE); - const nowTruncated = truncateTo5MinInterval(now); - const currentWindowStart = truncateTo5MinInterval(expectedStartTime); - const end = new Date(currentWindowStart.getTime() + 1 * DAY); + const now = new Date(new Date().getTime() + 5 * MINUTE); + const nowTruncated = truncateTo5MinInterval(now); + const currentWindowStart = truncateTo5MinInterval(expectedStartTime); + const end = new Date(currentWindowStart.getTime() + 1 * DAY); - const windowResult = await client.queryObject<{ window_start: Date }>( - findWindowQuery, - [nowTruncated, end], - ); + const windowResult = await client.queryObject<{ window_start: Date }>( + findWindowQuery, + [nowTruncated, end], + ); - const windowStart = windowResult.rows[0]?.window_start; - if (!windowStart) { - continue; - } - - return windowStart; + const windowStart = windowResult.rows[0]?.window_start; + if (!windowStart) { + return expectedStartTime; + } + + // Returns windowStart if it is within the next 5 minutes + if (windowStart.getTime() > new Date().getTime() + 5 * MINUTE) { + return windowStart + } + else { + return expectedStartTime; } - return expectedStartTime; } /** diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 453ac65..253629e 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -5,6 +5,7 @@ import { findClosestSnapshot, getLatestSnapshot, getSnapshotsInNextSecond, + hasAtLeast2Snapshots, scheduleSnapshot, setSnapshotStatus, videoHasActiveSchedule, @@ -17,6 +18,7 @@ import { SnapshotQueue } from "lib/mq/index.ts"; import { insertVideoSnapshot } from "../task/getVideoStats.ts"; import { NetSchedulerError } from "../scheduler.ts"; import { setBiliVideoStatus } from "../../db/allData.ts"; +import {truncate} from "../../utils/truncate.ts"; const priorityMap: { [key: string]: number } = { "milestone": 1, @@ -36,8 +38,9 @@ export const snapshotTickWorker = async (_job: Job) => { if (schedule.type && priorityMap[schedule.type]) { priority = priorityMap[schedule.type]; } + const aid = Number(schedule.aid); await SnapshotQueue.add("snapshotVideo", { - aid: schedule.aid, + aid: aid, id: schedule.id, type: schedule.type ?? 
"normal", }, { priority }); @@ -67,8 +70,10 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { const latestSnapshot = await getLatestSnapshot(client, aid); // Immediately dispatch a snapshot if there is no snapshot yet if (!latestSnapshot) return 0; + const snapshotsEnough = await hasAtLeast2Snapshots(client, aid); + if (!snapshotsEnough) return 0; - const currentTimestamp = Date.now(); + const currentTimestamp = new Date().getTime() const timeIntervals = [3 * MINUTE, 20 * MINUTE, 1 * HOUR, 3 * HOUR, 6 * HOUR]; const DELTA = 0.00001; let minETAHours = Infinity; @@ -77,13 +82,15 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { const date = new Date(currentTimestamp - timeInterval); const snapshot = await findClosestSnapshot(client, aid, date); if (!snapshot) continue; - const hoursDiff = (currentTimestamp - snapshot.created_at) / HOUR; - const viewsDiff = snapshot.views - latestSnapshot.views; + const hoursDiff = (latestSnapshot.created_at - snapshot.created_at) / HOUR; + const viewsDiff = latestSnapshot.views - snapshot.views; + if (viewsDiff <= 0) continue; const speed = viewsDiff / (hoursDiff + DELTA); const target = closetMilestone(latestSnapshot.views); const viewsToIncrease = target - latestSnapshot.views; const eta = viewsToIncrease / (speed + DELTA); - const factor = log(2.97 / log(viewsToIncrease + 1), 1.14); + let factor = log(2.97 / log(viewsToIncrease + 1), 1.14); + factor = truncate(factor, 3, 100) const adjustedETA = eta / factor; if (adjustedETA < minETAHours) { minETAHours = adjustedETA; @@ -97,12 +104,17 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { try { const videos = await getVideosNearMilestone(client); for (const video of videos) { - if (await videoHasActiveSchedule(client, video.aid)) continue; - const eta = await getAdjustedShortTermETA(client, video.aid); + const aid = Number(video.aid) + if (await videoHasActiveSchedule(client, aid)) continue; + const eta = await getAdjustedShortTermETA(client, aid); if (eta > 72) continue; const now = Date.now(); - const targetTime = now + eta * HOUR; - await scheduleSnapshot(client, video.aid, "milestone", targetTime); + const scheduledNextSnapshotDelay = eta * HOUR; + const maxInterval = 60 * MINUTE; + const minInterval = 1 * SECOND; + const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); + const targetTime = now + delay; + await scheduleSnapshot(client, aid, "milestone", targetTime); } } catch (e) { logger.error(e as Error, "mq", "fn:collectMilestoneSnapshotsWorker"); diff --git a/lib/mq/init.ts b/lib/mq/init.ts index 688dd4a..17619b2 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -18,6 +18,11 @@ export async function initMQ() { await SnapshotQueue.upsertJobScheduler("snapshotTick", { every: 1 * SECOND, immediately: true, + }, { + opts: { + removeOnComplete: 1, + removeOnFail: 1, + }, }); await SnapshotQueue.upsertJobScheduler("collectMilestoneSnapshots", { every: 5 * MINUTE, diff --git a/src/worker.ts b/src/worker.ts index ba2b510..6781035 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -14,12 +14,14 @@ import { Deno.addSignalListener("SIGINT", async () => { logger.log("SIGINT Received: Shutting down workers...", "mq"); await latestVideoWorker.close(true); + await snapshotWorker.close(true); Deno.exit(); }); Deno.addSignalListener("SIGTERM", async () => { logger.log("SIGTERM Received: Shutting down workers...", "mq"); await latestVideoWorker.close(true); + await snapshotWorker.close(true); Deno.exit(); }); From 
8652ac8fb736ab07b227aaa796d6b1fe299dff4c Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 19:45:52 +0800 Subject: [PATCH 19/79] fix: bugs of snapshot scheduling --- lib/db/snapshotSchedule.ts | 56 ++++++++++++++++++++++++------------- lib/mq/exec/snapshotTick.ts | 51 +++++++++++++++++++++++---------- lib/mq/init.ts | 6 ++++ src/worker.ts | 5 +++- 4 files changed, 84 insertions(+), 34 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 697a680..9ce304b 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -1,7 +1,7 @@ -import { DAY, MINUTE } from "$std/datetime/constants.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; -import { SnapshotScheduleType } from "./schema.d.ts"; +import {DAY, HOUR, MINUTE} from "$std/datetime/constants.ts"; +import {Client} from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import {formatTimestampToPsql} from "lib/utils/formatTimestampToPostgre.ts"; +import {SnapshotScheduleType} from "./schema.d.ts"; import logger from "../log/logger.ts"; /* @@ -98,7 +98,8 @@ export async function getSnapshotScheduleCountWithinRange(client: Client, start: * @param targetTime Scheduled time for snapshot. (Timestamp in milliseconds) */ export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number) { - const adjustedTime = (await adjustSnapshotTime(client, new Date(targetTime))); + const allowedCount = type === "milestone" ? 2000 : 800; + const adjustedTime = await adjustSnapshotTime(client, new Date(targetTime), allowedCount); logger.log(`Scheduled snapshot for ${aid} at ${adjustedTime.toISOString()}`, "mq", "fn:scheduleSnapshot"); return client.queryObject( `INSERT INTO snapshot_schedule (aid, type, started_at) VALUES ($1, $2, $3)`, @@ -110,18 +111,20 @@ export async function scheduleSnapshot(client: Client, aid: number, type: string * Adjust the trigger time of the snapshot to ensure it does not exceed the frequency limit * @param client PostgreSQL client * @param expectedStartTime The expected snapshot time + * @param allowedCounts The number of snapshots allowed in the 5-minutes windows. 
* @returns The adjusted actual snapshot time */ export async function adjustSnapshotTime( client: Client, expectedStartTime: Date, + allowedCounts: number = 2000 ): Promise { const findWindowQuery = ` WITH windows AS ( SELECT generate_series( - $1::timestamp, -- Start time: current time truncated to the nearest 5-minute window - $2::timestamp, -- End time: 24 hours after the target time window starts - INTERVAL '5 MINUTES' + $1::timestamp, -- Start time: current time truncated to the nearest 5-minute window + $2::timestamp, -- End time: 24 hours after the target time window starts + INTERVAL '5 MINUTES' ) AS window_start ) SELECT w.window_start @@ -130,30 +133,34 @@ export async function adjustSnapshotTime( AND s.started_at < w.window_start + INTERVAL '5 MINUTES' AND s.status = 'pending' GROUP BY w.window_start - HAVING COUNT(s.*) < 2000 + HAVING COUNT(s.*) < ${allowedCounts} ORDER BY w.window_start LIMIT 1; `; - const now = new Date(new Date().getTime() + 5 * MINUTE); - const nowTruncated = truncateTo5MinInterval(now); - const currentWindowStart = truncateTo5MinInterval(expectedStartTime); - const end = new Date(currentWindowStart.getTime() + 1 * DAY); + const now = new Date(); + const targetTime = expectedStartTime.getTime(); + let start = new Date(targetTime - 2 * HOUR); + if (start.getTime() <= now.getTime()) { + start = now; + } + const startTruncated = truncateTo5MinInterval(start); + const end = new Date(startTruncated.getTime() + 1 * DAY); const windowResult = await client.queryObject<{ window_start: Date }>( findWindowQuery, - [nowTruncated, end], + [startTruncated, end], ); + const windowStart = windowResult.rows[0]?.window_start; if (!windowStart) { return expectedStartTime; } - // Returns windowStart if it is within the next 5 minutes if (windowStart.getTime() > new Date().getTime() + 5 * MINUTE) { - return windowStart - } - else { + const randomDelay = Math.floor(Math.random() * 5 * MINUTE); + return new Date(windowStart.getTime() + randomDelay); + } else { return expectedStartTime; } } @@ -189,8 +196,19 @@ export async function getSnapshotsInNextSecond(client: Client) { } export async function setSnapshotStatus(client: Client, id: number, status: string) { - return client.queryObject( + return await client.queryObject( `UPDATE snapshot_schedule SET status = $2 WHERE id = $1`, [id, status], ); } + +export async function getVideosWithoutActiveSnapshotSchedule(client: Client) { + const query: string = ` + SELECT s.aid + FROM songs s + LEFT JOIN snapshot_schedule ss ON s.aid = ss.aid AND (ss.status = 'pending' OR ss.status = 'processing') + WHERE ss.aid IS NULL + `; + const res = await client.queryObject<{ aid: number }>(query, []); + return res.rows.map((r) => Number(r.aid)); +} diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 253629e..0d1bf44 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -4,7 +4,7 @@ import { getVideosNearMilestone } from "lib/db/snapshot.ts"; import { findClosestSnapshot, getLatestSnapshot, - getSnapshotsInNextSecond, + getSnapshotsInNextSecond, getVideosWithoutActiveSnapshotSchedule, hasAtLeast2Snapshots, scheduleSnapshot, setSnapshotStatus, @@ -15,13 +15,14 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; import logger from "lib/log/logger.ts"; import { SnapshotQueue } from "lib/mq/index.ts"; -import { insertVideoSnapshot } from "../task/getVideoStats.ts"; -import { NetSchedulerError } from "../scheduler.ts"; 
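The adjustSnapshotTime rewrite above buckets pending schedules into 5-minute windows and then spreads new work inside the chosen window. A minimal standalone sketch of that bucketing idea, assuming epoch-aligned windows and illustrative names (not part of the repository):

const MINUTE = 60_000;
const WINDOW_MS = 5 * MINUTE;

// Truncate a timestamp (in milliseconds) to the start of its 5-minute window.
function windowStartOf(ts: number): number {
	return Math.floor(ts / WINDOW_MS) * WINDOW_MS;
}

// Place a snapshot somewhere inside its window, mirroring the
// `windowStart + randomDelay` step used by adjustSnapshotTime.
function spreadWithinWindow(ts: number): Date {
	const randomDelay = Math.floor(Math.random() * WINDOW_MS);
	return new Date(windowStartOf(ts) + randomDelay);
}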
-import { setBiliVideoStatus } from "../../db/allData.ts"; -import {truncate} from "../../utils/truncate.ts"; +import { insertVideoSnapshot } from "lib/mq/task/getVideoStats.ts"; +import { NetSchedulerError } from "lib/mq/scheduler.ts"; +import { setBiliVideoStatus } from "lib/db/allData.ts"; +import { truncate } from "lib/utils/truncate.ts"; const priorityMap: { [key: string]: number } = { "milestone": 1, + "normal": 3, }; const snapshotTypeToTaskMap: { [key: string]: string } = { @@ -73,7 +74,7 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { const snapshotsEnough = await hasAtLeast2Snapshots(client, aid); if (!snapshotsEnough) return 0; - const currentTimestamp = new Date().getTime() + const currentTimestamp = new Date().getTime(); const timeIntervals = [3 * MINUTE, 20 * MINUTE, 1 * HOUR, 3 * HOUR, 6 * HOUR]; const DELTA = 0.00001; let minETAHours = Infinity; @@ -90,7 +91,7 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { const viewsToIncrease = target - latestSnapshot.views; const eta = viewsToIncrease / (speed + DELTA); let factor = log(2.97 / log(viewsToIncrease + 1), 1.14); - factor = truncate(factor, 3, 100) + factor = truncate(factor, 3, 100); const adjustedETA = eta / factor; if (adjustedETA < minETAHours) { minETAHours = adjustedETA; @@ -104,13 +105,13 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { try { const videos = await getVideosNearMilestone(client); for (const video of videos) { - const aid = Number(video.aid) + const aid = Number(video.aid); if (await videoHasActiveSchedule(client, aid)) continue; const eta = await getAdjustedShortTermETA(client, aid); if (eta > 72) continue; const now = Date.now(); const scheduledNextSnapshotDelay = eta * HOUR; - const maxInterval = 60 * MINUTE; + const maxInterval = 4 * HOUR; const minInterval = 1 * SECOND; const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); const targetTime = now + delay; @@ -123,11 +124,31 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { } }; +export const regularSnapshotsWorker = async (_job: Job) => { + const client = await db.connect(); + try { + const aids = await getVideosWithoutActiveSnapshotSchedule(client); + for (const rawAid of aids) { + const aid = Number(rawAid); + if (await videoHasActiveSchedule(client, aid)) continue; + const now = Date.now(); + const targetTime = now + 24 * HOUR; + await scheduleSnapshot(client, aid, "normal", targetTime); + } + } catch (e) { + logger.error(e as Error, "mq", "fn:regularSnapshotsWorker"); + } finally { + client.release(); + } +}; + export const takeSnapshotForVideoWorker = async (job: Job) => { const id = job.data.id; - const aid = job.data.aid; - const task = snapshotTypeToTaskMap[job.data.type] ?? "snapshotVideo"; + const aid = Number(job.data.aid); + const type = job.data.type; + const task = snapshotTypeToTaskMap[type] ?? "snapshotVideo"; const client = await db.connect(); + const retryInterval = type === "milestone" ? 
5 * SECOND : 2 * MINUTE; try { if (await videoHasProcessingSchedule(client, aid)) { return `ALREADY_PROCESSING`; @@ -139,12 +160,14 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { await setSnapshotStatus(client, id, "completed"); return `BILI_STATUS_${stat}`; } + await setSnapshotStatus(client, id, "completed"); + if (type !== "milestone") return `DONE`; const eta = await getAdjustedShortTermETA(client, aid); if (eta > 72) return "ETA_TOO_LONG"; const now = Date.now(); const targetTime = now + eta * HOUR; - await setSnapshotStatus(client, id, "completed"); - await scheduleSnapshot(client, aid, "milestone", targetTime); + await scheduleSnapshot(client, aid, type, targetTime); + return `DONE`; } catch (e) { if (e instanceof NetSchedulerError && e.code === "NO_PROXY_AVAILABLE") { logger.warn( @@ -153,7 +176,7 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { "fn:takeSnapshotForVideoWorker", ); await setSnapshotStatus(client, id, "completed"); - await scheduleSnapshot(client, aid, "milestone", Date.now() + 5 * SECOND); + await scheduleSnapshot(client, aid, type, Date.now() + retryInterval); return; } logger.error(e as Error, "mq", "fn:takeSnapshotForVideoWorker"); diff --git a/lib/mq/init.ts b/lib/mq/init.ts index 17619b2..f019f28 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -24,10 +24,16 @@ export async function initMQ() { removeOnFail: 1, }, }); + await SnapshotQueue.upsertJobScheduler("collectMilestoneSnapshots", { every: 5 * MINUTE, immediately: true, }); + await SnapshotQueue.upsertJobScheduler("dispatchRegularSnapshots", { + every: 30 * MINUTE, + immediately: true, + }); + logger.log("Message queue initialized."); } diff --git a/src/worker.ts b/src/worker.ts index 6781035..7362864 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -6,7 +6,7 @@ import { lockManager } from "lib/mq/lockManager.ts"; import { WorkerError } from "lib/mq/schema.ts"; import { getVideoInfoWorker } from "lib/mq/exec/getLatestVideos.ts"; import { - collectMilestoneSnapshotsWorker, + collectMilestoneSnapshotsWorker, regularSnapshotsWorker, snapshotTickWorker, takeSnapshotForVideoWorker, } from "lib/mq/exec/snapshotTick.ts"; @@ -76,6 +76,9 @@ const snapshotWorker = new Worker( case "collectMilestoneSnapshots": await collectMilestoneSnapshotsWorker(job); break; + case "dispatchRegularSnapshots": + await regularSnapshotsWorker(job); + break; default: break; } From 7768a202b2f54681fd5de7ed50479c476738af70 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 20:23:00 +0800 Subject: [PATCH 20/79] fix: failed to serialize bigint correctly --- lib/db/snapshot.ts | 17 +++++++++++++++++ lib/mq/exec/snapshotTick.ts | 12 +++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index be7ec26..09f14b5 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -20,3 +20,20 @@ export async function getVideosNearMilestone(client: Client) { }; }); } + +export async function getLatestVideoSnapshot(client: Client, aid: number): Promise { + const queryResult = await client.queryObject(` + SELECT * + FROM latest_video_snapshot + WHERE aid = $1 + `, [aid]); + if (queryResult.rows.length === 0) { + return null; + } + return queryResult.rows.map((row) => { + return { + ...row, + aid: Number(row.aid), + } + })[0]; +} diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 0d1bf44..95457e0 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -1,6 +1,6 @@ import { Job } from "bullmq"; 
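Patch 20 is about Postgres BIGINT columns (such as aid) surfacing as JavaScript bigint values, which JSON.stringify rejects when the value ends up in a queue payload such as BullMQ job data. A small illustration of the failure mode and the Number(...) conversion used above, with made-up sample data:

const row = { aid: 170001n, views: 12345 }; // shape a BIGINT column can produce

// JSON.stringify(row) would throw a TypeError ("Do not know how to serialize a BigInt").
// Converting before enqueueing, as getLatestVideoSnapshot and snapshotTickWorker do, avoids it:
const payload = { ...row, aid: Number(row.aid) };
console.log(JSON.stringify(payload)); // {"aid":170001,"views":12345}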
import { db } from "lib/db/init.ts"; -import { getVideosNearMilestone } from "lib/db/snapshot.ts"; +import {getLatestVideoSnapshot, getVideosNearMilestone} from "lib/db/snapshot.ts"; import { findClosestSnapshot, getLatestSnapshot, @@ -42,7 +42,7 @@ export const snapshotTickWorker = async (_job: Job) => { const aid = Number(schedule.aid); await SnapshotQueue.add("snapshotVideo", { aid: aid, - id: schedule.id, + id: Number(schedule.id), type: schedule.type ?? "normal", }, { priority }); } @@ -131,8 +131,10 @@ export const regularSnapshotsWorker = async (_job: Job) => { for (const rawAid of aids) { const aid = Number(rawAid); if (await videoHasActiveSchedule(client, aid)) continue; + const latestSnapshot = await getLatestVideoSnapshot(client, aid); const now = Date.now(); - const targetTime = now + 24 * HOUR; + const lastSnapshotedAt = latestSnapshot?.time ?? now; + const targetTime = truncate(lastSnapshotedAt + 24 * HOUR, now + 1, Infinity); await scheduleSnapshot(client, aid, "normal", targetTime); } } catch (e) { @@ -161,6 +163,10 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { return `BILI_STATUS_${stat}`; } await setSnapshotStatus(client, id, "completed"); + if (type === "normal") { + await scheduleSnapshot(client, aid, type, Date.now() + 24 * HOUR); + return `DONE`; + } if (type !== "milestone") return `DONE`; const eta = await getAdjustedShortTermETA(client, aid); if (eta > 72) return "ETA_TOO_LONG"; From b654eb3643a6bcd7dc9c7f089e657be74aebd8a5 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 20:30:06 +0800 Subject: [PATCH 21/79] fix: potential NaN as delay time when scheduling --- lib/mq/exec/snapshotTick.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 95457e0..0919164 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -97,6 +97,11 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { minETAHours = adjustedETA; } } + + if (isNaN(minETAHours)) { + minETAHours = Infinity; + } + return minETAHours; }; From 33c63fc29f3eb7d6b7e1bec9e97d5db97142b1f9 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 20:41:13 +0800 Subject: [PATCH 22/79] fix: NaN caused by not converting date string to timestamp --- lib/db/snapshot.ts | 1 + lib/mq/exec/snapshotTick.ts | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index 09f14b5..34ffe82 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -34,6 +34,7 @@ export async function getLatestVideoSnapshot(client: Client, aid: number): Promi return { ...row, aid: Number(row.aid), + time: new Date(row.time).getTime(), } })[0]; } diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 0919164..be0b3e2 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -12,7 +12,7 @@ import { videoHasProcessingSchedule, } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; +import { WEEK, HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; import logger from "lib/log/logger.ts"; import { SnapshotQueue } from "lib/mq/index.ts"; import { insertVideoSnapshot } from "lib/mq/task/getVideoStats.ts"; @@ -139,7 +139,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { const latestSnapshot = await getLatestVideoSnapshot(client, aid); const now = Date.now(); const 
lastSnapshotedAt = latestSnapshot?.time ?? now; - const targetTime = truncate(lastSnapshotedAt + 24 * HOUR, now + 1, Infinity); + const targetTime = truncate(lastSnapshotedAt + 24 * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); } } catch (e) { From 9e764746fb694d39190bd490a4cafe24c1a62bcf Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 20:49:41 +0800 Subject: [PATCH 23/79] update: better task retrieval for snapshotTick add: logging when snapshot the video --- lib/db/snapshotSchedule.ts | 15 ++++++++++----- lib/mq/task/getVideoStats.ts | 3 +++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 9ce304b..7968cba 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -185,11 +185,16 @@ function truncateTo5MinInterval(timestamp: Date): Date { export async function getSnapshotsInNextSecond(client: Client) { const query = ` - SELECT * - FROM snapshot_schedule - WHERE started_at - BETWEEN NOW() - INTERVAL '5 seconds' - AND NOW() + INTERVAL '1 seconds' + SELECT * + FROM snapshot_schedule + WHERE started_at <= NOW() + INTERVAL '1 seconds' AND status = 'pending' + ORDER BY + CASE + WHEN type = 'milestone' THEN 0 + ELSE 1 + END, + started_at + LIMIT 3; `; const res = await client.queryObject(query, []); return res.rows; diff --git a/lib/mq/task/getVideoStats.ts b/lib/mq/task/getVideoStats.ts index 8e3530a..3be1cd7 100644 --- a/lib/mq/task/getVideoStats.ts +++ b/lib/mq/task/getVideoStats.ts @@ -1,6 +1,7 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { getVideoInfo } from "lib/net/getVideoInfo.ts"; import { LatestSnapshotType } from "lib/db/schema.d.ts"; +import logger from "lib/log/logger.ts"; /* * Fetch video stats from bilibili API and insert into database @@ -39,6 +40,8 @@ export async function insertVideoSnapshot( [aid, views, danmakus, replies, likes, coins, shares, favorites], ); + logger.log(`Taken snapshot for video ${aid}.`, "net", "fn:insertVideoSnapshot"); + const snapshot: LatestSnapshotType = { aid, views, From 3a0dd26c682d8e135be419d230c9570447d8d10a Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 21:36:40 +0800 Subject: [PATCH 24/79] fix: only 1 pending task at a time --- lib/db/snapshotSchedule.ts | 1 + lib/mq/exec/snapshotTick.ts | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 7968cba..8d64f36 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -98,6 +98,7 @@ export async function getSnapshotScheduleCountWithinRange(client: Client, start: * @param targetTime Scheduled time for snapshot. (Timestamp in milliseconds) */ export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number) { + if (await videoHasActiveSchedule(client, aid)) return; const allowedCount = type === "milestone" ? 
2000 : 800; const adjustedTime = await adjustSnapshotTime(client, new Date(targetTime), allowedCount); logger.log(`Scheduled snapshot for ${aid} at ${adjustedTime.toISOString()}`, "mq", "fn:scheduleSnapshot"); diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index be0b3e2..519ecf3 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -8,7 +8,6 @@ import { hasAtLeast2Snapshots, scheduleSnapshot, setSnapshotStatus, - videoHasActiveSchedule, videoHasProcessingSchedule, } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; @@ -111,7 +110,6 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const videos = await getVideosNearMilestone(client); for (const video of videos) { const aid = Number(video.aid); - if (await videoHasActiveSchedule(client, aid)) continue; const eta = await getAdjustedShortTermETA(client, aid); if (eta > 72) continue; const now = Date.now(); @@ -135,7 +133,6 @@ export const regularSnapshotsWorker = async (_job: Job) => { const aids = await getVideosWithoutActiveSnapshotSchedule(client); for (const rawAid of aids) { const aid = Number(rawAid); - if (await videoHasActiveSchedule(client, aid)) continue; const latestSnapshot = await getLatestVideoSnapshot(client, aid); const now = Date.now(); const lastSnapshotedAt = latestSnapshot?.time ?? now; From 35d58be8fd945bcf5b9576654c3e3d75b19f57c8 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 21:42:05 +0800 Subject: [PATCH 25/79] fix: may accessing a non-existent schedule --- lib/db/snapshotSchedule.ts | 10 +++++++++- lib/mq/exec/snapshotTick.ts | 5 +++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 8d64f36..2c56f0d 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -2,7 +2,15 @@ import {DAY, HOUR, MINUTE} from "$std/datetime/constants.ts"; import {Client} from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import {formatTimestampToPsql} from "lib/utils/formatTimestampToPostgre.ts"; import {SnapshotScheduleType} from "./schema.d.ts"; -import logger from "../log/logger.ts"; +import logger from "lib/log/logger.ts"; + +export async function snapshotScheduleExists(client: Client, id: number) { + const res = await client.queryObject<{ id: number }>( + `SELECT id FROM snapshot_schedule WHERE id = $1`, + [id], + ); + return res.rows.length > 0; +} /* Returns true if the specified `aid` has at least one record with "pending" or "processing" status. diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 519ecf3..9843788 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -8,6 +8,7 @@ import { hasAtLeast2Snapshots, scheduleSnapshot, setSnapshotStatus, + snapshotScheduleExists, videoHasProcessingSchedule, } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; @@ -153,6 +154,10 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { const task = snapshotTypeToTaskMap[type] ?? "snapshotVideo"; const client = await db.connect(); const retryInterval = type === "milestone" ? 
5 * SECOND : 2 * MINUTE; + const exists = await snapshotScheduleExists(client, id); + if (!exists) { + return; + } try { if (await videoHasProcessingSchedule(client, aid)) { return `ALREADY_PROCESSING`; From f9dd53c2508ed8671ebded79e8c34f5aaf2d8c80 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 21:48:42 +0800 Subject: [PATCH 26/79] update: remove old jobScheduler --- lib/mq/init.ts | 2 ++ lib/mq/scheduler.ts | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/mq/init.ts b/lib/mq/init.ts index f019f28..0e377f1 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -35,5 +35,7 @@ export async function initMQ() { immediately: true, }); + await SnapshotQueue.removeJobScheduler('scheduleSnapshotTick'); + logger.log("Message queue initialized."); } diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index a711a11..748ef45 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -4,7 +4,6 @@ import { SlidingWindow } from "lib/mq/slidingWindow.ts"; import { redis } from "lib/db/redis.ts"; import Redis from "ioredis"; import { SECOND } from "$std/datetime/constants.ts"; -import { randomUUID } from "node:crypto"; interface Proxy { type: string; From 0be961e70958aafd69605443bd79335dc2bc49fd Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 23 Mar 2025 23:31:16 +0800 Subject: [PATCH 27/79] improve: target time finding --- lib/db/snapshot.ts | 9 ++- lib/db/snapshotSchedule.ts | 115 +++++++++++++++++------------------- lib/mq/exec/snapshotTick.ts | 11 +++- lib/mq/init.ts | 2 +- src/worker.ts | 3 +- 5 files changed, 72 insertions(+), 68 deletions(-) diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index 34ffe82..c160d79 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -22,11 +22,14 @@ export async function getVideosNearMilestone(client: Client) { } export async function getLatestVideoSnapshot(client: Client, aid: number): Promise { - const queryResult = await client.queryObject(` + const queryResult = await client.queryObject( + ` SELECT * FROM latest_video_snapshot WHERE aid = $1 - `, [aid]); + `, + [aid], + ); if (queryResult.rows.length === 0) { return null; } @@ -35,6 +38,6 @@ export async function getLatestVideoSnapshot(client: Client, aid: number): Promi ...row, aid: Number(row.aid), time: new Date(row.time).getTime(), - } + }; })[0]; } diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 2c56f0d..4377d94 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -1,8 +1,8 @@ -import {DAY, HOUR, MINUTE} from "$std/datetime/constants.ts"; -import {Client} from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import {formatTimestampToPsql} from "lib/utils/formatTimestampToPostgre.ts"; -import {SnapshotScheduleType} from "./schema.d.ts"; +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; +import { SnapshotScheduleType } from "./schema.d.ts"; import logger from "lib/log/logger.ts"; +import { MINUTE } from "$std/datetime/constants.ts"; export async function snapshotScheduleExists(client: Client, id: number) { const res = await client.queryObject<{ id: number }>( @@ -120,78 +120,73 @@ export async function scheduleSnapshot(client: Client, aid: number, type: string * Adjust the trigger time of the snapshot to ensure it does not exceed the frequency limit * @param client PostgreSQL client * @param expectedStartTime The expected snapshot time - * @param allowedCounts The number of snapshots allowed in the 5-minutes 
windows. - * @returns The adjusted actual snapshot time + * @param allowedCounts The number of snapshots allowed in a 5-minute window (default: 2000) + * @returns The adjusted actual snapshot time within the first available window */ export async function adjustSnapshotTime( client: Client, expectedStartTime: Date, - allowedCounts: number = 2000 + allowedCounts: number = 2000, ): Promise { + // Query to find the closest available window by checking both past and future windows const findWindowQuery = ` - WITH windows AS ( - SELECT generate_series( - $1::timestamp, -- Start time: current time truncated to the nearest 5-minute window - $2::timestamp, -- End time: 24 hours after the target time window starts - INTERVAL '5 MINUTES' - ) AS window_start - ) - SELECT w.window_start - FROM windows w - LEFT JOIN snapshot_schedule s ON s.started_at >= w.window_start - AND s.started_at < w.window_start + INTERVAL '5 MINUTES' + WITH base AS ( + SELECT + date_trunc('minute', $1::timestamp) + - (EXTRACT(minute FROM $1::timestamp)::int % 5 * INTERVAL '1 minute') AS base_time + ), + offsets AS ( + SELECT generate_series(-100, 100) AS "offset" + ), + candidate_windows AS ( + SELECT + (base.base_time + ("offset" * INTERVAL '5 minutes')) AS window_start, + ABS("offset") AS distance + FROM base + CROSS JOIN offsets + ) + SELECT + window_start + FROM + candidate_windows cw + LEFT JOIN + snapshot_schedule s + ON + s.started_at >= cw.window_start + AND s.started_at < cw.window_start + INTERVAL '5 minutes' AND s.status = 'pending' - GROUP BY w.window_start - HAVING COUNT(s.*) < ${allowedCounts} - ORDER BY w.window_start - LIMIT 1; - `; - const now = new Date(); - const targetTime = expectedStartTime.getTime(); - let start = new Date(targetTime - 2 * HOUR); - if (start.getTime() <= now.getTime()) { - start = now; - } - const startTruncated = truncateTo5MinInterval(start); - const end = new Date(startTruncated.getTime() + 1 * DAY); + GROUP BY + cw.window_start, cw.distance + HAVING + COUNT(s.*) < $2 + ORDER BY + cw.distance, cw.window_start + LIMIT 1; + `; - const windowResult = await client.queryObject<{ window_start: Date }>( - findWindowQuery, - [startTruncated, end], - ); + try { + // Execute query to find the first available window + const windowResult = await client.queryObject<{ window_start: Date }>( + findWindowQuery, + [expectedStartTime, allowedCounts], + ); + // If no available window found, return original time (may exceed limit) + if (windowResult.rows.length === 0) { + return expectedStartTime; + } - const windowStart = windowResult.rows[0]?.window_start; - if (!windowStart) { - return expectedStartTime; - } + // Get the target window start time + const windowStart = windowResult.rows[0].window_start; - if (windowStart.getTime() > new Date().getTime() + 5 * MINUTE) { + // Add random delay within the 5-minute window to distribute load const randomDelay = Math.floor(Math.random() * 5 * MINUTE); return new Date(windowStart.getTime() + randomDelay); - } else { - return expectedStartTime; + } catch { + return expectedStartTime; // Fallback to original time } } -/** - * Truncate the timestamp to the nearest 5-minute interval - * @param timestamp The timestamp - * @returns The truncated time - */ -function truncateTo5MinInterval(timestamp: Date): Date { - const minutes = timestamp.getMinutes() - (timestamp.getMinutes() % 5); - return new Date( - timestamp.getFullYear(), - timestamp.getMonth(), - timestamp.getDate(), - timestamp.getHours(), - minutes, - 0, - 0, - ); -} - export async function 
getSnapshotsInNextSecond(client: Client) { const query = ` SELECT * diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 9843788..b94eb81 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -1,10 +1,11 @@ import { Job } from "bullmq"; import { db } from "lib/db/init.ts"; -import {getLatestVideoSnapshot, getVideosNearMilestone} from "lib/db/snapshot.ts"; +import { getLatestVideoSnapshot, getVideosNearMilestone } from "lib/db/snapshot.ts"; import { findClosestSnapshot, getLatestSnapshot, - getSnapshotsInNextSecond, getVideosWithoutActiveSnapshotSchedule, + getSnapshotsInNextSecond, + getVideosWithoutActiveSnapshotSchedule, hasAtLeast2Snapshots, scheduleSnapshot, setSnapshotStatus, @@ -12,7 +13,7 @@ import { videoHasProcessingSchedule, } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { WEEK, HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; +import { HOUR, MINUTE, SECOND, WEEK } from "$std/datetime/constants.ts"; import logger from "lib/log/logger.ts"; import { SnapshotQueue } from "lib/mq/index.ts"; import { insertVideoSnapshot } from "lib/mq/task/getVideoStats.ts"; @@ -107,6 +108,7 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); + const startedAt = Date.now(); try { const videos = await getVideosNearMilestone(client); for (const video of videos) { @@ -120,6 +122,9 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); const targetTime = now + delay; await scheduleSnapshot(client, aid, "milestone", targetTime); + if (now - startedAt > 25 * MINUTE) { + return; + } } } catch (e) { logger.error(e as Error, "mq", "fn:collectMilestoneSnapshotsWorker"); diff --git a/lib/mq/init.ts b/lib/mq/init.ts index 0e377f1..8466280 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -35,7 +35,7 @@ export async function initMQ() { immediately: true, }); - await SnapshotQueue.removeJobScheduler('scheduleSnapshotTick'); + await SnapshotQueue.removeJobScheduler("scheduleSnapshotTick"); logger.log("Message queue initialized."); } diff --git a/src/worker.ts b/src/worker.ts index 7362864..8198d09 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -6,7 +6,8 @@ import { lockManager } from "lib/mq/lockManager.ts"; import { WorkerError } from "lib/mq/schema.ts"; import { getVideoInfoWorker } from "lib/mq/exec/getLatestVideos.ts"; import { - collectMilestoneSnapshotsWorker, regularSnapshotsWorker, + collectMilestoneSnapshotsWorker, + regularSnapshotsWorker, snapshotTickWorker, takeSnapshotForVideoWorker, } from "lib/mq/exec/snapshotTick.ts"; From 9060d2882339dbd1ad03aebb777a888c2873de8e Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 00:57:48 +0800 Subject: [PATCH 28/79] fix: prevent dispatchRegularSnapshots from running for too long --- lib/mq/exec/snapshotTick.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index b94eb81..c4882c4 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -108,7 +108,6 @@ const getAdjustedShortTermETA = async (client: Client, aid: number) => { export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); - const startedAt = Date.now(); try { const videos = await 
getVideosNearMilestone(client); for (const video of videos) { @@ -122,9 +121,6 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); const targetTime = now + delay; await scheduleSnapshot(client, aid, "milestone", targetTime); - if (now - startedAt > 25 * MINUTE) { - return; - } } } catch (e) { logger.error(e as Error, "mq", "fn:collectMilestoneSnapshotsWorker"); @@ -135,6 +131,7 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { export const regularSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); + const startedAt = Date.now(); try { const aids = await getVideosWithoutActiveSnapshotSchedule(client); for (const rawAid of aids) { @@ -144,6 +141,9 @@ export const regularSnapshotsWorker = async (_job: Job) => { const lastSnapshotedAt = latestSnapshot?.time ?? now; const targetTime = truncate(lastSnapshotedAt + 24 * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); + if (now - startedAt > 25 * MINUTE) { + return; + } } } catch (e) { logger.error(e as Error, "mq", "fn:regularSnapshotsWorker"); From 723b6090c49ef1fcf54703b2af347131e6c70b63 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 01:06:48 +0800 Subject: [PATCH 29/79] fix: 1 dispatchRegularSnapshots working at a time --- lib/mq/exec/snapshotTick.ts | 3 +++ src/worker.ts | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index c4882c4..8fd7247 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -20,6 +20,7 @@ import { insertVideoSnapshot } from "lib/mq/task/getVideoStats.ts"; import { NetSchedulerError } from "lib/mq/scheduler.ts"; import { setBiliVideoStatus } from "lib/db/allData.ts"; import { truncate } from "lib/utils/truncate.ts"; +import { lockManager } from "lib/mq/lockManager.ts"; const priorityMap: { [key: string]: number } = { "milestone": 1, @@ -132,6 +133,7 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { export const regularSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); const startedAt = Date.now(); + await lockManager.acquireLock("dispatchRegularSnapshots"); try { const aids = await getVideosWithoutActiveSnapshotSchedule(client); for (const rawAid of aids) { @@ -148,6 +150,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { } catch (e) { logger.error(e as Error, "mq", "fn:regularSnapshotsWorker"); } finally { + lockManager.releaseLock("dispatchRegularSnapshots"); client.release(); } }; diff --git a/src/worker.ts b/src/worker.ts index 8198d09..aa11c25 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -91,3 +91,7 @@ snapshotWorker.on("error", (err) => { const e = err as WorkerError; logger.error(e.rawError, e.service, e.codePath); }); + +snapshotWorker.on("closed", async () => { + await lockManager.releaseLock("dispatchRegularSnapshots"); +}) From 0455abce2e9916cc1ad0d3375b3894b789c13623 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 01:10:27 +0800 Subject: [PATCH 30/79] fix: did not quit when job is already workingx --- lib/mq/exec/snapshotTick.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 8fd7247..7eb93e2 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -133,7 +133,11 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => 
{ export const regularSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); const startedAt = Date.now(); - await lockManager.acquireLock("dispatchRegularSnapshots"); + if (await lockManager.isLocked("dispatchRegularSnapshots")) { + logger.log("dispatchRegularSnapshots is already running", "mq"); + return; + } + await lockManager.acquireLock("dispatchRegularSnapshots", 30 * 60); try { const aids = await getVideosWithoutActiveSnapshotSchedule(client); for (const rawAid of aids) { From 3028dc13c7756e5f66ef34ecb19042fad80ec730 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 02:35:55 +0800 Subject: [PATCH 31/79] improve: performance when dispatching jobs --- lib/db/snapshotSchedule.ts | 163 ++++++++++++++++++------------- lib/mq/exec/snapshotTick.ts | 2 +- lib/mq/init.ts | 75 ++++++++------ src/worker.ts | 2 +- test/db/snapshotSchedule.test.ts | 18 ---- 5 files changed, 143 insertions(+), 117 deletions(-) delete mode 100644 test/db/snapshotSchedule.test.ts diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 4377d94..bc8a039 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -2,7 +2,62 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; import { SnapshotScheduleType } from "./schema.d.ts"; import logger from "lib/log/logger.ts"; -import { MINUTE } from "$std/datetime/constants.ts"; +import { DAY, MINUTE } from "$std/datetime/constants.ts"; +import { redis } from "lib/db/redis.ts"; +import { Redis } from "ioredis"; + +const WINDOW_SIZE = 2880; // 每天 2880 个 5 分钟窗口 +const REDIS_KEY = "cvsa:snapshot_window_counts"; // Redis Key 名称 + +// 获取当前时间对应的窗口索引 +function getCurrentWindowIndex(): number { + const now = new Date(); + const minutesSinceMidnight = now.getHours() * 60 + now.getMinutes(); + const currentWindow = Math.floor(minutesSinceMidnight / 5); + return currentWindow; +} + +// 刷新内存数组 +export async function refreshSnapshotWindowCounts(client: Client, redisClient: Redis) { + const now = new Date(); + const startTime = now.getTime(); + + const result = await client.queryObject<{ window_start: Date; count: number }>` + SELECT + date_trunc('hour', started_at) + + (EXTRACT(minute FROM started_at)::int / 5 * INTERVAL '5 minutes') AS window_start, + COUNT(*) AS count + FROM snapshot_schedule + WHERE started_at >= NOW() AND status = 'pending' AND started_at <= NOW() + INTERVAL '10 days' + GROUP BY 1 + ORDER BY window_start + ` + + await redisClient.del(REDIS_KEY); + + for (const row of result.rows) { + const offset = Math.floor((row.window_start.getTime() - startTime) / (5 * MINUTE)); + if (offset >= 0 && offset < WINDOW_SIZE) { + await redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); + } + } +} + +export async function initSnapshotWindowCounts(client: Client, redisClient: Redis) { + await refreshSnapshotWindowCounts(client, redisClient); + setInterval(async () => { + await refreshSnapshotWindowCounts(client, redisClient); + }, 5 * MINUTE); +} + +async function getWindowCount(redisClient: Redis, offset: number): Promise { + const count = await redisClient.hget(REDIS_KEY, offset.toString()); + return count ? 
parseInt(count, 10) : 0; +} + +async function updateWindowCount(redisClient: Redis, offset: number, increment: number): Promise { + await redisClient.hincrby(REDIS_KEY, offset.toString(), increment); +} export async function snapshotScheduleExists(client: Client, id: number) { const res = await client.queryObject<{ id: number }>( @@ -12,9 +67,6 @@ export async function snapshotScheduleExists(client: Client, id: number) { return res.rows.length > 0; } -/* - Returns true if the specified `aid` has at least one record with "pending" or "processing" status. -*/ export async function videoHasActiveSchedule(client: Client, aid: number) { const res = await client.queryObject<{ status: string }>( `SELECT status FROM snapshot_schedule WHERE aid = $1 AND (status = 'pending' OR status = 'processing')`, @@ -107,8 +159,10 @@ export async function getSnapshotScheduleCountWithinRange(client: Client, start: */ export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number) { if (await videoHasActiveSchedule(client, aid)) return; - const allowedCount = type === "milestone" ? 2000 : 800; - const adjustedTime = await adjustSnapshotTime(client, new Date(targetTime), allowedCount); + let adjustedTime = new Date(targetTime); + if (type !== "milestone") { + adjustedTime = await adjustSnapshotTime(new Date(targetTime), 1000, redis); + } logger.log(`Scheduled snapshot for ${aid} at ${adjustedTime.toISOString()}`, "mq", "fn:scheduleSnapshot"); return client.queryObject( `INSERT INTO snapshot_schedule (aid, type, started_at) VALUES ($1, $2, $3)`, @@ -116,74 +170,51 @@ export async function scheduleSnapshot(client: Client, aid: number, type: string ); } -/** - * Adjust the trigger time of the snapshot to ensure it does not exceed the frequency limit - * @param client PostgreSQL client - * @param expectedStartTime The expected snapshot time - * @param allowedCounts The number of snapshots allowed in a 5-minute window (default: 2000) - * @returns The adjusted actual snapshot time within the first available window - */ export async function adjustSnapshotTime( - client: Client, expectedStartTime: Date, - allowedCounts: number = 2000, + allowedCounts: number = 1000, + redisClient: Redis, ): Promise { - // Query to find the closest available window by checking both past and future windows - const findWindowQuery = ` - WITH base AS ( - SELECT - date_trunc('minute', $1::timestamp) - - (EXTRACT(minute FROM $1::timestamp)::int % 5 * INTERVAL '1 minute') AS base_time - ), - offsets AS ( - SELECT generate_series(-100, 100) AS "offset" - ), - candidate_windows AS ( - SELECT - (base.base_time + ("offset" * INTERVAL '5 minutes')) AS window_start, - ABS("offset") AS distance - FROM base - CROSS JOIN offsets - ) - SELECT - window_start - FROM - candidate_windows cw - LEFT JOIN - snapshot_schedule s - ON - s.started_at >= cw.window_start - AND s.started_at < cw.window_start + INTERVAL '5 minutes' - AND s.status = 'pending' - GROUP BY - cw.window_start, cw.distance - HAVING - COUNT(s.*) < $2 - ORDER BY - cw.distance, cw.window_start - LIMIT 1; - `; + const currentWindow = getCurrentWindowIndex(); - try { - // Execute query to find the first available window - const windowResult = await client.queryObject<{ window_start: Date }>( - findWindowQuery, - [expectedStartTime, allowedCounts], - ); + // 计算目标窗口偏移量 + const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * 60 * 1000)); - // If no available window found, return original time (may exceed limit) - if 
(windowResult.rows.length === 0) { - return expectedStartTime; + // 在 Redis 中查找可用窗口 + for (let i = 0; i < WINDOW_SIZE; i++) { + const offset = (currentWindow + targetOffset + i) % WINDOW_SIZE; + const count = await getWindowCount(redisClient, offset); + + if (count < allowedCounts) { + // 找到可用窗口,更新计数 + await updateWindowCount(redisClient, offset, 1); + + // 计算具体时间 + const windowStart = new Date(Date.now() + offset * 5 * 60 * 1000); + const randomDelay = Math.floor(Math.random() * 5 * 60 * 1000); + return new Date(windowStart.getTime() + randomDelay); } + } - // Get the target window start time - const windowStart = windowResult.rows[0].window_start; + // 如果没有找到可用窗口,返回原始时间 + return expectedStartTime; +} - // Add random delay within the 5-minute window to distribute load - const randomDelay = Math.floor(Math.random() * 5 * MINUTE); - return new Date(windowStart.getTime() + randomDelay); - } catch { - return expectedStartTime; // Fallback to original time +export async function cleanupExpiredWindows(redisClient: Redis): Promise { + const now = new Date(); + const startTime = new Date(now.getTime() - 10 * DAY); // 保留最近 10 天的数据 + + // 获取所有窗口索引 + const allOffsets = await redisClient.hkeys(REDIS_KEY); + + // 删除过期窗口 + for (const offsetStr of allOffsets) { + const offset = parseInt(offsetStr, 10); + const windowStart = new Date(startTime.getTime() + offset * 5 * MINUTE); + + if (windowStart < startTime) { + await redisClient.hdel(REDIS_KEY, offsetStr); + } } } diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 7eb93e2..a9a377a 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -148,7 +148,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { const targetTime = truncate(lastSnapshotedAt + 24 * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); if (now - startedAt > 25 * MINUTE) { - return; + return; } } } catch (e) { diff --git a/lib/mq/init.ts b/lib/mq/init.ts index 8466280..b149e76 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -1,41 +1,54 @@ import { MINUTE, SECOND } from "$std/datetime/constants.ts"; import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "lib/mq/index.ts"; import logger from "lib/log/logger.ts"; +import { initSnapshotWindowCounts } from "lib/db/snapshotSchedule.ts"; +import { db } from "lib/db/init.ts"; +import { redis } from "lib/db/redis.ts"; export async function initMQ() { - await LatestVideosQueue.upsertJobScheduler("getLatestVideos", { - every: 1 * MINUTE, - immediately: true, - }); - await ClassifyVideoQueue.upsertJobScheduler("classifyVideos", { - every: 5 * MINUTE, - immediately: true, - }); - await LatestVideosQueue.upsertJobScheduler("collectSongs", { - every: 3 * MINUTE, - immediately: true, - }); - await SnapshotQueue.upsertJobScheduler("snapshotTick", { - every: 1 * SECOND, - immediately: true, - }, { - opts: { - removeOnComplete: 1, - removeOnFail: 1, - }, - }); + const client = await db.connect(); + try { + await initSnapshotWindowCounts(client, redis); - await SnapshotQueue.upsertJobScheduler("collectMilestoneSnapshots", { - every: 5 * MINUTE, - immediately: true, - }); + await LatestVideosQueue.upsertJobScheduler("getLatestVideos", { + every: 1 * MINUTE, + immediately: true, + }); - await SnapshotQueue.upsertJobScheduler("dispatchRegularSnapshots", { - every: 30 * MINUTE, - immediately: true, - }); + await ClassifyVideoQueue.upsertJobScheduler("classifyVideos", { + every: 5 * MINUTE, + immediately: true, + }); - await 
SnapshotQueue.removeJobScheduler("scheduleSnapshotTick"); + await LatestVideosQueue.upsertJobScheduler("collectSongs", { + every: 3 * MINUTE, + immediately: true, + }); - logger.log("Message queue initialized."); + await SnapshotQueue.upsertJobScheduler("snapshotTick", { + every: 1 * SECOND, + immediately: true, + }, { + opts: { + removeOnComplete: 1, + removeOnFail: 1, + }, + }); + + await SnapshotQueue.upsertJobScheduler("collectMilestoneSnapshots", { + every: 5 * MINUTE, + immediately: true, + }); + + await SnapshotQueue.upsertJobScheduler("dispatchRegularSnapshots", { + every: 30 * MINUTE, + immediately: true, + }); + + await SnapshotQueue.removeJobScheduler("scheduleSnapshotTick"); + + logger.log("Message queue initialized."); + } finally { + client.release(); + } } diff --git a/src/worker.ts b/src/worker.ts index aa11c25..c2f4d7d 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -94,4 +94,4 @@ snapshotWorker.on("error", (err) => { snapshotWorker.on("closed", async () => { await lockManager.releaseLock("dispatchRegularSnapshots"); -}) +}); diff --git a/test/db/snapshotSchedule.test.ts b/test/db/snapshotSchedule.test.ts deleted file mode 100644 index a5e1d6a..0000000 --- a/test/db/snapshotSchedule.test.ts +++ /dev/null @@ -1,18 +0,0 @@ -import { assertEquals, assertInstanceOf, assertNotEquals } from "@std/assert"; -import { findClosestSnapshot } from "lib/db/snapshotSchedule.ts"; -import { postgresConfig } from "lib/db/pgConfig.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; - -Deno.test("Snapshot Schedule - getShortTermTimeFeaturesForVideo", async () => { - const client = new Client(postgresConfig); - try { - const result = await findClosestSnapshot(client, 247308539, new Date(1741983383000)); - assertNotEquals(result, null); - const created_at = result!.created_at; - const views = result!.views; - assertInstanceOf(created_at, Date); - assertEquals(typeof views, "number"); - } finally { - client.end(); - } -}); From c99318e2d3f02fe16f1873b7b835d63021b97635 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 02:39:57 +0800 Subject: [PATCH 32/79] update: missing env in deno job --- deno.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deno.json b/deno.json index 07a17bd..f4ff4ee 100644 --- a/deno.json +++ b/deno.json @@ -12,7 +12,7 @@ "update": "deno run -A -r https://fresh.deno.dev/update .", "worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write --allow-run ./src/worker.ts", "worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts", - "adder": "deno run --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts", + "adder": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts", "bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts", "all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'", "test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run" From 6f4a26e8b36cc49fa342a24e9a4f77aa8b086f05 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 03:43:26 +0800 Subject: [PATCH 33/79] test: dynamic interval for reglar snapshots --- lib/mq/exec/snapshotTick.ts | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index a9a377a..e2dc0c1 
100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -130,6 +130,23 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { } }; +const getRegularSnapshotInterval = async (client: Client, aid: number) => { + const now = Date.now(); + const date = new Date(now - 24 * HOUR); + const oldSnapshot = await findClosestSnapshot(client, aid, date); + const latestSnapshot = await getLatestSnapshot(client, aid); + if (!oldSnapshot || !latestSnapshot) return 0; + const hoursDiff = (latestSnapshot.created_at - oldSnapshot.created_at) / HOUR; + if (hoursDiff < 8) return 24; + const viewsDiff = latestSnapshot.views - oldSnapshot.views; + if (viewsDiff === 0) return 72; + const speedPerDay = viewsDiff / hoursDiff * 24; + if (speedPerDay < 6) return 36; + if (speedPerDay < 120) return 24; + if (speedPerDay < 320) return 12; + return 6; +} + export const regularSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); const startedAt = Date.now(); @@ -145,7 +162,9 @@ export const regularSnapshotsWorker = async (_job: Job) => { const latestSnapshot = await getLatestVideoSnapshot(client, aid); const now = Date.now(); const lastSnapshotedAt = latestSnapshot?.time ?? now; - const targetTime = truncate(lastSnapshotedAt + 24 * HOUR, now + 1, now + 100000 * WEEK); + const interval = await getRegularSnapshotInterval(client, aid); + logger.debug(`${interval} hours for aid ${aid}`, "mq") + const targetTime = truncate(lastSnapshotedAt + interval * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); if (now - startedAt > 25 * MINUTE) { return; From 8be68248df4e209e9da18ef97eeca9abba2594a8 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 03:49:06 +0800 Subject: [PATCH 34/79] fix: ignored the case where snapshots are actually the same --- lib/mq/exec/snapshotTick.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index e2dc0c1..4d63c6e 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -136,6 +136,7 @@ const getRegularSnapshotInterval = async (client: Client, aid: number) => { const oldSnapshot = await findClosestSnapshot(client, aid, date); const latestSnapshot = await getLatestSnapshot(client, aid); if (!oldSnapshot || !latestSnapshot) return 0; + if (oldSnapshot.created_at === latestSnapshot.created_at) return 0; const hoursDiff = (latestSnapshot.created_at - oldSnapshot.created_at) / HOUR; if (hoursDiff < 8) return 24; const viewsDiff = latestSnapshot.views - oldSnapshot.views; From cb573e55d9d97e1e502a3987cfa098242b9defee Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 04:01:44 +0800 Subject: [PATCH 35/79] fix: incorrect offset when initializing --- lib/db/snapshotSchedule.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index bc8a039..4b0829e 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -35,8 +35,11 @@ export async function refreshSnapshotWindowCounts(client: Client, redisClient: R await redisClient.del(REDIS_KEY); + const currentWindow = getCurrentWindowIndex(); + for (const row of result.rows) { - const offset = Math.floor((row.window_start.getTime() - startTime) / (5 * MINUTE)); + const targetOffset = Math.floor((row.window_start.getTime() - startTime) / (5 * MINUTE)); + const offset = (currentWindow + targetOffset) % WINDOW_SIZE; if (offset >= 0 && offset < WINDOW_SIZE) { await 
redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); } From 20731c0530075d540b7ec1a0189c6d10908daa79 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 04:05:22 +0800 Subject: [PATCH 36/79] test: debug for init window --- lib/db/snapshotSchedule.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 4b0829e..8216dda 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -39,7 +39,8 @@ export async function refreshSnapshotWindowCounts(client: Client, redisClient: R for (const row of result.rows) { const targetOffset = Math.floor((row.window_start.getTime() - startTime) / (5 * MINUTE)); - const offset = (currentWindow + targetOffset) % WINDOW_SIZE; + const offset = (currentWindow + targetOffset); + logger.debug(`window_start: ${row.window_start}, count: ${row.count}, offset: ${offset}`); if (offset >= 0 && offset < WINDOW_SIZE) { await redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); } From 584a1be9f9b292c89f73c78861001bfcfb7f5674 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 04:08:34 +0800 Subject: [PATCH 37/79] test: debug for adjusting snapshot time --- lib/db/snapshotSchedule.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 8216dda..e5a98ee 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -40,7 +40,6 @@ export async function refreshSnapshotWindowCounts(client: Client, redisClient: R for (const row of result.rows) { const targetOffset = Math.floor((row.window_start.getTime() - startTime) / (5 * MINUTE)); const offset = (currentWindow + targetOffset); - logger.debug(`window_start: ${row.window_start}, count: ${row.count}, offset: ${offset}`); if (offset >= 0 && offset < WINDOW_SIZE) { await redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); } @@ -188,6 +187,7 @@ export async function adjustSnapshotTime( for (let i = 0; i < WINDOW_SIZE; i++) { const offset = (currentWindow + targetOffset + i) % WINDOW_SIZE; const count = await getWindowCount(redisClient, offset); + logger.debug(`offset: ${offset}, count: ${count}, expectedStartTime: ${expectedStartTime}`); if (count < allowedCounts) { // 找到可用窗口,更新计数 From 0a73d28623ecc5fb82e729d3da7a721f6f4ec49e Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 04:15:51 +0800 Subject: [PATCH 38/79] fix: incorrect time base when adjusting snapshot time --- lib/db/snapshotSchedule.ts | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index e5a98ee..702c6dd 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -2,14 +2,14 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; import { SnapshotScheduleType } from "./schema.d.ts"; import logger from "lib/log/logger.ts"; -import { DAY, MINUTE } from "$std/datetime/constants.ts"; +import { MINUTE } from "$std/datetime/constants.ts"; import { redis } from "lib/db/redis.ts"; import { Redis } from "ioredis"; -const WINDOW_SIZE = 2880; // 每天 2880 个 5 分钟窗口 -const REDIS_KEY = "cvsa:snapshot_window_counts"; // Redis Key 名称 +const WINDOW_SIZE = 2880; +const REDIS_KEY = "cvsa:snapshot_window_counts"; + -// 获取当前时间对应的窗口索引 function getCurrentWindowIndex(): number { const now = new Date(); const 
minutesSinceMidnight = now.getHours() * 60 + now.getMinutes(); @@ -17,7 +17,6 @@ function getCurrentWindowIndex(): number { return currentWindow; } -// 刷新内存数组 export async function refreshSnapshotWindowCounts(client: Client, redisClient: Redis) { const now = new Date(); const startTime = now.getTime(); @@ -181,22 +180,24 @@ export async function adjustSnapshotTime( const currentWindow = getCurrentWindowIndex(); // 计算目标窗口偏移量 - const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * 60 * 1000)); + const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)); // 在 Redis 中查找可用窗口 for (let i = 0; i < WINDOW_SIZE; i++) { const offset = (currentWindow + targetOffset + i) % WINDOW_SIZE; const count = await getWindowCount(redisClient, offset); - logger.debug(`offset: ${offset}, count: ${count}, expectedStartTime: ${expectedStartTime}`); if (count < allowedCounts) { // 找到可用窗口,更新计数 await updateWindowCount(redisClient, offset, 1); // 计算具体时间 - const windowStart = new Date(Date.now() + offset * 5 * 60 * 1000); - const randomDelay = Math.floor(Math.random() * 5 * 60 * 1000); - return new Date(windowStart.getTime() + randomDelay); + const startPoint = new Date(); + startPoint.setHours(0, 0, 0, 0); + const startTime = startPoint.getTime(); + const windowStart = startTime + offset * 5 * MINUTE; + const randomDelay = Math.floor(Math.random() * 5 * MINUTE); + return new Date(windowStart + randomDelay); } } @@ -204,23 +205,6 @@ export async function adjustSnapshotTime( return expectedStartTime; } -export async function cleanupExpiredWindows(redisClient: Redis): Promise { - const now = new Date(); - const startTime = new Date(now.getTime() - 10 * DAY); // 保留最近 10 天的数据 - - // 获取所有窗口索引 - const allOffsets = await redisClient.hkeys(REDIS_KEY); - - // 删除过期窗口 - for (const offsetStr of allOffsets) { - const offset = parseInt(offsetStr, 10); - const windowStart = new Date(startTime.getTime() + offset * 5 * MINUTE); - - if (windowStart < startTime) { - await redisClient.hdel(REDIS_KEY, offsetStr); - } - } -} export async function getSnapshotsInNextSecond(client: Client) { const query = ` From 42db333d1a43625f9365879d3fb920ddccfac82e Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 04:40:10 +0800 Subject: [PATCH 39/79] temp: schedule for new songs --- lib/db/snapshotSchedule.ts | 2 +- lib/mq/exec/snapshotTick.ts | 8 ++++++-- lib/mq/task/collectSongs.ts | 3 +++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 702c6dd..e77f225 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -162,7 +162,7 @@ export async function getSnapshotScheduleCountWithinRange(client: Client, start: export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number) { if (await videoHasActiveSchedule(client, aid)) return; let adjustedTime = new Date(targetTime); - if (type !== "milestone") { + if (type !== "milestone" && type !== "new") { adjustedTime = await adjustSnapshotTime(new Date(targetTime), 1000, redis); } logger.log(`Scheduled snapshot for ${aid} at ${adjustedTime.toISOString()}`, "mq", "fn:scheduleSnapshot"); diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 4d63c6e..de7b039 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -30,6 +30,7 @@ const priorityMap: { [key: string]: number } = { const snapshotTypeToTaskMap: { [key: string]: string } = { "milestone": "snapshotMilestoneVideo", 
"normal": "snapshotVideo", + "new": "snapshotMilestoneVideo" }; export const snapshotTickWorker = async (_job: Job) => { @@ -164,7 +165,6 @@ export const regularSnapshotsWorker = async (_job: Job) => { const now = Date.now(); const lastSnapshotedAt = latestSnapshot?.time ?? now; const interval = await getRegularSnapshotInterval(client, aid); - logger.debug(`${interval} hours for aid ${aid}`, "mq") const targetTime = truncate(lastSnapshotedAt + interval * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); if (now - startedAt > 25 * MINUTE) { @@ -203,8 +203,12 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { } await setSnapshotStatus(client, id, "completed"); if (type === "normal") { - await scheduleSnapshot(client, aid, type, Date.now() + 24 * HOUR); + const interval = await getRegularSnapshotInterval(client, aid); + await scheduleSnapshot(client, aid, type, Date.now() + interval * HOUR); return `DONE`; + } + else if (type === "new") { + } if (type !== "milestone") return `DONE`; const eta = await getAdjustedShortTermETA(client, aid); diff --git a/lib/mq/task/collectSongs.ts b/lib/mq/task/collectSongs.ts index 9c49823..51dbb9f 100644 --- a/lib/mq/task/collectSongs.ts +++ b/lib/mq/task/collectSongs.ts @@ -1,6 +1,8 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { aidExistsInSongs, getNotCollectedSongs } from "lib/db/songs.ts"; import logger from "lib/log/logger.ts"; +import { scheduleSnapshot } from "lib/db/snapshotSchedule.ts"; +import { MINUTE } from "$std/datetime/constants.ts"; export async function collectSongs(client: Client) { const aids = await getNotCollectedSongs(client); @@ -8,6 +10,7 @@ export async function collectSongs(client: Client) { const exists = await aidExistsInSongs(client, aid); if (exists) continue; await insertIntoSongs(client, aid); + await scheduleSnapshot(client, aid, "new", Date.now() + 10 * MINUTE); logger.log(`Video ${aid} was added into the songs table.`, "mq", "fn:collectSongs"); } } From 48b1130cba13a024741f7367da73abb2e6ac7d95 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 04:53:43 +0800 Subject: [PATCH 40/79] feat: continuous monitoring of new songs --- lib/db/songs.ts | 18 +++++++++++++++++- lib/mq/exec/snapshotTick.ts | 22 +++++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/lib/db/songs.ts b/lib/db/songs.ts index 0d5a096..15a49b3 100644 --- a/lib/db/songs.ts +++ b/lib/db/songs.ts @@ -1,4 +1,5 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; export async function getNotCollectedSongs(client: Client) { const queryResult = await client.queryObject<{ aid: number }>(` @@ -22,8 +23,23 @@ export async function aidExistsInSongs(client: Client, aid: number) { FROM songs WHERE aid = $1 ); - `, + `, [aid], ); return queryResult.rows[0].exists; } + +export async function getSongsPublihsedAt(client: Client, aid: number) { + const queryResult = await client.queryObject<{ published_at: string }>( + ` + SELECT published_at + FROM songs + WHERE aid = $1; + `, + [aid], + ); + if (queryResult.rows.length === 0) { + return null; + } + return parseTimestampFromPsql(queryResult.rows[0].published_at); +} diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index de7b039..f14ebfd 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -21,6 +21,7 @@ import { NetSchedulerError } from 
"lib/mq/scheduler.ts"; import { setBiliVideoStatus } from "lib/db/allData.ts"; import { truncate } from "lib/utils/truncate.ts"; import { lockManager } from "lib/mq/lockManager.ts"; +import { getSongsPublihsedAt } from "lib/db/songs.ts"; const priorityMap: { [key: string]: number } = { "milestone": 1, @@ -208,7 +209,26 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { return `DONE`; } else if (type === "new") { - + const publihsedAt = await getSongsPublihsedAt(client, aid); + const timeSincePublished = stat.time - publihsedAt!; + const viewsPerHour = stat.views / timeSincePublished * HOUR; + if (timeSincePublished > 48 * HOUR) { + return `DONE` + } + if (timeSincePublished > 2 * HOUR && viewsPerHour < 10) { + return `DONE` + } + let intervalMins = 240; + if (viewsPerHour > 50) { + intervalMins = 120; + } + if (viewsPerHour > 100) { + intervalMins = 60; + } + if (viewsPerHour > 1000) { + intervalMins = 15; + } + await scheduleSnapshot(client, aid, type, Date.now() + intervalMins * MINUTE); } if (type !== "milestone") return `DONE`; const eta = await getAdjustedShortTermETA(client, aid); From fa058b22fe9e319e356e362726e091a1c60f8b0b Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 23:36:01 +0800 Subject: [PATCH 41/79] fix: job consumption rate too low, add outdated job cleanup --- lib/db/snapshotSchedule.ts | 2 +- lib/mq/exec/snapshotTick.ts | 44 ++++++++++++++++++++++++++++--------- src/worker.ts | 4 ++++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index e77f225..4e6e244 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -217,7 +217,7 @@ export async function getSnapshotsInNextSecond(client: Client) { ELSE 1 END, started_at - LIMIT 3; + LIMIT 10; `; const res = await client.queryObject(query, []); return res.rows; diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index f14ebfd..b96083b 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -31,7 +31,7 @@ const priorityMap: { [key: string]: number } = { const snapshotTypeToTaskMap: { [key: string]: string } = { "milestone": "snapshotMilestoneVideo", "normal": "snapshotVideo", - "new": "snapshotMilestoneVideo" + "new": "snapshotMilestoneVideo", }; export const snapshotTickWorker = async (_job: Job) => { @@ -134,7 +134,7 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const getRegularSnapshotInterval = async (client: Client, aid: number) => { const now = Date.now(); - const date = new Date(now - 24 * HOUR); + const date = new Date(now - 24 * HOUR); const oldSnapshot = await findClosestSnapshot(client, aid, date); const latestSnapshot = await getLatestSnapshot(client, aid); if (!oldSnapshot || !latestSnapshot) return 0; @@ -148,7 +148,7 @@ const getRegularSnapshotInterval = async (client: Client, aid: number) => { if (speedPerDay < 120) return 24; if (speedPerDay < 320) return 12; return 6; -} +}; export const regularSnapshotsWorker = async (_job: Job) => { const client = await db.connect(); @@ -207,26 +207,25 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { const interval = await getRegularSnapshotInterval(client, aid); await scheduleSnapshot(client, aid, type, Date.now() + interval * HOUR); return `DONE`; - } - else if (type === "new") { + } else if (type === "new") { const publihsedAt = await getSongsPublihsedAt(client, aid); const timeSincePublished = stat.time - publihsedAt!; const viewsPerHour = stat.views / 
timeSincePublished * HOUR; if (timeSincePublished > 48 * HOUR) { - return `DONE` + return `DONE`; } if (timeSincePublished > 2 * HOUR && viewsPerHour < 10) { - return `DONE` + return `DONE`; } let intervalMins = 240; if (viewsPerHour > 50) { - intervalMins = 120; + intervalMins = 120; } if (viewsPerHour > 100) { - intervalMins = 60; + intervalMins = 60; } if (viewsPerHour > 1000) { - intervalMins = 15; + intervalMins = 15; } await scheduleSnapshot(client, aid, type, Date.now() + intervalMins * MINUTE); } @@ -254,3 +253,28 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { client.release(); } }; + +export const scheduleCleanupWorker = async (_job: Job) => { + const client = await db.connect(); + try { + const query = ` + SELECT id, aid, type + FROM snapshot_schedule + WHERE status IN ('pending', 'processing') + AND started_at < NOW() - INTERVAL '5 minutes' + `; + const { rows } = await client.queryObject<{ id: bigint; aid: bigint; type: string }>(query); + if (rows.length === 0) return; + for (const row of rows) { + const id = Number(row.id); + const aid = Number(row.aid); + const type = row.type; + await setSnapshotStatus(client, id, "timeout"); + await scheduleSnapshot(client, aid, type, Date.now() + 10 * SECOND); + } + } catch (e) { + logger.error(e as Error, "mq", "fn:scheduleCleanupWorker"); + } finally { + client.release(); + } +}; diff --git a/src/worker.ts b/src/worker.ts index c2f4d7d..ad0c9b6 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -10,6 +10,7 @@ import { regularSnapshotsWorker, snapshotTickWorker, takeSnapshotForVideoWorker, + scheduleCleanupWorker } from "lib/mq/exec/snapshotTick.ts"; Deno.addSignalListener("SIGINT", async () => { @@ -80,6 +81,9 @@ const snapshotWorker = new Worker( case "dispatchRegularSnapshots": await regularSnapshotsWorker(job); break; + case "scheduleCleanup": + await scheduleCleanupWorker(job); + break; default: break; } From d8b47f8fc8c3068709f7a24794ea1a812080c698 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 23:37:46 +0800 Subject: [PATCH 42/79] add: log for scheduleCleanupWorker --- lib/mq/exec/snapshotTick.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index b96083b..7c3280c 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -271,6 +271,7 @@ export const scheduleCleanupWorker = async (_job: Job) => { const type = row.type; await setSnapshotStatus(client, id, "timeout"); await scheduleSnapshot(client, aid, type, Date.now() + 10 * SECOND); + logger.log(`Schedule ${id} has no response received for 5 minutes, rescheduled.`, "mq", "fn:scheduleCleanupWorker") } } catch (e) { logger.error(e as Error, "mq", "fn:scheduleCleanupWorker"); From 314beb54b5c0fa20e890e851063e6b94b96206dd Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 23:39:29 +0800 Subject: [PATCH 43/79] add: upsertJob for scheduleCleanup --- lib/mq/init.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/mq/init.ts b/lib/mq/init.ts index b149e76..e416988 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -45,7 +45,10 @@ export async function initMQ() { immediately: true, }); - await SnapshotQueue.removeJobScheduler("scheduleSnapshotTick"); + await SnapshotQueue.upsertJobScheduler("scheduleCleanup", { + every: 30 * MINUTE, + immediately: true, + }); logger.log("Message queue initialized."); } finally { From 6b7142a6d5aac0afed8776a0f6b3a89438845961 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 24 Mar 2025 23:43:54 +0800 
Subject: [PATCH 44/79] fix: new jobs may be scheduled before the current time --- lib/db/snapshotSchedule.ts | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 4e6e244..90e5b07 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -179,29 +179,31 @@ export async function adjustSnapshotTime( ): Promise { const currentWindow = getCurrentWindowIndex(); - // 计算目标窗口偏移量 const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)); - // 在 Redis 中查找可用窗口 for (let i = 0; i < WINDOW_SIZE; i++) { const offset = (currentWindow + targetOffset + i) % WINDOW_SIZE; const count = await getWindowCount(redisClient, offset); if (count < allowedCounts) { - // 找到可用窗口,更新计数 await updateWindowCount(redisClient, offset, 1); - // 计算具体时间 const startPoint = new Date(); startPoint.setHours(0, 0, 0, 0); const startTime = startPoint.getTime(); const windowStart = startTime + offset * 5 * MINUTE; const randomDelay = Math.floor(Math.random() * 5 * MINUTE); - return new Date(windowStart + randomDelay); + const delayedDate = new Date(windowStart + randomDelay); + const now = new Date(); + + if (delayedDate.getTime() < now.getTime()) { + return now; + } + + return delayedDate; } } - // 如果没有找到可用窗口,返回原始时间 return expectedStartTime; } From a178b7fc169a73121ac5991bdfdc52c9eadc256a Mon Sep 17 00:00:00 2001 From: alikia2x Date: Tue, 25 Mar 2025 04:52:19 +0800 Subject: [PATCH 45/79] update: force schedule for new songs --- lib/db/snapshotSchedule.ts | 4 ++-- lib/mq/exec/snapshotTick.ts | 2 +- lib/mq/task/collectSongs.ts | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 90e5b07..4816693 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -159,8 +159,8 @@ export async function getSnapshotScheduleCountWithinRange(client: Client, start: * @param aid The aid of the video. * @param targetTime Scheduled time for snapshot. 
(Timestamp in milliseconds) */ -export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number) { - if (await videoHasActiveSchedule(client, aid)) return; +export async function scheduleSnapshot(client: Client, aid: number, type: string, targetTime: number, force: boolean = false) { + if (await videoHasActiveSchedule(client, aid) && !force) return; let adjustedTime = new Date(targetTime); if (type !== "milestone" && type !== "new") { adjustedTime = await adjustSnapshotTime(new Date(targetTime), 1000, redis); diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 7c3280c..55d3f70 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -227,7 +227,7 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { if (viewsPerHour > 1000) { intervalMins = 15; } - await scheduleSnapshot(client, aid, type, Date.now() + intervalMins * MINUTE); + await scheduleSnapshot(client, aid, type, Date.now() + intervalMins * MINUTE, true); } if (type !== "milestone") return `DONE`; const eta = await getAdjustedShortTermETA(client, aid); diff --git a/lib/mq/task/collectSongs.ts b/lib/mq/task/collectSongs.ts index 51dbb9f..b71aa3b 100644 --- a/lib/mq/task/collectSongs.ts +++ b/lib/mq/task/collectSongs.ts @@ -10,7 +10,7 @@ export async function collectSongs(client: Client) { const exists = await aidExistsInSongs(client, aid); if (exists) continue; await insertIntoSongs(client, aid); - await scheduleSnapshot(client, aid, "new", Date.now() + 10 * MINUTE); + await scheduleSnapshot(client, aid, "new", Date.now() + 10 * MINUTE, true); logger.log(`Video ${aid} was added into the songs table.`, "mq", "fn:collectSongs"); } } From 76c92566620631eb28d4c9446fff63001f7fed5f Mon Sep 17 00:00:00 2001 From: alikia2x Date: Tue, 25 Mar 2025 04:57:24 +0800 Subject: [PATCH 46/79] test: looser rate limits for snapshot --- lib/mq/scheduler.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index 748ef45..ac09c2a 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -333,6 +333,9 @@ const biliLimiterConfig: RateLimiterConfig[] = [ }, ]; +const bili_test = biliLimiterConfig; +bili_test[3].max = 500 + /* Execution order for setup: @@ -364,7 +367,7 @@ for (const region of regions) { netScheduler.addTask("getVideoInfo", "bilibili", "all"); netScheduler.addTask("getLatestVideos", "bilibili", "all"); netScheduler.addTask("snapshotMilestoneVideo", "bilibili", regions.map((region) => `alicloud-${region}`)); -netScheduler.addTask("snapshotVideo", "bilibili", [ +netScheduler.addTask("snapshotVideo", "bili_test", [ "alicloud-qingdao", "alicloud-shanghai", "alicloud-zhangjiakou", @@ -375,7 +378,8 @@ netScheduler.addTask("snapshotVideo", "bilibili", [ netScheduler.setTaskLimiter("getVideoInfo", videoInfoRateLimiterConfig); netScheduler.setTaskLimiter("getLatestVideos", null); netScheduler.setTaskLimiter("snapshotMilestoneVideo", null); -netScheduler.setTaskLimiter("snapshotVideo", videoInfoRateLimiterConfig); +netScheduler.setTaskLimiter("snapshotVideo", null); netScheduler.setProviderLimiter("bilibili", biliLimiterConfig); +netScheduler.setProviderLimiter("bili_test", bili_test); export default netScheduler; From 86337e3802ab49fe8734e2236f3c4c730f716a77 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Tue, 25 Mar 2025 05:05:33 +0800 Subject: [PATCH 47/79] add: set scheduled job status to 'processing' before adding to bullmq --- lib/mq/exec/snapshotTick.ts | 1 + 1 file changed, 1 
insertion(+) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 55d3f70..93f0907 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -44,6 +44,7 @@ export const snapshotTickWorker = async (_job: Job) => { priority = priorityMap[schedule.type]; } const aid = Number(schedule.aid); + await setSnapshotStatus(client, schedule.id, "processing"); await SnapshotQueue.add("snapshotVideo", { aid: aid, id: Number(schedule.id), From b33fd790d147439b99a5db1625a9eb5296a11b56 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Tue, 25 Mar 2025 21:29:47 +0800 Subject: [PATCH 48/79] fix: snapshot process early returned incorrectly --- lib/mq/exec/snapshotTick.ts | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 93f0907..63dec8d 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -10,7 +10,6 @@ import { scheduleSnapshot, setSnapshotStatus, snapshotScheduleExists, - videoHasProcessingSchedule, } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { HOUR, MINUTE, SECOND, WEEK } from "$std/datetime/constants.ts"; @@ -193,10 +192,6 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { return; } try { - if (await videoHasProcessingSchedule(client, aid)) { - return `ALREADY_PROCESSING`; - } - await setSnapshotStatus(client, id, "processing"); const stat = await insertVideoSnapshot(client, aid, task); if (typeof stat === "number") { await setBiliVideoStatus(client, aid, stat); From 2d8e990bc94270db40aa629c968f388b021ce233 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Tue, 25 Mar 2025 21:33:58 +0800 Subject: [PATCH 49/79] fix: prevent duplicate jobs added to queue --- lib/mq/exec/snapshotTick.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 63dec8d..d4149ce 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -10,6 +10,7 @@ import { scheduleSnapshot, setSnapshotStatus, snapshotScheduleExists, + videoHasProcessingSchedule, } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { HOUR, MINUTE, SECOND, WEEK } from "$std/datetime/constants.ts"; @@ -38,6 +39,9 @@ export const snapshotTickWorker = async (_job: Job) => { try { const schedules = await getSnapshotsInNextSecond(client); for (const schedule of schedules) { + if (await videoHasProcessingSchedule(client, schedule.aid)) { + return `ALREADY_PROCESSING`; + } let priority = 3; if (schedule.type && priorityMap[schedule.type]) { priority = priorityMap[schedule.type]; @@ -192,6 +196,7 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { return; } try { + await setSnapshotStatus(client, id, "processing"); const stat = await insertVideoSnapshot(client, aid, task); if (typeof stat === "number") { await setBiliVideoStatus(client, aid, stat); From 17ded637581b2b8eef1e9e27318767570163b88d Mon Sep 17 00:00:00 2001 From: alikia2x Date: Tue, 25 Mar 2025 22:38:57 +0800 Subject: [PATCH 50/79] update: perf monitor for adjustSnapshotTime --- lib/db/snapshotSchedule.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 4816693..26f1df2 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -181,6 +181,8 @@ export async function adjustSnapshotTime( const targetOffset = 
Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)); + let timePerIteration = 0; + const t = performance.now(); for (let i = 0; i < WINDOW_SIZE; i++) { const offset = (currentWindow + targetOffset + i) % WINDOW_SIZE; const count = await getWindowCount(redisClient, offset); @@ -197,13 +199,20 @@ export async function adjustSnapshotTime( const now = new Date(); if (delayedDate.getTime() < now.getTime()) { + const elapsed = performance.now() - t; + timePerIteration = elapsed / i; + logger.log(`Time per iteration: ${timePerIteration.toFixed(3)}ms`, "perf", "fn:adjustSnapshotTime"); return now; } - + const elapsed = performance.now() - t; + timePerIteration = elapsed / i; + logger.log(`Time per iteration: ${timePerIteration.toFixed(3)}ms`, "perf", "fn:adjustSnapshotTime"); return delayedDate; } } - + const elapsed = performance.now() - t; + timePerIteration = elapsed / WINDOW_SIZE; + logger.log(`Time per iteration: ${timePerIteration.toFixed(3)}ms`, "perf", "fn:adjustSnapshotTime"); return expectedStartTime; } From cabb360a16c7da3428e275774be6cc3b03ed6ade Mon Sep 17 00:00:00 2001 From: alikia2x Date: Tue, 25 Mar 2025 22:58:56 +0800 Subject: [PATCH 51/79] fix: missing entrance for schedule type 'new' update: perf log text --- lib/db/snapshotSchedule.ts | 6 +++--- lib/mq/exec/classifyVideo.ts | 3 +++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 26f1df2..e38fa83 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -201,18 +201,18 @@ export async function adjustSnapshotTime( if (delayedDate.getTime() < now.getTime()) { const elapsed = performance.now() - t; timePerIteration = elapsed / i; - logger.log(`Time per iteration: ${timePerIteration.toFixed(3)}ms`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${i}iterations`, "perf", "fn:adjustSnapshotTime"); return now; } const elapsed = performance.now() - t; timePerIteration = elapsed / i; - logger.log(`Time per iteration: ${timePerIteration.toFixed(3)}ms`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${i}iterations`, "perf", "fn:adjustSnapshotTime"); return delayedDate; } } const elapsed = performance.now() - t; timePerIteration = elapsed / WINDOW_SIZE; - logger.log(`Time per iteration: ${timePerIteration.toFixed(3)}ms`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${WINDOW_SIZE}iterations`, "perf", "fn:adjustSnapshotTime"); return expectedStartTime; } diff --git a/lib/mq/exec/classifyVideo.ts b/lib/mq/exec/classifyVideo.ts index b86a9a6..20545a0 100644 --- a/lib/mq/exec/classifyVideo.ts +++ b/lib/mq/exec/classifyVideo.ts @@ -7,6 +7,8 @@ import logger from "lib/log/logger.ts"; import { lockManager } from "lib/mq/lockManager.ts"; import { aidExistsInSongs } from "lib/db/songs.ts"; import { insertIntoSongs } from "lib/mq/task/collectSongs.ts"; +import { scheduleSnapshot } from "lib/db/snapshotSchedule.ts"; +import { MINUTE } from "$std/datetime/constants.ts"; export const classifyVideoWorker = async (job: Job) => { const client = await db.connect(); @@ -27,6 +29,7 @@ export const classifyVideoWorker = async (job: Job) => { const exists = await aidExistsInSongs(client, aid); if (!exists && label !== 0) { + await scheduleSnapshot(client, aid, "new", Date.now() + 10 * MINUTE, true); await insertIntoSongs(client, aid); } From b286a9d7b10e0abde311c56eea9d576cc691f192 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Wed, 26 Mar 2025 
22:59:52 +0800 Subject: [PATCH 52/79] update: use findSnapshotBefore instead of findClosestSnapshot in regular snapshot scheduling --- lib/db/snapshotSchedule.ts | 25 +++++++++++++++++++++++++ lib/mq/exec/snapshotTick.ts | 3 ++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index e38fa83..630643a 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -114,6 +114,31 @@ export async function findClosestSnapshot( }; } +export async function findSnapshotBefore( + client: Client, + aid: number, + targetTime: Date, +): Promise { + const query = ` + SELECT created_at, views + FROM video_snapshot + WHERE aid = $1 + AND created_at <= $2::timestamptz + ORDER BY created_at DESC + LIMIT 1 + `; + const result = await client.queryObject<{ created_at: string; views: number }>( + query, + [aid, targetTime.toISOString()], + ); + if (result.rows.length === 0) return null; + const row = result.rows[0]; + return { + created_at: new Date(row.created_at).getTime(), + views: row.views, + }; +} + export async function hasAtLeast2Snapshots(client: Client, aid: number) { const res = await client.queryObject<{ count: number }>( `SELECT COUNT(*) FROM video_snapshot WHERE aid = $1`, diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index d4149ce..4312caf 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -3,6 +3,7 @@ import { db } from "lib/db/init.ts"; import { getLatestVideoSnapshot, getVideosNearMilestone } from "lib/db/snapshot.ts"; import { findClosestSnapshot, + findSnapshotBefore, getLatestSnapshot, getSnapshotsInNextSecond, getVideosWithoutActiveSnapshotSchedule, @@ -139,7 +140,7 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const getRegularSnapshotInterval = async (client: Client, aid: number) => { const now = Date.now(); const date = new Date(now - 24 * HOUR); - const oldSnapshot = await findClosestSnapshot(client, aid, date); + const oldSnapshot = await findSnapshotBefore(client, aid, date); const latestSnapshot = await getLatestSnapshot(client, aid); if (!oldSnapshot || !latestSnapshot) return 0; if (oldSnapshot.created_at === latestSnapshot.created_at) return 0; From 7adc370ba26985626f439bfad620d7238389f376 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Wed, 26 Mar 2025 23:00:45 +0800 Subject: [PATCH 53/79] update: increase concurrency of snapshot worker --- src/worker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/worker.ts b/src/worker.ts index ad0c9b6..c8cbb6d 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -88,7 +88,7 @@ const snapshotWorker = new Worker( break; } }, - { connection: redis as ConnectionOptions, concurrency: 10, removeOnComplete: { count: 2000 } }, + { connection: redis as ConnectionOptions, concurrency: 50, removeOnComplete: { count: 2000 } }, ); snapshotWorker.on("error", (err) => { From 2c51c3c09cd9c9a1d7228edfab67ac388abc3998 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Wed, 26 Mar 2025 23:04:08 +0800 Subject: [PATCH 54/79] add: log for getRegularSnapshotInterval --- lib/mq/exec/snapshotTick.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 4312caf..9428ffe 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -171,6 +171,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { const now = Date.now(); const lastSnapshotedAt = latestSnapshot?.time ??
now; const interval = await getRegularSnapshotInterval(client, aid); + logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq") const targetTime = truncate(lastSnapshotedAt + interval * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); if (now - startedAt > 25 * MINUTE) { @@ -207,6 +208,7 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { await setSnapshotStatus(client, id, "completed"); if (type === "normal") { const interval = await getRegularSnapshotInterval(client, aid); + logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq") await scheduleSnapshot(client, aid, type, Date.now() + interval * HOUR); return `DONE`; } else if (type === "new") { From 5450c17e13f61d3e2a316307c898078830ff9346 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Wed, 26 Mar 2025 23:13:53 +0800 Subject: [PATCH 55/79] update: reduce rate limit, no longer collect deleted videos for milestone monitoring --- lib/db/snapshot.ts | 1 + lib/mq/scheduler.ts | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/db/snapshot.ts b/lib/db/snapshot.ts index c160d79..726bfc5 100644 --- a/lib/db/snapshot.ts +++ b/lib/db/snapshot.ts @@ -7,6 +7,7 @@ export async function getVideosNearMilestone(client: Client) { FROM latest_video_snapshot ls INNER JOIN songs s ON ls.aid = s.aid + AND s.deleted = false WHERE s.deleted = false AND (views >= 90000 AND views < 100000) OR diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index ac09c2a..c31d8ef 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -334,7 +334,10 @@ const biliLimiterConfig: RateLimiterConfig[] = [ ]; const bili_test = biliLimiterConfig; -bili_test[3].max = 500 +bili_test[0].max = 10; +bili_test[1].max = 36; +bili_test[2].max = 150; +bili_test[3].max = 1000; /* Execution order for setup: From afee7f58bfa33d5876960382b446ab9e99a8f4a7 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Wed, 26 Mar 2025 23:25:07 +0800 Subject: [PATCH 56/79] update: return when meeting non-0 video status code in snapshotVideo job --- lib/db/allData.ts | 10 ++++++++++ lib/mq/exec/snapshotTick.ts | 6 ++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/db/allData.ts b/lib/db/allData.ts index 6e9c509..bf92edd 100644 --- a/lib/db/allData.ts +++ b/lib/db/allData.ts @@ -75,3 +75,13 @@ export async function setBiliVideoStatus(client: Client, aid: number, status: nu [status, aid], ); } + +export async function getBiliVideoStatus(client: Client, aid: number) { + const queryResult = await client.queryObject<{ status: number }>( + `SELECT status FROM bilibili_metadata WHERE aid = $1`, + [aid], + ); + const rows = queryResult.rows; + if (rows.length === 0) return 0; + return rows[0].status; +} diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 9428ffe..14875f2 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -19,7 +19,7 @@ import logger from "lib/log/logger.ts"; import { SnapshotQueue } from "lib/mq/index.ts"; import { insertVideoSnapshot } from "lib/mq/task/getVideoStats.ts"; import { NetSchedulerError } from "lib/mq/scheduler.ts"; -import { setBiliVideoStatus } from "lib/db/allData.ts"; +import { getBiliVideoStatus, setBiliVideoStatus } from "lib/db/allData.ts"; import { truncate } from "lib/utils/truncate.ts"; import { lockManager } from "lib/mq/lockManager.ts"; import { getSongsPublihsedAt } from "lib/db/songs.ts"; @@ -197,13 +197,15 @@ export const takeSnapshotForVideoWorker = async (job: 
Job) => { if (!exists) { return; } + const status = await getBiliVideoStatus(client, aid); + if (status !== 0) return `REFUSE_WORKING_BILI_STATUS_${status}` try { await setSnapshotStatus(client, id, "processing"); const stat = await insertVideoSnapshot(client, aid, task); if (typeof stat === "number") { await setBiliVideoStatus(client, aid, stat); await setSnapshotStatus(client, id, "completed"); - return `BILI_STATUS_${stat}`; + return `GET_BILI_STATUS_${stat}`; } await setSnapshotStatus(client, id, "completed"); if (type === "normal") { From 92678066a7ab63e80648d640c91f29d1dd4e7860 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Wed, 26 Mar 2025 23:37:12 +0800 Subject: [PATCH 57/79] update: add missing plus-one to iteration count --- lib/db/snapshotSchedule.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 630643a..9e8a893 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -225,13 +225,13 @@ export async function adjustSnapshotTime( if (delayedDate.getTime() < now.getTime()) { const elapsed = performance.now() - t; - timePerIteration = elapsed / i; - logger.log(`${timePerIteration.toFixed(3)}ms * ${i}iterations`, "perf", "fn:adjustSnapshotTime"); + timePerIteration = elapsed / (i+1); + logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1}iterations`, "perf", "fn:adjustSnapshotTime"); return now; } const elapsed = performance.now() - t; - timePerIteration = elapsed / i; - logger.log(`${timePerIteration.toFixed(3)}ms * ${i}iterations`, "perf", "fn:adjustSnapshotTime"); + timePerIteration = elapsed / (i+1); + logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1}iterations`, "perf", "fn:adjustSnapshotTime"); return delayedDate; } } From c80b047e0ca7a0de8e0c560f1be0773fd0ebbd46 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 01:31:54 +0800 Subject: [PATCH 58/79] fix: did not release db client before quitting --- lib/mq/exec/snapshotTick.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 14875f2..2a25265 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -195,10 +195,14 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { const retryInterval = type === "milestone" ?
5 * SECOND : 2 * MINUTE; const exists = await snapshotScheduleExists(client, id); if (!exists) { + client.release(); return; } const status = await getBiliVideoStatus(client, aid); - if (status !== 0) return `REFUSE_WORKING_BILI_STATUS_${status}` + if (status !== 0) { + client.release(); + return `REFUSE_WORKING_BILI_STATUS_${status}`; + } try { await setSnapshotStatus(client, id, "processing"); const stat = await insertVideoSnapshot(client, aid, task); From d5c278ae066ba260f629d958d07685a888437823 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 01:48:10 +0800 Subject: [PATCH 59/79] improve: cache result for adjustSnapshotTime --- lib/db/snapshotSchedule.ts | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 9e8a893..1aa2d06 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -9,6 +9,7 @@ import { Redis } from "ioredis"; const WINDOW_SIZE = 2880; const REDIS_KEY = "cvsa:snapshot_window_counts"; +let lastAvailableWindow: { offset: number; count: number } | null = null; function getCurrentWindowIndex(): number { const now = new Date(); @@ -43,6 +44,8 @@ export async function refreshSnapshotWindowCounts(client: Client, redisClient: R await redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); } } + + lastAvailableWindow = null; } export async function initSnapshotWindowCounts(client: Client, redisClient: Redis) { @@ -57,10 +60,6 @@ async function getWindowCount(redisClient: Redis, offset: number): Promise { - await redisClient.hincrby(REDIS_KEY, offset.toString(), increment); -} - export async function snapshotScheduleExists(client: Client, id: number) { const res = await client.queryObject<{ id: number }>( `SELECT id FROM snapshot_schedule WHERE id = $1`, @@ -203,17 +202,23 @@ export async function adjustSnapshotTime( redisClient: Redis, ): Promise { const currentWindow = getCurrentWindowIndex(); - const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)); + let initialOffset = 0; + + if (lastAvailableWindow && lastAvailableWindow.count < allowedCounts) { + initialOffset = lastAvailableWindow.offset; + } + let timePerIteration = 0; const t = performance.now(); - for (let i = 0; i < WINDOW_SIZE; i++) { + for (let i = initialOffset; i < WINDOW_SIZE; i++) { const offset = (currentWindow + targetOffset + i) % WINDOW_SIZE; const count = await getWindowCount(redisClient, offset); if (count < allowedCounts) { - await updateWindowCount(redisClient, offset, 1); + const newCount = await redisClient.hincrby(REDIS_KEY, offset.toString(), 1); + lastAvailableWindow = { offset, count: newCount }; const startPoint = new Date(); startPoint.setHours(0, 0, 0, 0); From d6dd4d933437087d30ae0830aaacc3fda3f7a57a Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 02:09:48 +0800 Subject: [PATCH 60/79] fix: potential shifting for obtained window offset in last commit --- lib/db/snapshotSchedule.ts | 2 +- lib/mq/exec/snapshotTick.ts | 1 + lib/net/bilibili.d.ts | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 1aa2d06..04bbe6e 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -207,7 +207,7 @@ export async function adjustSnapshotTime( let initialOffset = 0; if (lastAvailableWindow && lastAvailableWindow.count < allowedCounts) { - initialOffset = lastAvailableWindow.offset; + initialOffset = 
Math.max(lastAvailableWindow.offset - 2, 0); } let timePerIteration = 0; diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 2a25265..e2c4ccf 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -160,6 +160,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { const startedAt = Date.now(); if (await lockManager.isLocked("dispatchRegularSnapshots")) { logger.log("dispatchRegularSnapshots is already running", "mq"); + client.release(); return; } await lockManager.acquireLock("dispatchRegularSnapshots", 30 * 60); diff --git a/lib/net/bilibili.d.ts b/lib/net/bilibili.d.ts index 6a66ecc..964a87e 100644 --- a/lib/net/bilibili.d.ts +++ b/lib/net/bilibili.d.ts @@ -9,6 +9,24 @@ export type VideoListResponse = BaseResponse; export type VideoDetailsResponse = BaseResponse; export type VideoTagsResponse = BaseResponse; export type VideoInfoResponse = BaseResponse; +export type MediaListInfoResponse = BaseResponse; + +type MediaListInfoData = MediaListInfoItem[]; + + +interface MediaListInfoItem { + bvid: string; + id: number; + cnt_info: { + coin: number; + collect: number; + danmaku: number; + play: number; + reply: number; + share: number; + thumb_up: number; + } +} interface VideoInfoData { bvid: string; From d44ba8a0aee3f93802905a3bbb1dada481349fdc Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 02:13:33 +0800 Subject: [PATCH 61/79] fix: incorrect offset calculation in adjustSnapshotTime --- lib/db/snapshotSchedule.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 04bbe6e..8c825b7 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -202,9 +202,9 @@ export async function adjustSnapshotTime( redisClient: Redis, ): Promise { const currentWindow = getCurrentWindowIndex(); - const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)); + const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)) - 6; - let initialOffset = 0; + let initialOffset = currentWindow + Math.max(targetOffset, 0); if (lastAvailableWindow && lastAvailableWindow.count < allowedCounts) { initialOffset = Math.max(lastAvailableWindow.offset - 2, 0); @@ -213,7 +213,7 @@ export async function adjustSnapshotTime( let timePerIteration = 0; const t = performance.now(); for (let i = initialOffset; i < WINDOW_SIZE; i++) { - const offset = (currentWindow + targetOffset + i) % WINDOW_SIZE; + const offset = i; const count = await getWindowCount(redisClient, offset); if (count < allowedCounts) { From aea9e10d1a9d0fca46d7a54fb4bea733c0f58366 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 02:33:54 +0800 Subject: [PATCH 62/79] update: bulk fetch --- lib/db/snapshotSchedule.ts | 7 +++++++ lib/mq/exec/snapshotTick.ts | 27 +++++++++++++++++++++++++++ lib/mq/scheduler.ts | 16 ++++++++++++++++ lib/net/bilibili.d.ts | 5 +++-- lib/net/bulkGetVideoStats.ts | 27 +++++++++++++++++++++++++++ 5 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 lib/net/bulkGetVideoStats.ts diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 8c825b7..158fef9 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -271,6 +271,13 @@ export async function setSnapshotStatus(client: Client, id: number, status: stri ); } +export async function bulkSetSnapshotStatus(client: Client, ids: number[], status: string) { + return await client.queryObject( + `UPDATE 
snapshot_schedule SET status = $2 WHERE id = ANY($1)`, + [ids, status], + ); +} + export async function getVideosWithoutActiveSnapshotSchedule(client: Client) { const query: string = ` SELECT s.aid diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index e2c4ccf..f29dfce 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -2,6 +2,7 @@ import { Job } from "bullmq"; import { db } from "lib/db/init.ts"; import { getLatestVideoSnapshot, getVideosNearMilestone } from "lib/db/snapshot.ts"; import { + bulkSetSnapshotStatus, findClosestSnapshot, findSnapshotBefore, getLatestSnapshot, @@ -23,6 +24,7 @@ import { getBiliVideoStatus, setBiliVideoStatus } from "lib/db/allData.ts"; import { truncate } from "lib/utils/truncate.ts"; import { lockManager } from "lib/mq/lockManager.ts"; import { getSongsPublihsedAt } from "lib/db/songs.ts"; +import { bulkGetVideoStats } from "lib/net/bulkGetVideoStats.ts"; const priorityMap: { [key: string]: number } = { "milestone": 1, @@ -187,6 +189,31 @@ export const regularSnapshotsWorker = async (_job: Job) => { } }; +export const takeBulkSnapshotForVideosWorker = async (job: Job) => { + const dataMap: {[key: number]: number} = job.data.map; + const ids = Object.keys(dataMap).map((id) => Number(id)); + const aidsToFetch: number[] = []; + const client = await db.connect(); + try { + for (const id of ids) { + const aid = Number(dataMap[id]); + const exists = await snapshotScheduleExists(client, id); + if (!exists) { + continue + } + aidsToFetch.push(aid); + } + const data = await bulkGetVideoStats(aidsToFetch); + if (typeof data === "number") { + await bulkSetSnapshotStatus(client, ids, "failed"); + return `GET_BILI_STATUS_${data}`; + } + } + finally { + client.release(); + } +} + export const takeSnapshotForVideoWorker = async (job: Job) => { const id = job.data.id; const aid = Number(job.data.aid); diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index c31d8ef..6345755 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -339,6 +339,12 @@ bili_test[1].max = 36; bili_test[2].max = 150; bili_test[3].max = 1000; +const bili_strict = biliLimiterConfig; +bili_strict[0].max = 4; +bili_strict[1].max = 8; +bili_strict[2].max = 30; +bili_strict[3].max = 100; + /* Execution order for setup: @@ -378,11 +384,21 @@ netScheduler.addTask("snapshotVideo", "bili_test", [ "alicloud-shenzhen", "alicloud-hohhot", ]); +netScheduler.addTask("bulkSnapshot", "bili_strict", [ + "alicloud-qingdao", + "alicloud-shanghai", + "alicloud-zhangjiakou", + "alicloud-chengdu", + "alicloud-shenzhen", + "alicloud-hohhot", +]); netScheduler.setTaskLimiter("getVideoInfo", videoInfoRateLimiterConfig); netScheduler.setTaskLimiter("getLatestVideos", null); netScheduler.setTaskLimiter("snapshotMilestoneVideo", null); netScheduler.setTaskLimiter("snapshotVideo", null); +netScheduler.setTaskLimiter("bulkSnapshot", null); netScheduler.setProviderLimiter("bilibili", biliLimiterConfig); netScheduler.setProviderLimiter("bili_test", bili_test); +netScheduler.setProviderLimiter("bili_strict", bili_strict); export default netScheduler; diff --git a/lib/net/bilibili.d.ts b/lib/net/bilibili.d.ts index 964a87e..19e1ba2 100644 --- a/lib/net/bilibili.d.ts +++ b/lib/net/bilibili.d.ts @@ -11,10 +11,11 @@ export type VideoTagsResponse = BaseResponse; export type VideoInfoResponse = BaseResponse; export type MediaListInfoResponse = BaseResponse; -type MediaListInfoData = MediaListInfoItem[]; +export type MediaListInfoData = MediaListInfoItem[]; -interface 
MediaListInfoItem { +export interface MediaListInfoItem { + attr: number; bvid: string; id: number; cnt_info: { diff --git a/lib/net/bulkGetVideoStats.ts b/lib/net/bulkGetVideoStats.ts new file mode 100644 index 0000000..7240bed --- /dev/null +++ b/lib/net/bulkGetVideoStats.ts @@ -0,0 +1,27 @@ +import netScheduler from "lib/mq/scheduler.ts"; +import { MediaListInfoData, MediaListInfoResponse } from "lib/net/bilibili.d.ts"; +import logger from "lib/log/logger.ts"; + +/* + * Bulk fetch video metadata from bilibili API + * @param {number[]} aids - The aid list to fetch + * @returns {Promise} MediaListInfoData or the error code returned by bilibili API + * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + */ +export async function bulkGetVideoStats(aids: number[]): Promise { + const baseURL = `https://api.bilibili.com/medialist/gateway/base/resource/infos?resources=`; + let url = baseURL; + for (const aid of aids) { + url += `${aid}:2,`; + } + const data = await netScheduler.request(url, "bulkSnapshot"); + const errMessage = `Error fetching metadata for aid list: ${aids.join(",")}:`; + if (data.code !== 0) { + logger.error(errMessage + data.code + "-" + data.message, "net", "fn:getVideoInfo"); + return data.code; + } + return data.data; +} From d229e49ff2becc2017734c5033c825ced83c12bb Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 02:58:42 +0800 Subject: [PATCH 63/79] feat: bulk fetch --- lib/db/snapshotSchedule.ts | 14 ++++++++ lib/mq/exec/snapshotTick.ts | 65 ++++++++++++++++++++++++++++++++----- src/worker.ts | 6 +++- 3 files changed, 76 insertions(+), 9 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 158fef9..9c64d1d 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -84,6 +84,14 @@ export async function videoHasProcessingSchedule(client: Client, aid: number) { return res.rows.length > 0; } +export async function bulkGetVideosWithoutProcessingSchedules(client: Client, aids: number[]) { + const res = await client.queryObject<{ aid: number }>( + `SELECT aid FROM snapshot_schedule WHERE aid = ANY($1) AND status != 'processing' GROUP BY aid`, + [aids], + ); + return res.rows.map((row) => row.aid); +} + interface Snapshot { created_at: number; views: number; @@ -196,6 +204,12 @@ export async function scheduleSnapshot(client: Client, aid: number, type: string ); } +export async function bulkScheduleSnapshot(client: Client, aids: number[], type: string, targetTime: number, force: boolean = false) { + for (const aid of aids) { + await scheduleSnapshot(client, aid, type, targetTime, force); + } +} + export async function adjustSnapshotTime( expectedStartTime: Date, allowedCounts: number = 1000, diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index f29dfce..3f1203b 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -2,6 +2,8 @@ import { Job } from "bullmq"; import { db } from "lib/db/init.ts"; import { getLatestVideoSnapshot, getVideosNearMilestone } from "lib/db/snapshot.ts"; import { + bulkGetVideosWithoutProcessingSchedules, + bulkScheduleSnapshot, bulkSetSnapshotStatus, findClosestSnapshot, findSnapshotBefore, @@ -41,6 +43,22 @@ export const snapshotTickWorker = async (_job: Job) => { const client = await 
db.connect(); try { const schedules = await getSnapshotsInNextSecond(client); + const count = schedules.length; + const groups = Math.ceil(count / 30); + for (let i = 0; i < groups; i++) { + const group = schedules.slice(i * 30, (i + 1) * 30); + const aids = group.map((schedule) => schedule.aid); + const filteredAids = await bulkGetVideosWithoutProcessingSchedules(client, aids); + if (filteredAids.length === 0) continue; + await bulkSetSnapshotStatus(client, filteredAids, "processing"); + const dataMap: { [key: number]: number } = {}; + for (const schedule of group) { + dataMap[schedule.id] = schedule.aid; + } + await SnapshotQueue.add("bulkSnapshotVideo", { + map: dataMap, + }, { priority: 3 }); + } for (const schedule of schedules) { if (await videoHasProcessingSchedule(client, schedule.aid)) { return `ALREADY_PROCESSING`; @@ -174,7 +192,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { const now = Date.now(); const lastSnapshotedAt = latestSnapshot?.time ?? now; const interval = await getRegularSnapshotInterval(client, aid); - logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq") + logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); const targetTime = truncate(lastSnapshotedAt + interval * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); if (now - startedAt > 25 * MINUTE) { @@ -190,7 +208,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { }; export const takeBulkSnapshotForVideosWorker = async (job: Job) => { - const dataMap: {[key: number]: number} = job.data.map; + const dataMap: { [key: number]: number } = job.data.map; const ids = Object.keys(dataMap).map((id) => Number(id)); const aidsToFetch: number[] = []; const client = await db.connect(); @@ -199,20 +217,47 @@ export const takeBulkSnapshotForVideosWorker = async (job: Job) => { const aid = Number(dataMap[id]); const exists = await snapshotScheduleExists(client, id); if (!exists) { - continue + continue; } aidsToFetch.push(aid); } const data = await bulkGetVideoStats(aidsToFetch); if (typeof data === "number") { await bulkSetSnapshotStatus(client, ids, "failed"); + await bulkScheduleSnapshot(client, aidsToFetch, "normal", Date.now() + 15 * SECOND); return `GET_BILI_STATUS_${data}`; } - } - finally { + for (const video of data) { + const aid = video.id; + const stat = video.cnt_info; + const views = stat.play; + const danmakus = stat.danmaku; + const replies = stat.reply; + const likes = stat.thumb_up; + const coins = stat.coin; + const shares = stat.share; + const favorites = stat.collect; + const query: string = ` + INSERT INTO video_snapshot (aid, views, danmakus, replies, likes, coins, shares, favorites) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + `; + await client.queryObject( + query, + [aid, views, danmakus, replies, likes, coins, shares, favorites], + ); + + logger.log(`Taken snapshot for video ${aid} in bulk.`, "net", "fn:takeBulkSnapshotForVideosWorker"); + } + for (const aid of aidsToFetch) { + const interval = await getRegularSnapshotInterval(client, aid); + logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); + await scheduleSnapshot(client, aid, "normal", Date.now() + interval * HOUR); + } + return `DONE`; + } finally { client.release(); } -} +}; export const takeSnapshotForVideoWorker = async (job: Job) => { const id = job.data.id; @@ -242,7 +287,7 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { await setSnapshotStatus(client, 
id, "completed"); if (type === "normal") { const interval = await getRegularSnapshotInterval(client, aid); - logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq") + logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); await scheduleSnapshot(client, aid, type, Date.now() + interval * HOUR); return `DONE`; } else if (type === "new") { @@ -309,7 +354,11 @@ export const scheduleCleanupWorker = async (_job: Job) => { const type = row.type; await setSnapshotStatus(client, id, "timeout"); await scheduleSnapshot(client, aid, type, Date.now() + 10 * SECOND); - logger.log(`Schedule ${id} has no response received for 5 minutes, rescheduled.`, "mq", "fn:scheduleCleanupWorker") + logger.log( + `Schedule ${id} has no response received for 5 minutes, rescheduled.`, + "mq", + "fn:scheduleCleanupWorker", + ); } } catch (e) { logger.error(e as Error, "mq", "fn:scheduleCleanupWorker"); diff --git a/src/worker.ts b/src/worker.ts index c8cbb6d..883ec74 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -10,7 +10,8 @@ import { regularSnapshotsWorker, snapshotTickWorker, takeSnapshotForVideoWorker, - scheduleCleanupWorker + scheduleCleanupWorker, + takeBulkSnapshotForVideosWorker } from "lib/mq/exec/snapshotTick.ts"; Deno.addSignalListener("SIGINT", async () => { @@ -84,6 +85,9 @@ const snapshotWorker = new Worker( case "scheduleCleanup": await scheduleCleanupWorker(job); break; + case "bulkSnapshotVideo": + await takeBulkSnapshotForVideosWorker(job); + break; default: break; } From d8201c7f8e382935500663d0da1869164e0a7cb3 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 03:13:05 +0800 Subject: [PATCH 64/79] update: shallow copy instead of re-asignment when creating copy of rate limitor config --- lib/mq/scheduler.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index 6345755..798a249 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -333,13 +333,13 @@ const biliLimiterConfig: RateLimiterConfig[] = [ }, ]; -const bili_test = biliLimiterConfig; +const bili_test = [...biliLimiterConfig]; bili_test[0].max = 10; bili_test[1].max = 36; bili_test[2].max = 150; bili_test[3].max = 1000; -const bili_strict = biliLimiterConfig; +const bili_strict = [...biliLimiterConfig]; bili_strict[0].max = 4; bili_strict[1].max = 8; bili_strict[2].max = 30; From 767e19b42530fa41a4437aa2eabc35c3caadbb9b Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 03:15:35 +0800 Subject: [PATCH 65/79] fix: bigint serialization failed --- lib/mq/exec/snapshotTick.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 3f1203b..ec6ab7d 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -47,20 +47,21 @@ export const snapshotTickWorker = async (_job: Job) => { const groups = Math.ceil(count / 30); for (let i = 0; i < groups; i++) { const group = schedules.slice(i * 30, (i + 1) * 30); - const aids = group.map((schedule) => schedule.aid); + const aids = group.map((schedule) => Number(schedule.aid)); const filteredAids = await bulkGetVideosWithoutProcessingSchedules(client, aids); if (filteredAids.length === 0) continue; await bulkSetSnapshotStatus(client, filteredAids, "processing"); const dataMap: { [key: number]: number } = {}; for (const schedule of group) { - dataMap[schedule.id] = schedule.aid; + const id = Number(schedule.id); + dataMap[id] = Number(schedule.aid); } await 
SnapshotQueue.add("bulkSnapshotVideo", { map: dataMap, }, { priority: 3 }); } for (const schedule of schedules) { - if (await videoHasProcessingSchedule(client, schedule.aid)) { + if (await videoHasProcessingSchedule(client, Number(schedule.aid))) { return `ALREADY_PROCESSING`; } let priority = 3; From de061eeb0f5244708a51d2a0fa3f20d04bbe19d5 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 03:21:01 +0800 Subject: [PATCH 66/79] update: rate limit config and number of schedules obtained by tick --- lib/db/snapshotSchedule.ts | 2 +- lib/mq/scheduler.ts | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 9c64d1d..ba288a4 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -272,7 +272,7 @@ export async function getSnapshotsInNextSecond(client: Client) { ELSE 1 END, started_at - LIMIT 10; + LIMIT 100; `; const res = await client.queryObject(query, []); return res.rows; diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index 798a249..6722519 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -340,9 +340,9 @@ bili_test[2].max = 150; bili_test[3].max = 1000; const bili_strict = [...biliLimiterConfig]; -bili_strict[0].max = 4; -bili_strict[1].max = 8; -bili_strict[2].max = 30; +bili_strict[0].max = 1; +bili_strict[1].max = 4; +bili_strict[2].max = 12; bili_strict[3].max = 100; /* From 7ad4255fa773a22ab89a9dee4b39c080061b8328 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 03:26:13 +0800 Subject: [PATCH 67/79] update: remove lastAvailableWindow --- lib/db/snapshotSchedule.ts | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index ba288a4..9309fa2 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -9,8 +9,6 @@ import { Redis } from "ioredis"; const WINDOW_SIZE = 2880; const REDIS_KEY = "cvsa:snapshot_window_counts"; -let lastAvailableWindow: { offset: number; count: number } | null = null; - function getCurrentWindowIndex(): number { const now = new Date(); const minutesSinceMidnight = now.getHours() * 60 + now.getMinutes(); @@ -44,8 +42,6 @@ export async function refreshSnapshotWindowCounts(client: Client, redisClient: R await redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); } } - - lastAvailableWindow = null; } export async function initSnapshotWindowCounts(client: Client, redisClient: Redis) { @@ -218,11 +214,7 @@ export async function adjustSnapshotTime( const currentWindow = getCurrentWindowIndex(); const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)) - 6; - let initialOffset = currentWindow + Math.max(targetOffset, 0); - - if (lastAvailableWindow && lastAvailableWindow.count < allowedCounts) { - initialOffset = Math.max(lastAvailableWindow.offset - 2, 0); - } + const initialOffset = currentWindow + Math.max(targetOffset, 0); let timePerIteration = 0; const t = performance.now(); @@ -231,8 +223,7 @@ export async function adjustSnapshotTime( const count = await getWindowCount(redisClient, offset); if (count < allowedCounts) { - const newCount = await redisClient.hincrby(REDIS_KEY, offset.toString(), 1); - lastAvailableWindow = { offset, count: newCount }; + await redisClient.hincrby(REDIS_KEY, offset.toString(), 1); const startPoint = new Date(); startPoint.setHours(0, 0, 0, 0); From 189bb294cb8ceb0a3baf1b71de25aafeef360a48 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 
2025 03:31:48 +0800 Subject: [PATCH 68/79] improve: logging --- lib/db/snapshotSchedule.ts | 6 +++--- lib/mq/exec/snapshotTick.ts | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 9309fa2..620df60 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -236,18 +236,18 @@ export async function adjustSnapshotTime( if (delayedDate.getTime() < now.getTime()) { const elapsed = performance.now() - t; timePerIteration = elapsed / (i+1); - logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1}iterations`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1} iterations`, "perf", "fn:adjustSnapshotTime"); return now; } const elapsed = performance.now() - t; timePerIteration = elapsed / (i+1); - logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1}iterations`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1} iterations`, "perf", "fn:adjustSnapshotTime"); return delayedDate; } } const elapsed = performance.now() - t; timePerIteration = elapsed / WINDOW_SIZE; - logger.log(`${timePerIteration.toFixed(3)}ms * ${WINDOW_SIZE}iterations`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${WINDOW_SIZE} iterations`, "perf", "fn:adjustSnapshotTime"); return expectedStartTime; } diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index ec6ab7d..86c34ba 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -193,7 +193,7 @@ export const regularSnapshotsWorker = async (_job: Job) => { const now = Date.now(); const lastSnapshotedAt = latestSnapshot?.time ?? now; const interval = await getRegularSnapshotInterval(client, aid); - logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); + logger.log(`Scheduled regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); const targetTime = truncate(lastSnapshotedAt + interval * HOUR, now + 1, now + 100000 * WEEK); await scheduleSnapshot(client, aid, "normal", targetTime); if (now - startedAt > 25 * MINUTE) { @@ -251,7 +251,7 @@ export const takeBulkSnapshotForVideosWorker = async (job: Job) => { } for (const aid of aidsToFetch) { const interval = await getRegularSnapshotInterval(client, aid); - logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); + logger.log(`Scheduled regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); await scheduleSnapshot(client, aid, "normal", Date.now() + interval * HOUR); } return `DONE`; @@ -288,7 +288,7 @@ export const takeSnapshotForVideoWorker = async (job: Job) => { await setSnapshotStatus(client, id, "completed"); if (type === "normal") { const interval = await getRegularSnapshotInterval(client, aid); - logger.log(`Schedule regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); + logger.log(`Scheduled regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); await scheduleSnapshot(client, aid, type, Date.now() + interval * HOUR); return `DONE`; } else if (type === "new") { From c12379134cb2b3b2335934d70c7aabdc5dbd2f12 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 03:44:41 +0800 Subject: [PATCH 69/79] update: error handling in bulk fetch --- lib/mq/exec/snapshotTick.ts | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 86c34ba..37daefa 100644 --- a/lib/mq/exec/snapshotTick.ts +++ 
b/lib/mq/exec/snapshotTick.ts @@ -249,13 +249,28 @@ export const takeBulkSnapshotForVideosWorker = async (job: Job) => { logger.log(`Taken snapshot for video ${aid} in bulk.`, "net", "fn:takeBulkSnapshotForVideosWorker"); } + await bulkSetSnapshotStatus(client, ids, "completed"); for (const aid of aidsToFetch) { const interval = await getRegularSnapshotInterval(client, aid); logger.log(`Scheduled regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); await scheduleSnapshot(client, aid, "normal", Date.now() + interval * HOUR); } return `DONE`; - } finally { + } catch (e) { + if (e instanceof NetSchedulerError && e.code === "NO_PROXY_AVAILABLE") { + logger.warn( + `No available proxy for aid ${job.data.aid}.`, + "mq", + "fn:takeSnapshotForVideoWorker", + ); + await bulkSetSnapshotStatus(client, ids, "completed"); + await bulkScheduleSnapshot(client, aidsToFetch, "normal", Date.now() + 2 * MINUTE); + return; + } + logger.error(e as Error, "mq", "fn:takeSnapshotForVideoWorker"); + await bulkSetSnapshotStatus(client, ids, "failed"); + } + finally { client.release(); } }; From cd160c486eaf34525d43bcdd56e1ffc0e14a2f47 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 03:51:35 +0800 Subject: [PATCH 70/79] update: logging in bulk --- lib/mq/exec/snapshotTick.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 37daefa..a4aa147 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -259,15 +259,15 @@ export const takeBulkSnapshotForVideosWorker = async (job: Job) => { } catch (e) { if (e instanceof NetSchedulerError && e.code === "NO_PROXY_AVAILABLE") { logger.warn( - `No available proxy for aid ${job.data.aid}.`, + `No available proxy for bulk request now.`, "mq", - "fn:takeSnapshotForVideoWorker", + "fn:takeBulkSnapshotForVideosWorker", ); await bulkSetSnapshotStatus(client, ids, "completed"); await bulkScheduleSnapshot(client, aidsToFetch, "normal", Date.now() + 2 * MINUTE); return; } - logger.error(e as Error, "mq", "fn:takeSnapshotForVideoWorker"); + logger.error(e as Error, "mq", "fn:takeBulkSnapshotForVideosWorker"); await bulkSetSnapshotStatus(client, ids, "failed"); } finally { From 49098763f1b9d7ddb1d4029c0a12238adf06c34f Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 03:57:47 +0800 Subject: [PATCH 71/79] fix: mixing bulk tasks with other tasks --- lib/db/snapshotSchedule.ts | 14 +++++++++++++- lib/mq/exec/snapshotTick.ts | 16 ++++++++++++++-- lib/mq/init.ts | 10 ++++++++++ src/worker.ts | 6 +++++- 4 files changed, 42 insertions(+), 4 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 620df60..c056665 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -256,13 +256,25 @@ export async function getSnapshotsInNextSecond(client: Client) { const query = ` SELECT * FROM snapshot_schedule - WHERE started_at <= NOW() + INTERVAL '1 seconds' AND status = 'pending' + WHERE started_at <= NOW() + INTERVAL '1 seconds' AND status = 'pending' AND type != 'normal' ORDER BY CASE WHEN type = 'milestone' THEN 0 ELSE 1 END, started_at + LIMIT 10; + `; + const res = await client.queryObject(query, []); + return res.rows; +} + +export async function getBulkSnapshotsInNextSecond(client: Client) { + const query = ` + SELECT * + FROM snapshot_schedule + WHERE started_at <= NOW() + INTERVAL '15 seconds' AND status = 'pending' AND type = 'normal' + ORDER BY started_at LIMIT 100; `; const res = await 
client.queryObject(query, []); diff --git a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index a4aa147..319a20c 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -15,6 +15,7 @@ import { setSnapshotStatus, snapshotScheduleExists, videoHasProcessingSchedule, + getBulkSnapshotsInNextSecond } from "lib/db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { HOUR, MINUTE, SECOND, WEEK } from "$std/datetime/constants.ts"; @@ -39,10 +40,10 @@ const snapshotTypeToTaskMap: { [key: string]: string } = { "new": "snapshotMilestoneVideo", }; -export const snapshotTickWorker = async (_job: Job) => { +export const bulkSnapshotTickWorker = async (_job: Job) => { const client = await db.connect(); try { - const schedules = await getSnapshotsInNextSecond(client); + const schedules = await getBulkSnapshotsInNextSecond(client); const count = schedules.length; const groups = Math.ceil(count / 30); for (let i = 0; i < groups; i++) { @@ -60,6 +61,17 @@ export const snapshotTickWorker = async (_job: Job) => { map: dataMap, }, { priority: 3 }); } + } catch (e) { + logger.error(e as Error); + } finally { + client.release(); + } +}; + +export const snapshotTickWorker = async (_job: Job) => { + const client = await db.connect(); + try { + const schedules = await getSnapshotsInNextSecond(client); for (const schedule of schedules) { if (await videoHasProcessingSchedule(client, Number(schedule.aid))) { return `ALREADY_PROCESSING`; diff --git a/lib/mq/init.ts b/lib/mq/init.ts index e416988..d408f8e 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -35,6 +35,16 @@ export async function initMQ() { }, }); + await SnapshotQueue.upsertJobScheduler("bulkSnapshotTick", { + every: 15 * SECOND, + immediately: true, + }, { + opts: { + removeOnComplete: 1, + removeOnFail: 1, + }, + }); + await SnapshotQueue.upsertJobScheduler("collectMilestoneSnapshots", { every: 5 * MINUTE, immediately: true, diff --git a/src/worker.ts b/src/worker.ts index 883ec74..fae7b6a 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -11,7 +11,8 @@ import { snapshotTickWorker, takeSnapshotForVideoWorker, scheduleCleanupWorker, - takeBulkSnapshotForVideosWorker + takeBulkSnapshotForVideosWorker, + bulkSnapshotTickWorker } from "lib/mq/exec/snapshotTick.ts"; Deno.addSignalListener("SIGINT", async () => { @@ -88,6 +89,9 @@ const snapshotWorker = new Worker( case "bulkSnapshotVideo": await takeBulkSnapshotForVideosWorker(job); break; + case "bulkSnapshotTick": + await bulkSnapshotTickWorker(job); + break; default: break; } From 6a7f246562d01f6bafd699b535407ec41beb0815 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 04:05:35 +0800 Subject: [PATCH 72/79] update: increase limit of getBulkSnapshotsInNextSecond --- lib/db/snapshotSchedule.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index c056665..33dd654 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -275,7 +275,7 @@ export async function getBulkSnapshotsInNextSecond(client: Client) { FROM snapshot_schedule WHERE started_at <= NOW() + INTERVAL '15 seconds' AND status = 'pending' AND type = 'normal' ORDER BY started_at - LIMIT 100; + LIMIT 1000; `; const res = await client.queryObject(query, []); return res.rows; From 01171f5de3fac2f135ef2bdb28eb826fa20b5db9 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 04:14:12 +0800 Subject: [PATCH 73/79] update: remove WINDOW_SIZE --- 
lib/db/snapshotSchedule.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 33dd654..62fab1c 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -6,7 +6,6 @@ import { MINUTE } from "$std/datetime/constants.ts"; import { redis } from "lib/db/redis.ts"; import { Redis } from "ioredis"; -const WINDOW_SIZE = 2880; const REDIS_KEY = "cvsa:snapshot_window_counts"; function getCurrentWindowIndex(): number { @@ -38,7 +37,7 @@ export async function refreshSnapshotWindowCounts(client: Client, redisClient: R for (const row of result.rows) { const targetOffset = Math.floor((row.window_start.getTime() - startTime) / (5 * MINUTE)); const offset = (currentWindow + targetOffset); - if (offset >= 0 && offset < WINDOW_SIZE) { + if (offset >= 0) { await redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); } } @@ -217,8 +216,9 @@ export async function adjustSnapshotTime( const initialOffset = currentWindow + Math.max(targetOffset, 0); let timePerIteration = 0; + const MAX_ITERATIONS = 2880; const t = performance.now(); - for (let i = initialOffset; i < WINDOW_SIZE; i++) { + for (let i = initialOffset; i < MAX_ITERATIONS; i++) { const offset = i; const count = await getWindowCount(redisClient, offset); @@ -246,8 +246,8 @@ export async function adjustSnapshotTime( } } const elapsed = performance.now() - t; - timePerIteration = elapsed / WINDOW_SIZE; - logger.log(`${timePerIteration.toFixed(3)}ms * ${WINDOW_SIZE} iterations`, "perf", "fn:adjustSnapshotTime"); + timePerIteration = elapsed / MAX_ITERATIONS; + logger.log(`${timePerIteration.toFixed(3)}ms * ${MAX_ITERATIONS} iterations`, "perf", "fn:adjustSnapshotTime"); return expectedStartTime; } From 9e3cc8236c3616fda840ba9f64305b288b0cfa9f Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 04:15:46 +0800 Subject: [PATCH 74/79] fix: incorrect iterations counting --- lib/db/snapshotSchedule.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/db/snapshotSchedule.ts b/lib/db/snapshotSchedule.ts index 62fab1c..68228b7 100644 --- a/lib/db/snapshotSchedule.ts +++ b/lib/db/snapshotSchedule.ts @@ -217,8 +217,10 @@ export async function adjustSnapshotTime( let timePerIteration = 0; const MAX_ITERATIONS = 2880; + let iters = 0; const t = performance.now(); for (let i = initialOffset; i < MAX_ITERATIONS; i++) { + iters++; const offset = i; const count = await getWindowCount(redisClient, offset); @@ -236,12 +238,12 @@ export async function adjustSnapshotTime( if (delayedDate.getTime() < now.getTime()) { const elapsed = performance.now() - t; timePerIteration = elapsed / (i+1); - logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1} iterations`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${iters} iterations`, "perf", "fn:adjustSnapshotTime"); return now; } const elapsed = performance.now() - t; timePerIteration = elapsed / (i+1); - logger.log(`${timePerIteration.toFixed(3)}ms * ${i+1} iterations`, "perf", "fn:adjustSnapshotTime"); + logger.log(`${timePerIteration.toFixed(3)}ms * ${iters} iterations`, "perf", "fn:adjustSnapshotTime"); return delayedDate; } } From 7337538f0be8148c79923e0a1463b270b2fe6b39 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Thu, 27 Mar 2025 05:49:19 +0800 Subject: [PATCH 75/79] fix: snapshotBefore may returns null even there's a snapshot exists --- lib/mq/exec/snapshotTick.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/lib/mq/exec/snapshotTick.ts b/lib/mq/exec/snapshotTick.ts index 319a20c..b18d845 100644 --- a/lib/mq/exec/snapshotTick.ts +++ b/lib/mq/exec/snapshotTick.ts @@ -173,7 +173,8 @@ export const collectMilestoneSnapshotsWorker = async (_job: Job) => { const getRegularSnapshotInterval = async (client: Client, aid: number) => { const now = Date.now(); const date = new Date(now - 24 * HOUR); - const oldSnapshot = await findSnapshotBefore(client, aid, date); + let oldSnapshot = await findSnapshotBefore(client, aid, date); + if (!oldSnapshot) oldSnapshot = await findClosestSnapshot(client, aid, date); const latestSnapshot = await getLatestSnapshot(client, aid); if (!oldSnapshot || !latestSnapshot) return 0; if (oldSnapshot.created_at === latestSnapshot.created_at) return 0; @@ -181,7 +182,7 @@ const getRegularSnapshotInterval = async (client: Client, aid: number) => { if (hoursDiff < 8) return 24; const viewsDiff = latestSnapshot.views - oldSnapshot.views; if (viewsDiff === 0) return 72; - const speedPerDay = viewsDiff / hoursDiff * 24; + const speedPerDay = viewsDiff / (hoursDiff + 0.001) * 24; if (speedPerDay < 6) return 36; if (speedPerDay < 120) return 24; if (speedPerDay < 320) return 12; From 636c5e25cb2ee894a1df37f551b2b3e89c93d8f0 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 29 Mar 2025 14:13:15 +0800 Subject: [PATCH 76/79] ref: move ML stuff add: .idea to VCS, the refactor guide --- .gitignore | 11 ++- .idea/.gitignore | 9 +++ .idea/cvsa.iml | 21 ++++++ .idea/inspectionProfiles/Project_Default.xml | 12 ++++ .idea/modules.xml | 8 +++ .idea/sqldialects.xml | 6 ++ .idea/vcs.xml | 6 ++ README-refactor.md | 65 ++++++++++++++++++ components/Button.tsx | 12 ---- data/filter/1.py | 55 --------------- dev.ts | 7 -- fresh.config.ts | 6 -- fresh.gen.ts | 27 -------- islands/Counter.tsx | 16 ----- main.ts | 13 ---- {filter => ml/filter}/RunningLogs.txt | 0 .../filter}/checkpoint_conversion.py | 0 {filter => ml/filter}/clean_dataset.py | 0 {filter => ml/filter}/dataset.py | 0 {filter => ml/filter}/db_utils.py | 0 {filter => ml/filter}/embedding.py | 0 {filter => ml/filter}/embedding_range.py | 0 .../filter}/embedding_visualization.py | 0 {filter => ml/filter}/labeling_system.py | 0 {filter => ml/filter}/model.py | 0 {filter => ml/filter}/modelV3_10.py | 0 {filter => ml/filter}/modelV3_12.py | 0 {filter => ml/filter}/modelV3_15.py | 0 {filter => ml/filter}/modelV6_0.py | 0 {filter => ml/filter}/onnx_export.py | 0 {filter => ml/filter}/predict.py | 0 {filter => ml/filter}/quantize.py | 0 {filter => ml/filter}/tag.py | 0 {filter => ml/filter}/test.py | 0 {filter => ml/filter}/train.py | 0 {lab => ml/lab}/.gitignore | 0 {lab => ml/lab}/align-pipeline.md | 0 {lab => ml/lab}/mmsAlignment/align2LRC.py | 0 {lab => ml/lab}/mmsAlignment/alignWithMMS.py | 0 {lab => ml/lab}/mmsAlignment/splitSong.py | 0 {lab => ml/lab}/utils/audio.py | 0 {lab => ml/lab}/utils/cleanTempDir.py | 0 {lab => ml/lab}/utils/ttml.py | 0 {lab => ml/lab}/whisperAlignment/align2srt.py | 0 .../lab}/whisperAlignment/alignWithGroup.py | 0 .../lab}/whisperAlignment/splitGroups.py | 0 {lab => ml/lab}/whisperAlignment/srt2lrc.py | 0 {pred => ml/pred}/count.py | 0 {pred => ml/pred}/crawler.py | 0 {pred => ml/pred}/dataset.py | 0 {pred => ml/pred}/export_onnx.py | 0 {pred => ml/pred}/inference.py | 0 {pred => ml/pred}/model.py | 0 {pred => ml/pred}/train.py | 0 routes/_404.tsx | 27 -------- routes/_app.tsx | 16 ----- routes/api/joke.ts | 21 ------ routes/greet/[name].tsx | 5 -- routes/index.tsx | 25 ------- static/favicon.ico | Bin 22382 -> 0 
bytes static/logo.svg | 6 -- static/styles.css | 3 - tailwind.config.ts | 7 -- 63 files changed, 132 insertions(+), 252 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/cvsa.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/sqldialects.xml create mode 100644 .idea/vcs.xml create mode 100644 README-refactor.md delete mode 100644 components/Button.tsx delete mode 100644 data/filter/1.py delete mode 100755 dev.ts delete mode 100644 fresh.config.ts delete mode 100644 fresh.gen.ts delete mode 100644 islands/Counter.tsx delete mode 100644 main.ts rename {filter => ml/filter}/RunningLogs.txt (100%) rename {filter => ml/filter}/checkpoint_conversion.py (100%) rename {filter => ml/filter}/clean_dataset.py (100%) rename {filter => ml/filter}/dataset.py (100%) rename {filter => ml/filter}/db_utils.py (100%) rename {filter => ml/filter}/embedding.py (100%) rename {filter => ml/filter}/embedding_range.py (100%) rename {filter => ml/filter}/embedding_visualization.py (100%) rename {filter => ml/filter}/labeling_system.py (100%) rename {filter => ml/filter}/model.py (100%) rename {filter => ml/filter}/modelV3_10.py (100%) rename {filter => ml/filter}/modelV3_12.py (100%) rename {filter => ml/filter}/modelV3_15.py (100%) rename {filter => ml/filter}/modelV6_0.py (100%) rename {filter => ml/filter}/onnx_export.py (100%) rename {filter => ml/filter}/predict.py (100%) rename {filter => ml/filter}/quantize.py (100%) rename {filter => ml/filter}/tag.py (100%) rename {filter => ml/filter}/test.py (100%) rename {filter => ml/filter}/train.py (100%) rename {lab => ml/lab}/.gitignore (100%) rename {lab => ml/lab}/align-pipeline.md (100%) rename {lab => ml/lab}/mmsAlignment/align2LRC.py (100%) rename {lab => ml/lab}/mmsAlignment/alignWithMMS.py (100%) rename {lab => ml/lab}/mmsAlignment/splitSong.py (100%) rename {lab => ml/lab}/utils/audio.py (100%) rename {lab => ml/lab}/utils/cleanTempDir.py (100%) rename {lab => ml/lab}/utils/ttml.py (100%) rename {lab => ml/lab}/whisperAlignment/align2srt.py (100%) rename {lab => ml/lab}/whisperAlignment/alignWithGroup.py (100%) rename {lab => ml/lab}/whisperAlignment/splitGroups.py (100%) rename {lab => ml/lab}/whisperAlignment/srt2lrc.py (100%) rename {pred => ml/pred}/count.py (100%) rename {pred => ml/pred}/crawler.py (100%) rename {pred => ml/pred}/dataset.py (100%) rename {pred => ml/pred}/export_onnx.py (100%) rename {pred => ml/pred}/inference.py (100%) rename {pred => ml/pred}/model.py (100%) rename {pred => ml/pred}/train.py (100%) delete mode 100644 routes/_404.tsx delete mode 100644 routes/_app.tsx delete mode 100644 routes/api/joke.ts delete mode 100644 routes/greet/[name].tsx delete mode 100644 routes/index.tsx delete mode 100644 static/favicon.ico delete mode 100644 static/logo.svg delete mode 100644 static/styles.css delete mode 100644 tailwind.config.ts diff --git a/.gitignore b/.gitignore index 31d6ddf..58df6d2 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ internal/ !tests/cases/projects/projectOption/**/node_modules !tests/cases/projects/NodeModulesSearch/**/* !tests/baselines/reference/project/nodeModules*/**/* -.idea yarn.lock yarn-error.log .parallelperf.* @@ -78,10 +77,10 @@ node_modules/ # project specific logs/ __pycache__ -filter/runs -pred/runs -pred/checkpoints -data/ -filter/checkpoints +ml/filter/runs +ml/pred/runs +ml/pred/checkpoints +ml/data/ +ml/filter/checkpoints scripts model/ diff --git a/.idea/.gitignore b/.idea/.gitignore 
new file mode 100644 index 0000000..518076d --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,9 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +dataSources.xml \ No newline at end of file diff --git a/.idea/cvsa.iml b/.idea/cvsa.iml new file mode 100644 index 0000000..c155925 --- /dev/null +++ b/.idea/cvsa.iml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..5535e8f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,12 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..4552e71 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml new file mode 100644 index 0000000..6df4889 --- /dev/null +++ b/.idea/sqldialects.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README-refactor.md b/README-refactor.md new file mode 100644 index 0000000..75ffdb9 --- /dev/null +++ b/README-refactor.md @@ -0,0 +1,65 @@ +# 项目重构方案 + +## 目标架构 +采用monorepo结构管理三个独立部分: +1. `packages/crawler` - 现有爬虫功能 +2. `packages/frontend` - 基于Astro的前端 +3. `packages/backend` - 基于Hono的API后端 + +## 目录结构调整方案 + +### 新结构 +``` +. +├── packages/ +│ ├── crawler/ # 爬虫组件 +│ ├── frontend/ # Astro前端 +│ ├── backend/ # Hono后端API +│ └── core/ # 共享代码(未来提取) +├── docs/ # 文档 +├── scripts/ # 项目脚本 +└── package.json # 根项目配置 +``` + +### 具体迁移方案 + +#### 1. 爬虫部分(crawler) +保留以下目录/文件: +- `lib/` (除前端相关) +- `src/db/raw/` +- `src/filterWorker.ts` +- `src/worker.ts` +- `test/` +- `deno.json` +- `.gitignore` + +需要移除: +- Fresh框架相关文件 +- 前端组件(`components/`) +- 静态资源(`static/`) + +#### 2. 前端部分(frontend) +全新创建Astro项目,不保留任何现有前端代码 + +#### 3. 后端部分(backend) +全新创建Hono项目 + +#### 4. 共享代码(core) +未来可从爬虫中提取以下内容到core package: +- 数据库相关:`lib/db/` +- 消息队列:`lib/mq/` +- 网络请求:`lib/net/` +- 工具函数:`lib/utils/` + +## 重构步骤建议 + +1. 初始化monorepo结构 +2. 迁移爬虫代码到`packages/crawler` +3. 创建新的Astro项目在`packages/frontend` +4. 创建新的Hono项目在`packages/backend` +5. 逐步提取共享代码到`packages/core` + +## 注意事项 +- 机器学习相关代码(`pred/`, `filter/`, `lab/`)保持现状 +- 文档(`doc/`)可以迁移到`docs/`目录 +- 需要更新CI/CD流程支持monorepo \ No newline at end of file diff --git a/components/Button.tsx b/components/Button.tsx deleted file mode 100644 index 6e868c5..0000000 --- a/components/Button.tsx +++ /dev/null @@ -1,12 +0,0 @@ -import { JSX } from "preact"; -import { IS_BROWSER } from "$fresh/runtime.ts"; - -export function Button(props: JSX.HTMLAttributes) { - return ( - -

- {props.count}

- - - ); -} diff --git a/main.ts b/main.ts deleted file mode 100644 index 675f529..0000000 --- a/main.ts +++ /dev/null @@ -1,13 +0,0 @@ -/// -/// -/// -/// -/// - -import "$std/dotenv/load.ts"; - -import { start } from "$fresh/server.ts"; -import manifest from "./fresh.gen.ts"; -import config from "./fresh.config.ts"; - -await start(manifest, config); diff --git a/filter/RunningLogs.txt b/ml/filter/RunningLogs.txt similarity index 100% rename from filter/RunningLogs.txt rename to ml/filter/RunningLogs.txt diff --git a/filter/checkpoint_conversion.py b/ml/filter/checkpoint_conversion.py similarity index 100% rename from filter/checkpoint_conversion.py rename to ml/filter/checkpoint_conversion.py diff --git a/filter/clean_dataset.py b/ml/filter/clean_dataset.py similarity index 100% rename from filter/clean_dataset.py rename to ml/filter/clean_dataset.py diff --git a/filter/dataset.py b/ml/filter/dataset.py similarity index 100% rename from filter/dataset.py rename to ml/filter/dataset.py diff --git a/filter/db_utils.py b/ml/filter/db_utils.py similarity index 100% rename from filter/db_utils.py rename to ml/filter/db_utils.py diff --git a/filter/embedding.py b/ml/filter/embedding.py similarity index 100% rename from filter/embedding.py rename to ml/filter/embedding.py diff --git a/filter/embedding_range.py b/ml/filter/embedding_range.py similarity index 100% rename from filter/embedding_range.py rename to ml/filter/embedding_range.py diff --git a/filter/embedding_visualization.py b/ml/filter/embedding_visualization.py similarity index 100% rename from filter/embedding_visualization.py rename to ml/filter/embedding_visualization.py diff --git a/filter/labeling_system.py b/ml/filter/labeling_system.py similarity index 100% rename from filter/labeling_system.py rename to ml/filter/labeling_system.py diff --git a/filter/model.py b/ml/filter/model.py similarity index 100% rename from filter/model.py rename to ml/filter/model.py diff --git a/filter/modelV3_10.py b/ml/filter/modelV3_10.py similarity index 100% rename from filter/modelV3_10.py rename to ml/filter/modelV3_10.py diff --git a/filter/modelV3_12.py b/ml/filter/modelV3_12.py similarity index 100% rename from filter/modelV3_12.py rename to ml/filter/modelV3_12.py diff --git a/filter/modelV3_15.py b/ml/filter/modelV3_15.py similarity index 100% rename from filter/modelV3_15.py rename to ml/filter/modelV3_15.py diff --git a/filter/modelV6_0.py b/ml/filter/modelV6_0.py similarity index 100% rename from filter/modelV6_0.py rename to ml/filter/modelV6_0.py diff --git a/filter/onnx_export.py b/ml/filter/onnx_export.py similarity index 100% rename from filter/onnx_export.py rename to ml/filter/onnx_export.py diff --git a/filter/predict.py b/ml/filter/predict.py similarity index 100% rename from filter/predict.py rename to ml/filter/predict.py diff --git a/filter/quantize.py b/ml/filter/quantize.py similarity index 100% rename from filter/quantize.py rename to ml/filter/quantize.py diff --git a/filter/tag.py b/ml/filter/tag.py similarity index 100% rename from filter/tag.py rename to ml/filter/tag.py diff --git a/filter/test.py b/ml/filter/test.py similarity index 100% rename from filter/test.py rename to ml/filter/test.py diff --git a/filter/train.py b/ml/filter/train.py similarity index 100% rename from filter/train.py rename to ml/filter/train.py diff --git a/lab/.gitignore b/ml/lab/.gitignore similarity index 100% rename from lab/.gitignore rename to ml/lab/.gitignore diff --git a/lab/align-pipeline.md b/ml/lab/align-pipeline.md similarity 
index 100% rename from lab/align-pipeline.md rename to ml/lab/align-pipeline.md diff --git a/lab/mmsAlignment/align2LRC.py b/ml/lab/mmsAlignment/align2LRC.py similarity index 100% rename from lab/mmsAlignment/align2LRC.py rename to ml/lab/mmsAlignment/align2LRC.py diff --git a/lab/mmsAlignment/alignWithMMS.py b/ml/lab/mmsAlignment/alignWithMMS.py similarity index 100% rename from lab/mmsAlignment/alignWithMMS.py rename to ml/lab/mmsAlignment/alignWithMMS.py diff --git a/lab/mmsAlignment/splitSong.py b/ml/lab/mmsAlignment/splitSong.py similarity index 100% rename from lab/mmsAlignment/splitSong.py rename to ml/lab/mmsAlignment/splitSong.py diff --git a/lab/utils/audio.py b/ml/lab/utils/audio.py similarity index 100% rename from lab/utils/audio.py rename to ml/lab/utils/audio.py diff --git a/lab/utils/cleanTempDir.py b/ml/lab/utils/cleanTempDir.py similarity index 100% rename from lab/utils/cleanTempDir.py rename to ml/lab/utils/cleanTempDir.py diff --git a/lab/utils/ttml.py b/ml/lab/utils/ttml.py similarity index 100% rename from lab/utils/ttml.py rename to ml/lab/utils/ttml.py diff --git a/lab/whisperAlignment/align2srt.py b/ml/lab/whisperAlignment/align2srt.py similarity index 100% rename from lab/whisperAlignment/align2srt.py rename to ml/lab/whisperAlignment/align2srt.py diff --git a/lab/whisperAlignment/alignWithGroup.py b/ml/lab/whisperAlignment/alignWithGroup.py similarity index 100% rename from lab/whisperAlignment/alignWithGroup.py rename to ml/lab/whisperAlignment/alignWithGroup.py diff --git a/lab/whisperAlignment/splitGroups.py b/ml/lab/whisperAlignment/splitGroups.py similarity index 100% rename from lab/whisperAlignment/splitGroups.py rename to ml/lab/whisperAlignment/splitGroups.py diff --git a/lab/whisperAlignment/srt2lrc.py b/ml/lab/whisperAlignment/srt2lrc.py similarity index 100% rename from lab/whisperAlignment/srt2lrc.py rename to ml/lab/whisperAlignment/srt2lrc.py diff --git a/pred/count.py b/ml/pred/count.py similarity index 100% rename from pred/count.py rename to ml/pred/count.py diff --git a/pred/crawler.py b/ml/pred/crawler.py similarity index 100% rename from pred/crawler.py rename to ml/pred/crawler.py diff --git a/pred/dataset.py b/ml/pred/dataset.py similarity index 100% rename from pred/dataset.py rename to ml/pred/dataset.py diff --git a/pred/export_onnx.py b/ml/pred/export_onnx.py similarity index 100% rename from pred/export_onnx.py rename to ml/pred/export_onnx.py diff --git a/pred/inference.py b/ml/pred/inference.py similarity index 100% rename from pred/inference.py rename to ml/pred/inference.py diff --git a/pred/model.py b/ml/pred/model.py similarity index 100% rename from pred/model.py rename to ml/pred/model.py diff --git a/pred/train.py b/ml/pred/train.py similarity index 100% rename from pred/train.py rename to ml/pred/train.py diff --git a/routes/_404.tsx b/routes/_404.tsx deleted file mode 100644 index 4628eeb..0000000 --- a/routes/_404.tsx +++ /dev/null @@ -1,27 +0,0 @@ -import { Head } from "$fresh/runtime.ts"; - -export default function Error404() { - return ( - <> - - 404 - Page not found - -
-
- the Fresh logo: a sliced lemon dripping with juice
- 404 - Page not found
- The page you were looking for doesn't exist.
- Go back home
- - ); -} diff --git a/routes/_app.tsx b/routes/_app.tsx deleted file mode 100644 index a44414e..0000000 --- a/routes/_app.tsx +++ /dev/null @@ -1,16 +0,0 @@ -import { type PageProps } from "$fresh/server.ts"; -export default function App({ Component }: PageProps) { - return ( - - - - - cvsa - - - - - - - ); -} diff --git a/routes/api/joke.ts b/routes/api/joke.ts deleted file mode 100644 index 68b0ebe..0000000 --- a/routes/api/joke.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { FreshContext } from "$fresh/server.ts"; - -// Jokes courtesy of https://punsandoneliners.com/randomness/programmer-jokes/ -const JOKES = [ - "Why do Java developers often wear glasses? They can't C#.", - "A SQL query walks into a bar, goes up to two tables and says “can I join you?”", - "Wasn't hard to crack Forrest Gump's password. 1forrest1.", - "I love pressing the F5 key. It's refreshing.", - "Called IT support and a chap from Australia came to fix my network connection. I asked “Do you come from a LAN down under?”", - "There are 10 types of people in the world. Those who understand binary and those who don't.", - "Why are assembly programmers often wet? They work below C level.", - "My favourite computer based band is the Black IPs.", - "What programme do you use to predict the music tastes of former US presidential candidates? An Al Gore Rhythm.", - "An SEO expert walked into a bar, pub, inn, tavern, hostelry, public house.", -]; - -export const handler = (_req: Request, _ctx: FreshContext): Response => { - const randomIndex = Math.floor(Math.random() * JOKES.length); - const body = JOKES[randomIndex]; - return new Response(body); -}; diff --git a/routes/greet/[name].tsx b/routes/greet/[name].tsx deleted file mode 100644 index a7a5fe1..0000000 --- a/routes/greet/[name].tsx +++ /dev/null @@ -1,5 +0,0 @@ -import { PageProps } from "$fresh/server.ts"; - -export default function Greet(props: PageProps) { - return
Hello {props.params.name}
; -} diff --git a/routes/index.tsx b/routes/index.tsx deleted file mode 100644 index 67a22a7..0000000 --- a/routes/index.tsx +++ /dev/null @@ -1,25 +0,0 @@ -import { useSignal } from "@preact/signals"; -import Counter from "../islands/Counter.tsx"; - -export default function Home() { - const count = useSignal(3); - return ( -
-
- the Fresh logo: a sliced lemon dripping with juice
- Welcome to Fresh
- Try updating this message in the ./routes/index.tsx file, and refresh.
- ); -} diff --git a/static/favicon.ico b/static/favicon.ico deleted file mode 100644 index 1cfaaa2193b0f210107a559f7421569f57a25388..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22382 zcmeI4dw7?{mB%N97z7oqA|OH{6p11r>cU#lM7K(nW`t#!NG z`qUAy{#t>9K|!BwH6TqGo5?%XehL;`0&-}m=Ue0llhcL@pl$8VmT z%zK+TmpOCh%*>geb9pY`9euPTFLpO|c5Z}ouDCdHKbPk-c~(}IxG%ZDxr=%@SHd^E zqD103nR9%XEERoVu3rrLu0HUY|1MgG%1x{{_pcwC`)FSxKQUHUyl&n5r0WaUnLDS_ zO1@EJ-yc$bGez?bM z=RUI!pyBE&vtsb~Nlt_6nbdbp$ix3y;iH@E#h>mpJEOtu-!_}g;rgj-#Y+6IA}J3UgmtZ|>|08$6-G-YTPxu6$cc zJ}Rv5v(Pi0IwV{0`8sY^c>!W~<7>=~Tx&xf*kG?*vC-^u@LmTG`5`^sYZLs?&Z47< zau=(tlCR@3bgovaC9=>IxZ5Az`p`7QbsLpKRZnMv?v+|=>T0dXj*Kq-QIJBHP z|7e}QxX#YKtKQ~J++@|)ZM40&Ldy@fo4v5p8sT>e-{eKhtBxXMsXo$eWkM!yf#sjQ z)=I9cwrlAl)9$Ue??K~b`75l;@nQc`xp-2&f?j+x6#e{Gt+~pN%r!Kd8&_?vC(rv! ze}Ht!_gP;j?HADK%gukuxzat@j{@hWVjre<;!Qq~$8`v0%_HeUVb!WU|dRvpYNRdVE0va2Ds}tG@I?%%a~DZ z+u;ANyx$6VJD+L3fikD4Zsd}Z1bxF8E4%;Tv)D7AWShaCDZco3qWL`4-3NQ6JX!L# z2>aLL3+wIesy!aN+3%o*_wjnOxnB(4A;K+4CI|nHcE0+djrP&U*v&M4mmWAyW`kef zz77<7JW(0QR;%5+uC(JAkN>i~F^WBL{Ul@l$&8Ol#`|pOm;?U(d?e8!{3VQSyu0lu zn+#9If`7ZYLIqor{0{UZprMU)G=k$RaT(~I@y`t|x9P9#O8825gX?_8`YRdhr_uf| zB9mJBLOCrXzvZHJ37u#I9gD!%T{vaS0{+PdAp>-5;#}}91;>&2De{-Re^AK%5d4cb z@ZpryH)k^L{|j`;?-5XECh!lwyHNNA9>1=ST4lrWb?V;-zx*PPyCsL7Teh100YBwG z@ZZ)$Lk+t5U&!f4(UXUhWX$L#^pGEF9(hHouNT}5kqHs3>k-OExcn zdoS&PAEWv6LU13Ej`wK01hhhfWN|U`NqoW~rpIwLUuUYkFY^z*&!tbF1QH%q;{WbhR$6z5Te#G@DZsd`&W)Mv z+#sN5nRDG1C7^)3fcrx7{Mo>B0N>}=0XupA5%2d-bp`ttxk5YLb+?tSo7K9W)>L^T z-u$d6POXPhmzxS`9W_X0i7fX&CxM&fK@;>uo2i2g4Xk^fcJq# zz%1Y{pcLo>+zc!Ob^yD98ej&XcL9A-n%na_(w5i5>n`n4|A9I2>&(wtx3EFw!TQ6G z!!{Dnqkw6E_|RU7_MRoHwt)Cu4T$Gt<$uldjP_yLA`|KkWJ_L5yRTp$IM_Gv^9TH7d(H+5m#AY8&`~LM()|s}j?h{Y1vNjajf>d;N)H~_g2=U+EGVpbhkEVThJ<6I} zvb2_cjen{*U@f?#_>I>qyKp<>qxOc|RR*drT;FA^klo=-fGVuB7z1b#gg zyLT)59Q%Hs#O_69@djfd>$LIxkYsdr{{BkkIF`|1nLK$0vXJOkFMe+8yyIFFQDK5g4hWoMl`F$P!Pm% z27A??tUZ)pbe;G)rY>_G2>Cx1`&V}-`)qqs*!)z2S&Tg-)+vbn)VP2=y>1@LT(Ml5 zYi6tiA^#UbZ=?1gqp2Lo^Vm0pM-G6fZEPY;aC7WsZxTv&0`~u%-en6~Q;2#`f zIqZX<+r?9V;!`t8A^&C2xob9j`cwn&=Q75}_kk6w;P=dLz)sG>7gn4?)K_RkFtUxr z9JIu696~uLM(kMerSTwL3i&@7pQl>%`lS8-Wbp`bc_>yx`_yBZ7r%=fqDlIp7_dpy z>*IP3fgBW@H74XM9sAz)A5NcLpja&Jb1TiGKgZ)z;=J#7&l-W^I%E&yNpe_*9PTED zf!MG^;Wy9dpW!~S_kC!W37YRdAKL#n>Ep)`gRmcuv~{Zc6VZc}p$@!5`9Hz4{3M@b zTVJEUd=2{`Tpc)O{+;&kAstAUyq=Kvm*2104$W^AlT$`KRw{nu@6;FOz~3rlFch8d z2A`MHFJ49th@&N`{-?30oCyhJ&;flybL6wdn|!-;$;$vbCaYb1%Qu zPLeUe^O|kmhyI}$P{r~1q)V-*5OWgn-j2HPP|&U!w7&$@`<)g)_-gv)?(d+#>bn2U zI1t2;rs@0H$YLZi{XO+Y)j@VwYpX-b+s!`C#t#nG)YB>e9|W>OS6KfmqzxWdjPgAC zsAQlR-fZ~G8}T>Rpl3b_*CKR5>u$1*2dN9s!&8Cy$~3jefVF-4!IF^`i5O7% zdKbs~bS6Az@{Qv9o@T6#h#}~E#8De()(&QjSism;sPQe+R20VbhjKU%8B|@uS^(#g z0-K&m9B(E($G?#-+=ebx(Fc5zKRJhI8N>j$W;0)g_b%D+FF6IgD>e_i!SyxBU>mV_ z)<6R-K@KIfOPv1px<4Dc@CsvPG%1dLG;IJKt?}8~^B1B2F!7UZ@_PWtPWIzY*+b&l zZ4>RIc-=v*$Ux)2Y-JG7+D3b+c;BB87aR4Pbl&o-)R(0_cpBP+HR5df*Y}c}fc@Cc z;GG0C>3pQl3oJ$tPG@{b*6zKaUuPN>Uwk1pLq611tfN1G4eibNm#j?undB$iSQi;5 z>%pryaA?X@4v%>r+QNTS2GnyH{7*&?8a2n)nI8Fg;w#pRi1(QBO-UW_b#lJ9&UGKZE_p#9e?1KKn6e_G=|st3qG z{pkj5QG?D={fU06q%%G8aietWjKNfVy=77YlEzS7-%md{Joat0T(WD~T-hC;6a&t= zj#Oi#V&l&g|Lv6mSyEqkX8sanu#$7T_H%T4JM?H>=(Hp@LG67HJdfa=)=hNgLv}J5 zpQ)bdEQZD(pLAa6^49mDGM@isBOfn=Fds@^n9qJ$V3*cG+d6F21ngF}^X621N8kN3 z<6|W_d|HCcTUmd90vg+F`%}pzh|iIKfGz+%u!}#GP0;zVKeBe9wJ+JeOY!A()+|bY zdt7T=Q4E4lkAMd{;&6-TqrawNrOodogOGpWP>jzN^oMsfXW$IHtwk4P`{vO;I{T-y zM(x47>X4oJbHqnl4=(-o0d3%AptzbKK7zJsGmq&C7FT>MgHRR&z&9N^?9katonPCE zu4)}+EnJ_h&_oW%@wrf4jlr;qXhdP>3C?5_u?H|624MmKl)3^;8pZu 
zug>WxZfF`C3u^mmFjRkh$8v4p59;&>nF*JNiCq7eX5P z(I@U_U2z4!Wnqe?(s-%)q|$bTq4|!^s7e;maYJh)W6_nf7&ql(>KyG?xPLX`2dEBy zFC#b)7WV%+;0j9FTVn&qx%oiClr@+E;3V$3T2m5Zafg2!6iTF zIGBzUQb1p*pOI_LtBQe3(2Gg*k!O&{n?NPk8+o=J*a_&jGwOi9!}nZdC%#XN)RWO# ze@F6{P2KX%qO?b@U%1Iz6ft&<#639s)CxM&8D($iiPS z`4rnXm5kiNe6McZI7{TiY+rES)A(%zQnxTa()hgt(qXnS$U7Oofk4We!fz);a7v(y&DRt~7zy75O|tmn&+X8hls8Z!IVlSy`CR4)Ri4 z8s>?LhlK=}8ow<`Dm8wnA;=RIjN=zlbx%G+IRXhdGgifPzmOU3B69BS4)IC8#<@<) bck@HGWY%2idMme??%p8ZW3z(%VE+9-Ofn0d diff --git a/static/logo.svg b/static/logo.svg deleted file mode 100644 index ef2fbe4..0000000 --- a/static/logo.svg +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/static/styles.css b/static/styles.css deleted file mode 100644 index b5c61c9..0000000 --- a/static/styles.css +++ /dev/null @@ -1,3 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; diff --git a/tailwind.config.ts b/tailwind.config.ts deleted file mode 100644 index 0c790d0..0000000 --- a/tailwind.config.ts +++ /dev/null @@ -1,7 +0,0 @@ -import { type Config } from "tailwindcss"; - -export default { - content: [ - "{routes,islands,components}/**/*.{ts,tsx,js,jsx}", - ], -} satisfies Config; From d88ad099c43294e7a383f8a829602712606ee114 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 29 Mar 2025 18:13:57 +0800 Subject: [PATCH 77/79] ref: monorepo support --- deno.json | 60 +----------- lib/ml/mantis.ts | 22 ----- lib/mq/executors.ts | 1 - packages/backend/deno.json | 0 packages/core/deno.json | 0 {lib => packages/crawler}/db/allData.ts | 4 +- {lib => packages/crawler}/db/init.ts | 2 +- {lib => packages/crawler}/db/pgConfig.ts | 0 {lib => packages/crawler}/db/redis.ts | 0 {lib => packages/crawler}/db/schema.d.ts | 0 {lib => packages/crawler}/db/snapshot.ts | 2 +- .../crawler}/db/snapshotSchedule.ts | 6 +- {lib => packages/crawler}/db/songs.ts | 2 +- packages/crawler/deno.json | 49 ++++++++++ {lib => packages/crawler}/log/logger.ts | 0 {lib => packages/crawler}/log/test.ts | 2 +- {lib => packages/crawler}/ml/akari.ts | 6 +- {lib => packages/crawler}/ml/benchmark.ts | 0 {lib => packages/crawler}/ml/manager.ts | 4 +- .../crawler}/ml/quant_benchmark.ts | 0 .../crawler}/mq/exec/classifyVideo.ts | 18 ++-- .../crawler}/mq/exec/getLatestVideos.ts | 8 +- .../crawler}/mq/exec/snapshotTick.ts | 24 ++--- packages/crawler/mq/executors.ts | 1 + {lib => packages/crawler}/mq/index.ts | 0 {lib => packages/crawler}/mq/init.ts | 10 +- {lib => packages/crawler}/mq/lockManager.ts | 2 +- {lib => packages/crawler}/mq/rateLimiter.ts | 2 +- {lib => packages/crawler}/mq/scheduler.ts | 8 +- {lib => packages/crawler}/mq/schema.ts | 0 {lib => packages/crawler}/mq/slidingWindow.ts | 0 .../crawler}/mq/task/collectSongs.ts | 6 +- .../crawler}/mq/task/getVideoDetails.ts | 10 +- .../crawler}/mq/task/getVideoStats.ts | 6 +- .../crawler}/mq/task/queueLatestVideo.ts | 10 +- {lib => packages/crawler}/net/bilibili.d.ts | 0 .../crawler}/net/bulkGetVideoStats.ts | 6 +- .../crawler}/net/getLatestVideoAids.ts | 6 +- .../crawler}/net/getVideoDetails.ts | 6 +- {lib => packages/crawler}/net/getVideoInfo.ts | 6 +- {src => packages/crawler/src}/bullui.ts | 2 +- {src => packages/crawler/src}/filterWorker.ts | 12 +-- packages/crawler/src/jobAdder.ts | 3 + {src => packages/crawler/src}/worker.ts | 14 +-- .../crawler}/utils/formatSeconds.ts | 0 .../utils/formatTimestampToPostgre.ts | 0 {lib => packages/crawler}/utils/sleep.ts | 0 {lib => packages/crawler}/utils/truncate.ts | 0 packages/frontend/deno.json | 0 src/jobAdder.ts | 3 - test/ml/akari.json | 
22 ----- test/ml/akari.test.ts | 46 ---------- test/mq/rateLimiter.test.ts | 91 ------------------- test/mq/slidingWindow.test.ts | 84 ----------------- 54 files changed, 149 insertions(+), 417 deletions(-) delete mode 100644 lib/ml/mantis.ts delete mode 100644 lib/mq/executors.ts create mode 100644 packages/backend/deno.json create mode 100644 packages/core/deno.json rename {lib => packages/crawler}/db/allData.ts (96%) rename {lib => packages/crawler}/db/init.ts (72%) rename {lib => packages/crawler}/db/pgConfig.ts (100%) rename {lib => packages/crawler}/db/redis.ts (100%) rename {lib => packages/crawler}/db/schema.d.ts (100%) rename {lib => packages/crawler}/db/snapshot.ts (95%) rename {lib => packages/crawler}/db/snapshotSchedule.ts (98%) rename {lib => packages/crawler}/db/songs.ts (92%) create mode 100644 packages/crawler/deno.json rename {lib => packages/crawler}/log/logger.ts (100%) rename {lib => packages/crawler}/log/test.ts (89%) rename {lib => packages/crawler}/ml/akari.ts (95%) rename {lib => packages/crawler}/ml/benchmark.ts (100%) rename {lib => packages/crawler}/ml/manager.ts (92%) rename {lib => packages/crawler}/ml/quant_benchmark.ts (100%) rename {lib => packages/crawler}/mq/exec/classifyVideo.ts (79%) rename {lib => packages/crawler}/mq/exec/getLatestVideos.ts (74%) rename {lib => packages/crawler}/mq/exec/snapshotTick.ts (95%) create mode 100644 packages/crawler/mq/executors.ts rename {lib => packages/crawler}/mq/index.ts (100%) rename {lib => packages/crawler}/mq/init.ts (86%) rename {lib => packages/crawler}/mq/lockManager.ts (97%) rename {lib => packages/crawler}/mq/rateLimiter.ts (96%) rename {lib => packages/crawler}/mq/scheduler.ts (98%) rename {lib => packages/crawler}/mq/schema.ts (100%) rename {lib => packages/crawler}/mq/slidingWindow.ts (100%) rename {lib => packages/crawler}/mq/task/collectSongs.ts (83%) rename {lib => packages/crawler}/mq/task/getVideoDetails.ts (82%) rename {lib => packages/crawler}/mq/task/getVideoStats.ts (91%) rename {lib => packages/crawler}/mq/task/queueLatestVideo.ts (82%) rename {lib => packages/crawler}/net/bilibili.d.ts (100%) rename {lib => packages/crawler}/net/bulkGetVideoStats.ts (86%) rename {lib => packages/crawler}/net/getLatestVideoAids.ts (83%) rename {lib => packages/crawler}/net/getVideoDetails.ts (73%) rename {lib => packages/crawler}/net/getVideoInfo.ts (87%) rename {src => packages/crawler/src}/bullui.ts (97%) rename {src => packages/crawler/src}/filterWorker.ts (77%) create mode 100644 packages/crawler/src/jobAdder.ts rename {src => packages/crawler/src}/worker.ts (87%) rename {lib => packages/crawler}/utils/formatSeconds.ts (100%) rename {lib => packages/crawler}/utils/formatTimestampToPostgre.ts (100%) rename {lib => packages/crawler}/utils/sleep.ts (100%) rename {lib => packages/crawler}/utils/truncate.ts (100%) create mode 100644 packages/frontend/deno.json delete mode 100644 src/jobAdder.ts delete mode 100644 test/ml/akari.json delete mode 100644 test/ml/akari.test.ts delete mode 100644 test/mq/rateLimiter.test.ts delete mode 100644 test/mq/slidingWindow.test.ts diff --git a/deno.json b/deno.json index f4ff4ee..2a573a5 100644 --- a/deno.json +++ b/deno.json @@ -1,60 +1,8 @@ { - "lock": false, + "lock": false, + "workspace": ["./packages/crawler", "./packages/frontend", "./packages/backend", "./packages/core"], + "nodeModulesDir": "auto", "tasks": { - "crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts", - "crawl-bili-aids": "deno 
--allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/fetchAids.ts", - "check": "deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx", - "cli": "echo \"import '\\$fresh/src/dev/cli.ts'\" | deno run --unstable -A -", - "manifest": "deno task cli manifest $(pwd)", - "start": "deno run -A --watch=static/,routes/ dev.ts", - "build": "deno run -A dev.ts build", - "preview": "deno run -A main.ts", - "update": "deno run -A -r https://fresh.deno.dev/update .", - "worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write --allow-run ./src/worker.ts", - "worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts", - "adder": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts", - "bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts", - "all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'", - "test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run" - }, - "lint": { - "rules": { - "tags": ["fresh", "recommended"] - } - }, - "exclude": ["**/_fresh/*"], - "imports": { - "@std/assert": "jsr:@std/assert@1", - "$fresh/": "https://deno.land/x/fresh@1.7.3/", - "preact": "https://esm.sh/preact@10.22.0", - "preact/": "https://esm.sh/preact@10.22.0/", - "@preact/signals": "https://esm.sh/*@preact/signals@1.2.2", - "@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.5.1", - "tailwindcss": "npm:tailwindcss@3.4.1", - "tailwindcss/": "npm:/tailwindcss@3.4.1/", - "tailwindcss/plugin": "npm:/tailwindcss@3.4.1/plugin.js", - "$std/": "https://deno.land/std@0.216.0/", - "@huggingface/transformers": "npm:@huggingface/transformers@3.0.0", - "bullmq": "npm:bullmq", - "lib/": "./lib/", - "ioredis": "npm:ioredis", - "@bull-board/api": "npm:@bull-board/api", - "@bull-board/express": "npm:@bull-board/express", - "express": "npm:express", - "src/": "./src/", - "onnxruntime": "npm:onnxruntime-node@1.19.2", - "chalk": "npm:chalk" - }, - "compilerOptions": { - "jsx": "react-jsx", - "jsxImportSource": "preact" - }, - "nodeModulesDir": "auto", - "fmt": { - "useTabs": true, - "lineWidth": 120, - "indentWidth": 4, - "semiColons": true, - "proseWrap": "always" + "crawler": "deno task --filter 'crawler' all" } } diff --git a/lib/ml/mantis.ts b/lib/ml/mantis.ts deleted file mode 100644 index 6960be9..0000000 --- a/lib/ml/mantis.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { AIManager } from "lib/ml/manager.ts"; -import * as ort from "onnxruntime"; -import logger from "lib/log/logger.ts"; -import { WorkerError } from "lib/mq/schema.ts"; - -const modelPath = "./model/model.onnx"; - -class MantisProto extends AIManager { - constructor() { - super(); - this.models = { - "predictor": modelPath, - }; - } - - public override async init(): Promise { - await super.init(); - } -} - -const Mantis = new MantisProto(); -export default Mantis; diff --git a/lib/mq/executors.ts b/lib/mq/executors.ts deleted file mode 100644 index 85c2cc1..0000000 --- a/lib/mq/executors.ts +++ /dev/null @@ -1 +0,0 @@ -export * from "lib/mq/exec/getLatestVideos.ts"; diff --git a/packages/backend/deno.json b/packages/backend/deno.json new file mode 100644 index 0000000..e69de29 diff --git a/packages/core/deno.json b/packages/core/deno.json new file mode 100644 index 0000000..e69de29 diff --git a/lib/db/allData.ts 
b/packages/crawler/db/allData.ts similarity index 96% rename from lib/db/allData.ts rename to packages/crawler/db/allData.ts index bf92edd..461cb69 100644 --- a/lib/db/allData.ts +++ b/packages/crawler/db/allData.ts @@ -1,6 +1,6 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { AllDataType, BiliUserType } from "lib/db/schema.d.ts"; -import Akari from "lib/ml/akari.ts"; +import { AllDataType, BiliUserType } from "db/schema.d.ts"; +import Akari from "ml/akari.ts"; export async function videoExistsInAllData(client: Client, aid: number) { return await client.queryObject<{ exists: boolean }>( diff --git a/lib/db/init.ts b/packages/crawler/db/init.ts similarity index 72% rename from lib/db/init.ts rename to packages/crawler/db/init.ts index d206872..a1835b0 100644 --- a/lib/db/init.ts +++ b/packages/crawler/db/init.ts @@ -1,5 +1,5 @@ import { Pool } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { postgresConfig } from "lib/db/pgConfig.ts"; +import { postgresConfig } from "db/pgConfig.ts"; const pool = new Pool(postgresConfig, 12); diff --git a/lib/db/pgConfig.ts b/packages/crawler/db/pgConfig.ts similarity index 100% rename from lib/db/pgConfig.ts rename to packages/crawler/db/pgConfig.ts diff --git a/lib/db/redis.ts b/packages/crawler/db/redis.ts similarity index 100% rename from lib/db/redis.ts rename to packages/crawler/db/redis.ts diff --git a/lib/db/schema.d.ts b/packages/crawler/db/schema.d.ts similarity index 100% rename from lib/db/schema.d.ts rename to packages/crawler/db/schema.d.ts diff --git a/lib/db/snapshot.ts b/packages/crawler/db/snapshot.ts similarity index 95% rename from lib/db/snapshot.ts rename to packages/crawler/db/snapshot.ts index 726bfc5..ef8009d 100644 --- a/lib/db/snapshot.ts +++ b/packages/crawler/db/snapshot.ts @@ -1,5 +1,5 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { LatestSnapshotType } from "lib/db/schema.d.ts"; +import { LatestSnapshotType } from "db/schema.d.ts"; export async function getVideosNearMilestone(client: Client) { const queryResult = await client.queryObject(` diff --git a/lib/db/snapshotSchedule.ts b/packages/crawler/db/snapshotSchedule.ts similarity index 98% rename from lib/db/snapshotSchedule.ts rename to packages/crawler/db/snapshotSchedule.ts index 68228b7..b98f900 100644 --- a/lib/db/snapshotSchedule.ts +++ b/packages/crawler/db/snapshotSchedule.ts @@ -1,9 +1,9 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; +import { formatTimestampToPsql } from "utils/formatTimestampToPostgre.ts"; import { SnapshotScheduleType } from "./schema.d.ts"; -import logger from "lib/log/logger.ts"; +import logger from "log/logger.ts"; import { MINUTE } from "$std/datetime/constants.ts"; -import { redis } from "lib/db/redis.ts"; +import { redis } from "db/redis.ts"; import { Redis } from "ioredis"; const REDIS_KEY = "cvsa:snapshot_window_counts"; diff --git a/lib/db/songs.ts b/packages/crawler/db/songs.ts similarity index 92% rename from lib/db/songs.ts rename to packages/crawler/db/songs.ts index 15a49b3..1bfa002 100644 --- a/lib/db/songs.ts +++ b/packages/crawler/db/songs.ts @@ -1,5 +1,5 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; +import { parseTimestampFromPsql } from "utils/formatTimestampToPostgre.ts"; export async function getNotCollectedSongs(client: Client) { 
const queryResult = await client.queryObject<{ aid: number }>(` diff --git a/packages/crawler/deno.json b/packages/crawler/deno.json new file mode 100644 index 0000000..1d91eda --- /dev/null +++ b/packages/crawler/deno.json @@ -0,0 +1,49 @@ +{ + "name": "@cvsa/crawler", + "tasks": { + "crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts", + "crawl-bili-aids": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/fetchAids.ts", + "check": "deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx", + "manifest": "deno task cli manifest $(pwd)", + "start": "deno run -A --watch=static/,routes/ dev.ts", + "build": "deno run -A dev.ts build", + "preview": "deno run -A main.ts", + "worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write --allow-run ./src/worker.ts", + "worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts", + "adder": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts", + "bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts", + "all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'", + "test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run" + }, + "lint": { + "rules": { + "tags": ["recommended"] + } + }, + "imports": { + "@std/assert": "jsr:@std/assert@1", + "$std/": "https://deno.land/std@0.216.0/", + "@huggingface/transformers": "npm:@huggingface/transformers@3.0.0", + "bullmq": "npm:bullmq", + "mq/": "./mq/", + "db/": "./db/", + "log/": "./log/", + "net/": "./net/", + "ml/": "./ml/", + "utils/": "./utils/", + "ioredis": "npm:ioredis", + "@bull-board/api": "npm:@bull-board/api", + "@bull-board/express": "npm:@bull-board/express", + "express": "npm:express", + "src/": "./src/", + "onnxruntime": "npm:onnxruntime-node@1.19.2", + "chalk": "npm:chalk" + }, + "fmt": { + "useTabs": true, + "lineWidth": 120, + "indentWidth": 4, + "semiColons": true, + "proseWrap": "always" + } +} \ No newline at end of file diff --git a/lib/log/logger.ts b/packages/crawler/log/logger.ts similarity index 100% rename from lib/log/logger.ts rename to packages/crawler/log/logger.ts diff --git a/lib/log/test.ts b/packages/crawler/log/test.ts similarity index 89% rename from lib/log/test.ts rename to packages/crawler/log/test.ts index 71c719c..ee5953c 100644 --- a/lib/log/test.ts +++ b/packages/crawler/log/test.ts @@ -1,4 +1,4 @@ -import logger from "lib/log/logger.ts"; +import logger from "log/logger.ts"; logger.error(Error("test error"), "test service"); logger.debug(`some string`); diff --git a/lib/ml/akari.ts b/packages/crawler/ml/akari.ts similarity index 95% rename from lib/ml/akari.ts rename to packages/crawler/ml/akari.ts index d5ce9b2..ed1153e 100644 --- a/lib/ml/akari.ts +++ b/packages/crawler/ml/akari.ts @@ -1,7 +1,7 @@ -import { AIManager } from "lib/ml/manager.ts"; +import { AIManager } from "ml/manager.ts"; import * as ort from "onnxruntime"; -import logger from "lib/log/logger.ts"; -import { WorkerError } from "lib/mq/schema.ts"; +import logger from "log/logger.ts"; +import { WorkerError } from "mq/schema.ts"; import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024"; diff --git 
a/lib/ml/benchmark.ts b/packages/crawler/ml/benchmark.ts similarity index 100% rename from lib/ml/benchmark.ts rename to packages/crawler/ml/benchmark.ts diff --git a/lib/ml/manager.ts b/packages/crawler/ml/manager.ts similarity index 92% rename from lib/ml/manager.ts rename to packages/crawler/ml/manager.ts index 8230fcf..42f783e 100644 --- a/lib/ml/manager.ts +++ b/packages/crawler/ml/manager.ts @@ -1,6 +1,6 @@ import * as ort from "onnxruntime"; -import logger from "lib/log/logger.ts"; -import { WorkerError } from "lib/mq/schema.ts"; +import logger from "log/logger.ts"; +import { WorkerError } from "mq/schema.ts"; export class AIManager { public sessions: { [key: string]: ort.InferenceSession } = {}; diff --git a/lib/ml/quant_benchmark.ts b/packages/crawler/ml/quant_benchmark.ts similarity index 100% rename from lib/ml/quant_benchmark.ts rename to packages/crawler/ml/quant_benchmark.ts diff --git a/lib/mq/exec/classifyVideo.ts b/packages/crawler/mq/exec/classifyVideo.ts similarity index 79% rename from lib/mq/exec/classifyVideo.ts rename to packages/crawler/mq/exec/classifyVideo.ts index 20545a0..c813b7b 100644 --- a/lib/mq/exec/classifyVideo.ts +++ b/packages/crawler/mq/exec/classifyVideo.ts @@ -1,13 +1,13 @@ import { Job } from "bullmq"; -import { db } from "lib/db/init.ts"; -import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel } from "lib/db/allData.ts"; -import Akari from "lib/ml/akari.ts"; -import { ClassifyVideoQueue } from "lib/mq/index.ts"; -import logger from "lib/log/logger.ts"; -import { lockManager } from "lib/mq/lockManager.ts"; -import { aidExistsInSongs } from "lib/db/songs.ts"; -import { insertIntoSongs } from "lib/mq/task/collectSongs.ts"; -import { scheduleSnapshot } from "lib/db/snapshotSchedule.ts"; +import { db } from "db/init.ts"; +import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel } from "db/allData.ts"; +import Akari from "ml/akari.ts"; +import { ClassifyVideoQueue } from "mq/index.ts"; +import logger from "log/logger.ts"; +import { lockManager } from "mq/lockManager.ts"; +import { aidExistsInSongs } from "db/songs.ts"; +import { insertIntoSongs } from "mq/task/collectSongs.ts"; +import { scheduleSnapshot } from "db/snapshotSchedule.ts"; import { MINUTE } from "$std/datetime/constants.ts"; export const classifyVideoWorker = async (job: Job) => { diff --git a/lib/mq/exec/getLatestVideos.ts b/packages/crawler/mq/exec/getLatestVideos.ts similarity index 74% rename from lib/mq/exec/getLatestVideos.ts rename to packages/crawler/mq/exec/getLatestVideos.ts index 34b5d1a..7a19738 100644 --- a/lib/mq/exec/getLatestVideos.ts +++ b/packages/crawler/mq/exec/getLatestVideos.ts @@ -1,8 +1,8 @@ import { Job } from "bullmq"; -import { queueLatestVideos } from "lib/mq/task/queueLatestVideo.ts"; -import { db } from "lib/db/init.ts"; -import { insertVideoInfo } from "lib/mq/task/getVideoDetails.ts"; -import { collectSongs } from "lib/mq/task/collectSongs.ts"; +import { queueLatestVideos } from "mq/task/queueLatestVideo.ts"; +import { db } from "db/init.ts"; +import { insertVideoInfo } from "mq/task/getVideoDetails.ts"; +import { collectSongs } from "mq/task/collectSongs.ts"; export const getLatestVideosWorker = async (_job: Job): Promise => { const client = await db.connect(); diff --git a/lib/mq/exec/snapshotTick.ts b/packages/crawler/mq/exec/snapshotTick.ts similarity index 95% rename from lib/mq/exec/snapshotTick.ts rename to packages/crawler/mq/exec/snapshotTick.ts index b18d845..876e05a 100644 --- a/lib/mq/exec/snapshotTick.ts +++ 
b/packages/crawler/mq/exec/snapshotTick.ts @@ -1,6 +1,6 @@ import { Job } from "bullmq"; -import { db } from "lib/db/init.ts"; -import { getLatestVideoSnapshot, getVideosNearMilestone } from "lib/db/snapshot.ts"; +import { db } from "db/init.ts"; +import { getLatestVideoSnapshot, getVideosNearMilestone } from "db/snapshot.ts"; import { bulkGetVideosWithoutProcessingSchedules, bulkScheduleSnapshot, @@ -16,18 +16,18 @@ import { snapshotScheduleExists, videoHasProcessingSchedule, getBulkSnapshotsInNextSecond -} from "lib/db/snapshotSchedule.ts"; +} from "db/snapshotSchedule.ts"; import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { HOUR, MINUTE, SECOND, WEEK } from "$std/datetime/constants.ts"; -import logger from "lib/log/logger.ts"; -import { SnapshotQueue } from "lib/mq/index.ts"; -import { insertVideoSnapshot } from "lib/mq/task/getVideoStats.ts"; -import { NetSchedulerError } from "lib/mq/scheduler.ts"; -import { getBiliVideoStatus, setBiliVideoStatus } from "lib/db/allData.ts"; -import { truncate } from "lib/utils/truncate.ts"; -import { lockManager } from "lib/mq/lockManager.ts"; -import { getSongsPublihsedAt } from "lib/db/songs.ts"; -import { bulkGetVideoStats } from "lib/net/bulkGetVideoStats.ts"; +import logger from "log/logger.ts"; +import { SnapshotQueue } from "mq/index.ts"; +import { insertVideoSnapshot } from "mq/task/getVideoStats.ts"; +import { NetSchedulerError } from "mq/scheduler.ts"; +import { getBiliVideoStatus, setBiliVideoStatus } from "db/allData.ts"; +import { truncate } from "utils/truncate.ts"; +import { lockManager } from "mq/lockManager.ts"; +import { getSongsPublihsedAt } from "db/songs.ts"; +import { bulkGetVideoStats } from "net/bulkGetVideoStats.ts"; const priorityMap: { [key: string]: number } = { "milestone": 1, diff --git a/packages/crawler/mq/executors.ts b/packages/crawler/mq/executors.ts new file mode 100644 index 0000000..1e486e1 --- /dev/null +++ b/packages/crawler/mq/executors.ts @@ -0,0 +1 @@ +export * from "mq/exec/getLatestVideos.ts"; diff --git a/lib/mq/index.ts b/packages/crawler/mq/index.ts similarity index 100% rename from lib/mq/index.ts rename to packages/crawler/mq/index.ts diff --git a/lib/mq/init.ts b/packages/crawler/mq/init.ts similarity index 86% rename from lib/mq/init.ts rename to packages/crawler/mq/init.ts index d408f8e..4a302d1 100644 --- a/lib/mq/init.ts +++ b/packages/crawler/mq/init.ts @@ -1,9 +1,9 @@ import { MINUTE, SECOND } from "$std/datetime/constants.ts"; -import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "lib/mq/index.ts"; -import logger from "lib/log/logger.ts"; -import { initSnapshotWindowCounts } from "lib/db/snapshotSchedule.ts"; -import { db } from "lib/db/init.ts"; -import { redis } from "lib/db/redis.ts"; +import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "mq/index.ts"; +import logger from "log/logger.ts"; +import { initSnapshotWindowCounts } from "db/snapshotSchedule.ts"; +import { db } from "db/init.ts"; +import { redis } from "db/redis.ts"; export async function initMQ() { const client = await db.connect(); diff --git a/lib/mq/lockManager.ts b/packages/crawler/mq/lockManager.ts similarity index 97% rename from lib/mq/lockManager.ts rename to packages/crawler/mq/lockManager.ts index f83b148..e0c7f8a 100644 --- a/lib/mq/lockManager.ts +++ b/packages/crawler/mq/lockManager.ts @@ -1,5 +1,5 @@ import { Redis } from "ioredis"; -import { redis } from "lib/db/redis.ts"; +import { redis } from "db/redis.ts"; class LockManager { private redis: Redis; diff 
--git a/lib/mq/rateLimiter.ts b/packages/crawler/mq/rateLimiter.ts similarity index 96% rename from lib/mq/rateLimiter.ts rename to packages/crawler/mq/rateLimiter.ts index 7f62547..aba7c3e 100644 --- a/lib/mq/rateLimiter.ts +++ b/packages/crawler/mq/rateLimiter.ts @@ -1,4 +1,4 @@ -import { SlidingWindow } from "lib/mq/slidingWindow.ts"; +import { SlidingWindow } from "mq/slidingWindow.ts"; export interface RateLimiterConfig { window: SlidingWindow; diff --git a/lib/mq/scheduler.ts b/packages/crawler/mq/scheduler.ts similarity index 98% rename from lib/mq/scheduler.ts rename to packages/crawler/mq/scheduler.ts index 6722519..0e8c036 100644 --- a/lib/mq/scheduler.ts +++ b/packages/crawler/mq/scheduler.ts @@ -1,7 +1,7 @@ -import logger from "lib/log/logger.ts"; -import { RateLimiter, RateLimiterConfig } from "lib/mq/rateLimiter.ts"; -import { SlidingWindow } from "lib/mq/slidingWindow.ts"; -import { redis } from "lib/db/redis.ts"; +import logger from "log/logger.ts"; +import { RateLimiter, RateLimiterConfig } from "mq/rateLimiter.ts"; +import { SlidingWindow } from "mq/slidingWindow.ts"; +import { redis } from "db/redis.ts"; import Redis from "ioredis"; import { SECOND } from "$std/datetime/constants.ts"; diff --git a/lib/mq/schema.ts b/packages/crawler/mq/schema.ts similarity index 100% rename from lib/mq/schema.ts rename to packages/crawler/mq/schema.ts diff --git a/lib/mq/slidingWindow.ts b/packages/crawler/mq/slidingWindow.ts similarity index 100% rename from lib/mq/slidingWindow.ts rename to packages/crawler/mq/slidingWindow.ts diff --git a/lib/mq/task/collectSongs.ts b/packages/crawler/mq/task/collectSongs.ts similarity index 83% rename from lib/mq/task/collectSongs.ts rename to packages/crawler/mq/task/collectSongs.ts index b71aa3b..389ca06 100644 --- a/lib/mq/task/collectSongs.ts +++ b/packages/crawler/mq/task/collectSongs.ts @@ -1,7 +1,7 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { aidExistsInSongs, getNotCollectedSongs } from "lib/db/songs.ts"; -import logger from "lib/log/logger.ts"; -import { scheduleSnapshot } from "lib/db/snapshotSchedule.ts"; +import { aidExistsInSongs, getNotCollectedSongs } from "db/songs.ts"; +import logger from "log/logger.ts"; +import { scheduleSnapshot } from "db/snapshotSchedule.ts"; import { MINUTE } from "$std/datetime/constants.ts"; export async function collectSongs(client: Client) { diff --git a/lib/mq/task/getVideoDetails.ts b/packages/crawler/mq/task/getVideoDetails.ts similarity index 82% rename from lib/mq/task/getVideoDetails.ts rename to packages/crawler/mq/task/getVideoDetails.ts index ea5f903..9b675e5 100644 --- a/lib/mq/task/getVideoDetails.ts +++ b/packages/crawler/mq/task/getVideoDetails.ts @@ -1,9 +1,9 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { getVideoDetails } from "lib/net/getVideoDetails.ts"; -import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; -import logger from "lib/log/logger.ts"; -import { ClassifyVideoQueue } from "lib/mq/index.ts"; -import { userExistsInBiliUsers, videoExistsInAllData } from "lib/db/allData.ts"; +import { getVideoDetails } from "net/getVideoDetails.ts"; +import { formatTimestampToPsql } from "utils/formatTimestampToPostgre.ts"; +import logger from "log/logger.ts"; +import { ClassifyVideoQueue } from "mq/index.ts"; +import { userExistsInBiliUsers, videoExistsInAllData } from "db/allData.ts"; import { HOUR, SECOND } from "$std/datetime/constants.ts"; export async function insertVideoInfo(client: Client, aid: 
number) { diff --git a/lib/mq/task/getVideoStats.ts b/packages/crawler/mq/task/getVideoStats.ts similarity index 91% rename from lib/mq/task/getVideoStats.ts rename to packages/crawler/mq/task/getVideoStats.ts index 3be1cd7..34b6c42 100644 --- a/lib/mq/task/getVideoStats.ts +++ b/packages/crawler/mq/task/getVideoStats.ts @@ -1,7 +1,7 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { getVideoInfo } from "lib/net/getVideoInfo.ts"; -import { LatestSnapshotType } from "lib/db/schema.d.ts"; -import logger from "lib/log/logger.ts"; +import { getVideoInfo } from "net/getVideoInfo.ts"; +import { LatestSnapshotType } from "db/schema.d.ts"; +import logger from "log/logger.ts"; /* * Fetch video stats from bilibili API and insert into database diff --git a/lib/mq/task/queueLatestVideo.ts b/packages/crawler/mq/task/queueLatestVideo.ts similarity index 82% rename from lib/mq/task/queueLatestVideo.ts rename to packages/crawler/mq/task/queueLatestVideo.ts index d2e938b..d8b3993 100644 --- a/lib/mq/task/queueLatestVideo.ts +++ b/packages/crawler/mq/task/queueLatestVideo.ts @@ -1,10 +1,10 @@ import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { getLatestVideoAids } from "lib/net/getLatestVideoAids.ts"; -import { videoExistsInAllData } from "lib/db/allData.ts"; -import { sleep } from "lib/utils/sleep.ts"; +import { getLatestVideoAids } from "net/getLatestVideoAids.ts"; +import { videoExistsInAllData } from "db/allData.ts"; +import { sleep } from "utils/sleep.ts"; import { SECOND } from "$std/datetime/constants.ts"; -import logger from "lib/log/logger.ts"; -import { LatestVideosQueue } from "lib/mq/index.ts"; +import logger from "log/logger.ts"; +import { LatestVideosQueue } from "mq/index.ts"; export async function queueLatestVideos( client: Client, diff --git a/lib/net/bilibili.d.ts b/packages/crawler/net/bilibili.d.ts similarity index 100% rename from lib/net/bilibili.d.ts rename to packages/crawler/net/bilibili.d.ts diff --git a/lib/net/bulkGetVideoStats.ts b/packages/crawler/net/bulkGetVideoStats.ts similarity index 86% rename from lib/net/bulkGetVideoStats.ts rename to packages/crawler/net/bulkGetVideoStats.ts index 7240bed..2b0c7f2 100644 --- a/lib/net/bulkGetVideoStats.ts +++ b/packages/crawler/net/bulkGetVideoStats.ts @@ -1,6 +1,6 @@ -import netScheduler from "lib/mq/scheduler.ts"; -import { MediaListInfoData, MediaListInfoResponse } from "lib/net/bilibili.d.ts"; -import logger from "lib/log/logger.ts"; +import netScheduler from "mq/scheduler.ts"; +import { MediaListInfoData, MediaListInfoResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; /* * Bulk fetch video metadata from bilibili API diff --git a/lib/net/getLatestVideoAids.ts b/packages/crawler/net/getLatestVideoAids.ts similarity index 83% rename from lib/net/getLatestVideoAids.ts rename to packages/crawler/net/getLatestVideoAids.ts index 2fb44be..7dacd46 100644 --- a/lib/net/getLatestVideoAids.ts +++ b/packages/crawler/net/getLatestVideoAids.ts @@ -1,6 +1,6 @@ -import { VideoListResponse } from "lib/net/bilibili.d.ts"; -import logger from "lib/log/logger.ts"; -import netScheduler from "lib/mq/scheduler.ts"; +import { VideoListResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; +import netScheduler from "mq/scheduler.ts"; export async function getLatestVideoAids(page: number = 1, pageSize: number = 10): Promise { const startFrom = 1 + pageSize * (page - 1); diff --git a/lib/net/getVideoDetails.ts b/packages/crawler/net/getVideoDetails.ts 
similarity index 73% rename from lib/net/getVideoDetails.ts rename to packages/crawler/net/getVideoDetails.ts index 9e421cf..d6d52c1 100644 --- a/lib/net/getVideoDetails.ts +++ b/packages/crawler/net/getVideoDetails.ts @@ -1,6 +1,6 @@ -import netScheduler from "lib/mq/scheduler.ts"; -import { VideoDetailsData, VideoDetailsResponse } from "lib/net/bilibili.d.ts"; -import logger from "lib/log/logger.ts"; +import netScheduler from "mq/scheduler.ts"; +import { VideoDetailsData, VideoDetailsResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; export async function getVideoDetails(aid: number): Promise { const url = `https://api.bilibili.com/x/web-interface/view/detail?aid=${aid}`; diff --git a/lib/net/getVideoInfo.ts b/packages/crawler/net/getVideoInfo.ts similarity index 87% rename from lib/net/getVideoInfo.ts rename to packages/crawler/net/getVideoInfo.ts index 897fc62..0533c53 100644 --- a/lib/net/getVideoInfo.ts +++ b/packages/crawler/net/getVideoInfo.ts @@ -1,6 +1,6 @@ -import netScheduler from "lib/mq/scheduler.ts"; -import { VideoInfoData, VideoInfoResponse } from "lib/net/bilibili.d.ts"; -import logger from "lib/log/logger.ts"; +import netScheduler from "mq/scheduler.ts"; +import { VideoInfoData, VideoInfoResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; /* * Fetch video metadata from bilibili API diff --git a/src/bullui.ts b/packages/crawler/src/bullui.ts similarity index 97% rename from src/bullui.ts rename to packages/crawler/src/bullui.ts index acf8d3f..5765540 100644 --- a/src/bullui.ts +++ b/packages/crawler/src/bullui.ts @@ -2,7 +2,7 @@ import express from "express"; import { createBullBoard } from "@bull-board/api"; import { BullMQAdapter } from "@bull-board/api/bullMQAdapter.js"; import { ExpressAdapter } from "@bull-board/express"; -import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "lib/mq/index.ts"; +import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "mq/index.ts"; const serverAdapter = new ExpressAdapter(); serverAdapter.setBasePath("/"); diff --git a/src/filterWorker.ts b/packages/crawler/src/filterWorker.ts similarity index 77% rename from src/filterWorker.ts rename to packages/crawler/src/filterWorker.ts index b14ef07..cb336c4 100644 --- a/src/filterWorker.ts +++ b/packages/crawler/src/filterWorker.ts @@ -1,10 +1,10 @@ import { ConnectionOptions, Job, Worker } from "bullmq"; -import { redis } from "lib/db/redis.ts"; -import logger from "lib/log/logger.ts"; -import { classifyVideosWorker, classifyVideoWorker } from "lib/mq/exec/classifyVideo.ts"; -import { WorkerError } from "lib/mq/schema.ts"; -import { lockManager } from "lib/mq/lockManager.ts"; -import Akari from "lib/ml/akari.ts"; +import { redis } from "db/redis.ts"; +import logger from "log/logger.ts"; +import { classifyVideosWorker, classifyVideoWorker } from "mq/exec/classifyVideo.ts"; +import { WorkerError } from "mq/schema.ts"; +import { lockManager } from "mq/lockManager.ts"; +import Akari from "ml/akari.ts"; Deno.addSignalListener("SIGINT", async () => { logger.log("SIGINT Received: Shutting down workers...", "mq"); diff --git a/packages/crawler/src/jobAdder.ts b/packages/crawler/src/jobAdder.ts new file mode 100644 index 0000000..3aefd24 --- /dev/null +++ b/packages/crawler/src/jobAdder.ts @@ -0,0 +1,3 @@ +import { initMQ } from "mq/init.ts"; + +await initMQ(); diff --git a/src/worker.ts b/packages/crawler/src/worker.ts similarity index 87% rename from src/worker.ts rename to packages/crawler/src/worker.ts index fae7b6a..e240a0b 
100644 --- a/src/worker.ts +++ b/packages/crawler/src/worker.ts @@ -1,10 +1,10 @@ import { ConnectionOptions, Job, Worker } from "bullmq"; -import { collectSongsWorker, getLatestVideosWorker } from "lib/mq/executors.ts"; -import { redis } from "lib/db/redis.ts"; -import logger from "lib/log/logger.ts"; -import { lockManager } from "lib/mq/lockManager.ts"; -import { WorkerError } from "lib/mq/schema.ts"; -import { getVideoInfoWorker } from "lib/mq/exec/getLatestVideos.ts"; +import { collectSongsWorker, getLatestVideosWorker } from "mq/executors.ts"; +import { redis } from "db/redis.ts"; +import logger from "log/logger.ts"; +import { lockManager } from "mq/lockManager.ts"; +import { WorkerError } from "mq/schema.ts"; +import { getVideoInfoWorker } from "mq/exec/getLatestVideos.ts"; import { collectMilestoneSnapshotsWorker, regularSnapshotsWorker, @@ -13,7 +13,7 @@ import { scheduleCleanupWorker, takeBulkSnapshotForVideosWorker, bulkSnapshotTickWorker -} from "lib/mq/exec/snapshotTick.ts"; +} from "mq/exec/snapshotTick.ts"; Deno.addSignalListener("SIGINT", async () => { logger.log("SIGINT Received: Shutting down workers...", "mq"); diff --git a/lib/utils/formatSeconds.ts b/packages/crawler/utils/formatSeconds.ts similarity index 100% rename from lib/utils/formatSeconds.ts rename to packages/crawler/utils/formatSeconds.ts diff --git a/lib/utils/formatTimestampToPostgre.ts b/packages/crawler/utils/formatTimestampToPostgre.ts similarity index 100% rename from lib/utils/formatTimestampToPostgre.ts rename to packages/crawler/utils/formatTimestampToPostgre.ts diff --git a/lib/utils/sleep.ts b/packages/crawler/utils/sleep.ts similarity index 100% rename from lib/utils/sleep.ts rename to packages/crawler/utils/sleep.ts diff --git a/lib/utils/truncate.ts b/packages/crawler/utils/truncate.ts similarity index 100% rename from lib/utils/truncate.ts rename to packages/crawler/utils/truncate.ts diff --git a/packages/frontend/deno.json b/packages/frontend/deno.json new file mode 100644 index 0000000..e69de29 diff --git a/src/jobAdder.ts b/src/jobAdder.ts deleted file mode 100644 index cb107f4..0000000 --- a/src/jobAdder.ts +++ /dev/null @@ -1,3 +0,0 @@ -import { initMQ } from "lib/mq/init.ts"; - -await initMQ(); diff --git a/test/ml/akari.json b/test/ml/akari.json deleted file mode 100644 index 9de1219..0000000 --- a/test/ml/akari.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "test1": [ - { - "title": "【洛天依】《一花依世界》(2024重调版)|“抬头仰望,夜空多安详”【原创PV付】", - "desc": "本家:BV1Vs411H7JH\n作曲:LS\n作词:杏花包子\n调教:鬼面P\n混音:虎皮猫P\n演唱:洛天依\n曲绘:山下鸭鸭窝\n映像:阿妍\n——————————————————————\n本稿为同人二创,非本家重制", - "tags": "发现《一花依世界》, Vsinger创作激励计划, 洛天依, VOCALOID CHINA, 翻唱, 原创PV付, ACE虚拟歌姬, 中文VOCALOID, 国风电子, 一花依世界, ACE Studio, Vsinger创作激励计划2024冬季物语", - "label": 2 - }, - { - "title": "【鏡音レン】アカシア【VOCALOID Cover】", - "desc": "鏡音リン・レン 13th Anniversary\n\nMusic:BUMP OF CHICKEN https://youtu.be/BoZ0Zwab6Oc\nust:Maplestyle sm37853236\nOff Vocal: https://youtu.be/YMzrUzq1uX0\nSinger:鏡音レン\n\n氷雨ハルカ\nYoutube :https://t.co/8zuv6g7Acm\nniconico:https://t.co/C6DRfdYAp0\ntwitter :https://twitter.com/hisame_haruka\n\n転載禁止\nPlease do not reprint without my permission.", - "tags": "鏡音レン", - "label": 0 - }, - { - "title": "【洛天依原创曲】谪星【姆斯塔之谕】", - "desc": "谪星\n\n策划/世界观:听雨\n作词:听雨\n作曲/编曲:太白\n混音:虎皮猫\n人设:以木\n曲绘:Ar极光\n调校:哈士奇p\n视频:苏卿白", - "tags": "2025虚拟歌手贺岁纪, 洛天依, 原创歌曲, VOCALOID, 虚拟歌手, 原创音乐, 姆斯塔, 中文VOCALOID", - "label": 1 - } - ] -} diff --git a/test/ml/akari.test.ts b/test/ml/akari.test.ts deleted file mode 100644 index f254a01..0000000 --- a/test/ml/akari.test.ts +++ /dev/null @@ 
-1,46 +0,0 @@ -import Akari from "lib/ml/akari.ts"; -import { assertEquals, assertGreaterOrEqual } from "jsr:@std/assert"; -import { join } from "$std/path/join.ts"; -import { SECOND } from "$std/datetime/constants.ts"; - -Deno.test("Akari AI - normal cases accuracy test", async () => { - const path = import.meta.dirname!; - const dataPath = join(path, "akari.json"); - const rawData = await Deno.readTextFile(dataPath); - const data = JSON.parse(rawData); - await Akari.init(); - for (const testCase of data.test1) { - const result = await Akari.classifyVideo( - testCase.title, - testCase.desc, - testCase.tags, - ); - assertEquals(result, testCase.label); - } -}); - -Deno.test("Akari AI - performance test", async () => { - const path = import.meta.dirname!; - const dataPath = join(path, "akari.json"); - const rawData = await Deno.readTextFile(dataPath); - const data = JSON.parse(rawData); - await Akari.init(); - const N = 200; - const testCase = data.test1[0]; - const title = testCase.title; - const desc = testCase.desc; - const tags = testCase.tags; - const time = performance.now(); - for (let i = 0; i < N; i++) { - await Akari.classifyVideo( - title, - desc, - tags, - ); - } - const end = performance.now(); - const elapsed = (end - time) / SECOND; - const throughput = N / elapsed; - assertGreaterOrEqual(throughput, 100); - console.log(`Akari AI throughput: ${throughput.toFixed(1)} samples / sec`); -}); diff --git a/test/mq/rateLimiter.test.ts b/test/mq/rateLimiter.test.ts deleted file mode 100644 index 2f19723..0000000 --- a/test/mq/rateLimiter.test.ts +++ /dev/null @@ -1,91 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { SlidingWindow } from "lib/mq/slidingWindow.ts"; -import { RateLimiter, RateLimiterConfig } from "lib/mq/rateLimiter.ts"; -import { Redis } from "npm:ioredis@5.5.0"; - -Deno.test("RateLimiter works correctly", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5; - const maxRequests = 10; - - const slidingWindow = new SlidingWindow(redis, windowSize); - const config: RateLimiterConfig = { - window: slidingWindow, - max: maxRequests, - }; - const rateLimiter = new RateLimiter("test_event", [config]); - await rateLimiter.clear(); - - // Initial availability should be true - assertEquals(await rateLimiter.getAvailability(), true); - - // Trigger events up to the limit - for (let i = 0; i < maxRequests; i++) { - await rateLimiter.trigger(); - } - - // Availability should now be false - assertEquals(await rateLimiter.getAvailability(), false); - - // Wait for the window to slide - await new Promise((resolve) => setTimeout(resolve, windowSize * 1000 + 500)); - - // Availability should be true again - assertEquals(await rateLimiter.getAvailability(), true); - - redis.quit(); -}); - -Deno.test("Multiple configs work correctly", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize1 = 1; - const maxRequests1 = 2; - const windowSize2 = 5; - const maxRequests2 = 6; - - const slidingWindow1 = new SlidingWindow(redis, windowSize1); - const config1: RateLimiterConfig = { - window: slidingWindow1, - max: maxRequests1, - }; - const slidingWindow2 = new SlidingWindow(redis, windowSize2); - const config2: RateLimiterConfig = { - window: slidingWindow2, - max: maxRequests2, - }; - const rateLimiter = new RateLimiter("test_event_multi", [config1, config2]); - await rateLimiter.clear(); - - // Initial availability should be true - assertEquals(await rateLimiter.getAvailability(), true); - - // 
Trigger events up to the limit of the first config - for (let i = 0; i < maxRequests1; i++) { - await rateLimiter.trigger(); - } - - // Availability should now be false (due to config1) - assertEquals(await rateLimiter.getAvailability(), false); - - // Wait for the first window to slide - await new Promise((resolve) => setTimeout(resolve, windowSize1 * 1000 + 500)); - - // Availability should now be true (due to config1) - assertEquals(await rateLimiter.getAvailability(), true); - - // Trigger events up to the limit of the second config - for (let i = maxRequests1; i < maxRequests2; i++) { - await rateLimiter.trigger(); - } - - // Availability should still be false (due to config2) - assertEquals(await rateLimiter.getAvailability(), false); - - // Wait for the second window to slide - await new Promise((resolve) => setTimeout(resolve, windowSize2 * 1000 + 500)); - - // Availability should be true again - assertEquals(await rateLimiter.getAvailability(), true); - - redis.quit(); -}); diff --git a/test/mq/slidingWindow.test.ts b/test/mq/slidingWindow.test.ts deleted file mode 100644 index a749edc..0000000 --- a/test/mq/slidingWindow.test.ts +++ /dev/null @@ -1,84 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { SlidingWindow } from "lib/mq/slidingWindow.ts"; -import { Redis } from "ioredis"; - -Deno.test("SlidingWindow - event and count", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - - await slidingWindow.event(eventName); - const count = await slidingWindow.count(eventName); - - assertEquals(count, 1); - redis.quit(); -}); - -Deno.test("SlidingWindow - multiple events", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - - await slidingWindow.event(eventName); - await slidingWindow.event(eventName); - await slidingWindow.event(eventName); - const count = await slidingWindow.count(eventName); - - assertEquals(count, 3); - redis.quit(); -}); - -Deno.test("SlidingWindow - no events", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - - const count = await slidingWindow.count(eventName); - - assertEquals(count, 0); - redis.quit(); -}); - -Deno.test("SlidingWindow - different event names", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName1 = "test_event_1"; - const eventName2 = "test_event_2"; - await slidingWindow.clear(eventName1); - await slidingWindow.clear(eventName2); - - await slidingWindow.event(eventName1); - await slidingWindow.event(eventName2); - - const count1 = await slidingWindow.count(eventName1); - const count2 = await slidingWindow.count(eventName2); - - assertEquals(count1, 1); - assertEquals(count2, 1); - redis.quit(); -}); - -Deno.test("SlidingWindow - large number of events", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new 
SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - const numEvents = 1000; - - for (let i = 0; i < numEvents; i++) { - await slidingWindow.event(eventName); - } - - const count = await slidingWindow.count(eventName); - - assertEquals(count, numEvents); - redis.quit(); -}); From 879a6604e5932ce3d9e1dec37a3c905095a03d2b Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 29 Mar 2025 18:43:47 +0800 Subject: [PATCH 78/79] fix: missing export in deno.json --- .gitignore | 1 + packages/crawler/deno.json | 3 ++- packages/crawler/main.ts | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 packages/crawler/main.ts diff --git a/.gitignore b/.gitignore index 58df6d2..710d3e4 100644 --- a/.gitignore +++ b/.gitignore @@ -80,6 +80,7 @@ __pycache__ ml/filter/runs ml/pred/runs ml/pred/checkpoints +ml/pred/observed ml/data/ ml/filter/checkpoints scripts diff --git a/packages/crawler/deno.json b/packages/crawler/deno.json index 1d91eda..4f95bb9 100644 --- a/packages/crawler/deno.json +++ b/packages/crawler/deno.json @@ -45,5 +45,6 @@ "indentWidth": 4, "semiColons": true, "proseWrap": "always" - } + }, + "exports": "./main.ts" } \ No newline at end of file diff --git a/packages/crawler/main.ts b/packages/crawler/main.ts new file mode 100644 index 0000000..8280e6d --- /dev/null +++ b/packages/crawler/main.ts @@ -0,0 +1,7 @@ +// DENO ASK ME TO EXPORT SOMETHING WHEN 'name' IS SPECIFIED +// AND IF I DON'T SPECIFY 'name', THE --filter FLAG IN `deno task` WON'T WORK. +// I DONT'T KNOW WHY +// SO HERE'S A PLACHOLDER EXPORT FOR DENO: +export const DENO = "FUCK YOU DENO"; +// Oh, maybe export the version is a good idea +export const VERSION = "1.0.12"; \ No newline at end of file From be3ff00edcb2bfea7a03541a819cc796e4b006c3 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sat, 29 Mar 2025 18:52:44 +0800 Subject: [PATCH 79/79] fix: incorrect path for model file --- packages/crawler/main.ts | 2 +- packages/crawler/ml/akari.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/crawler/main.ts b/packages/crawler/main.ts index 8280e6d..3cdc0f4 100644 --- a/packages/crawler/main.ts +++ b/packages/crawler/main.ts @@ -4,4 +4,4 @@ // SO HERE'S A PLACHOLDER EXPORT FOR DENO: export const DENO = "FUCK YOU DENO"; // Oh, maybe export the version is a good idea -export const VERSION = "1.0.12"; \ No newline at end of file +export const VERSION = "1.0.13"; \ No newline at end of file diff --git a/packages/crawler/ml/akari.ts b/packages/crawler/ml/akari.ts index ed1153e..69a7a5d 100644 --- a/packages/crawler/ml/akari.ts +++ b/packages/crawler/ml/akari.ts @@ -5,8 +5,8 @@ import { WorkerError } from "mq/schema.ts"; import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024"; -const onnxClassifierPath = "./model/akari/3.17.onnx"; -const onnxEmbeddingPath = "./model/embedding/model.onnx"; +const onnxClassifierPath = "../../model/akari/3.17.onnx"; +const onnxEmbeddingPath = "../../model/embedding/model.onnx"; class AkariProto extends AIManager { private tokenizer: PreTrainedTokenizer | null = null;
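
Note on PATCH 78/79: the placeholder module ties together two requirements hinted at in its own comments. The root-level "crawler" task relies on `deno task --filter 'crawler' all`, which can only address the package once "name" is set in packages/crawler/deno.json, and with "name" set Deno in turn expects an "exports" entry, hence main.ts. Condensed to just the fields involved (values copied from the patches above, everything else omitted), the member config after PATCH 78/79 has this shape:

{
	"name": "@cvsa/crawler",
	"exports": "./main.ts",
	"tasks": {
		"all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'"
	}
}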
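
Note on PATCH 79/79: the ONNX model paths are rewritten as "../../model/…", which matches the new layout only when the process working directory is packages/crawler (where the package-level deno tasks run by default); the model/ directory itself is assumed to stay at the repository root. A variant that does not depend on the working directory would resolve the paths against the module file instead. A minimal sketch, not part of the patch series, reusing helpers already present in the repository:

import { join } from "$std/path/join.ts";

// Resolve relative to packages/crawler/ml/akari.ts rather than Deno.cwd(),
// so the model files are found no matter where the crawler is launched from.
// The layout (model/ at the repository root) is an assumption taken from the patch.
const modelRoot = join(import.meta.dirname!, "../../../model");
const onnxClassifierPath = join(modelRoot, "akari/3.17.onnx");
const onnxEmbeddingPath = join(modelRoot, "embedding/model.onnx");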