diff --git a/.gitignore b/.gitignore index e3b07d1..710d3e4 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ internal/ !tests/cases/projects/projectOption/**/node_modules !tests/cases/projects/NodeModulesSearch/**/* !tests/baselines/reference/project/nodeModules*/**/* -.idea yarn.lock yarn-error.log .parallelperf.* @@ -76,14 +75,13 @@ node_modules/ # project specific -data/main.db -.env logs/ __pycache__ -filter/runs -data/filter/eval* -data/filter/train* -filter/checkpoints -data/filter/model_predicted* +ml/filter/runs +ml/pred/runs +ml/pred/checkpoints +ml/pred/observed +ml/data/ +ml/filter/checkpoints scripts model/ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..518076d --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,9 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +dataSources.xml \ No newline at end of file diff --git a/.idea/cvsa.iml b/.idea/cvsa.iml new file mode 100644 index 0000000..c155925 --- /dev/null +++ b/.idea/cvsa.iml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..5535e8f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,12 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..4552e71 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml new file mode 100644 index 0000000..6df4889 --- /dev/null +++ b/.idea/sqldialects.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.tokeignore b/.tokeignore index 0c4d337..aafc28c 100644 --- a/.tokeignore +++ b/.tokeignore @@ -2,4 +2,5 @@ data *.json *.svg *.txt -*.md \ No newline at end of file +*.md +*config* \ No newline at end of file diff --git a/.zed/settings.json b/.zed/settings.json new file mode 100644 index 0000000..a58d028 --- /dev/null +++ b/.zed/settings.json @@ -0,0 +1,35 @@ +// Folder-specific settings +// +// For a full list of overridable settings, and general information on folder-specific settings, +// see the documentation: https://zed.dev/docs/configuring-zed#settings-files +{ + "lsp": { + "deno": { + "settings": { + "deno": { + "enable": true + } + } + } + }, + "languages": { + "TypeScript": { + "language_servers": [ + "deno", + "!typescript-language-server", + "!vtsls", + "!eslint" + ], + "formatter": "language_server" + }, + "TSX": { + "language_servers": [ + "deno", + "!typescript-language-server", + "!vtsls", + "!eslint" + ], + "formatter": "language_server" + } + } +} diff --git a/README.md b/README.md index 9033ec6..6a46cde 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,12 @@ 纵观整个互联网,对于「中文歌声合成」或「中文虚拟歌手」(常简称为中V或VC)相关信息进行较为系统、全面地整理收集的主要有以下几个网站: -- [萌娘百科](https://zh.moegirl.org.cn/): 收录了大量中V歌曲及歌姬的信息,呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 -- [VCPedia](https://vcpedia.cn/): 由原萌娘百科中文歌声合成编辑团队的部分成员搭建,专属于中文歌声合成相关内容的信息集成站点[^1],呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 -- [VocaDB](https://vocadb.net/): 一个围绕 Vocaloid、UTAU 
和其他歌声合成器的协作数据库,其中包含艺术家、唱片、PV 等[^2],其中包含大量中文歌声合成作品。 +- [萌娘百科](https://zh.moegirl.org.cn/): + 收录了大量中V歌曲及歌姬的信息,呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VCPedia](https://vcpedia.cn/): + 由原萌娘百科中文歌声合成编辑团队的部分成员搭建,专属于中文歌声合成相关内容的信息集成站点[^1],呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VocaDB](https://vocadb.net/): 一个围绕 Vocaloid、UTAU 和其他歌声合成器的协作数据库,其中包含艺术家、唱片、PV + 等[^2],其中包含大量中文歌声合成作品。 - [天钿Daily](https://tdd.bunnyxt.com/):一个VC相关数据交流与分享的网站。致力于VC相关数据交流,定期抓取VC相关数据,选取有意义的纬度展示。[^3] 上述网站中,或多或少存在一些不足,例如: @@ -36,19 +39,22 @@ ### 数据库 -中V档案馆使用[PostgreSQL](https://postgresql.org)作为数据库,我们承诺定期导出数据库转储 (dump) 文件并公开,其内容遵从以下协议或条款: +中V档案馆使用[PostgreSQL](https://postgresql.org)作为数据库,我们承诺定期导出数据库转储 (dump) +文件并公开,其内容遵从以下协议或条款: - 数据库中的事实性数据,根据适用法律,不构成受版权保护的内容。中V档案馆放弃一切可能的权利([CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/))。 - 对于数据库中有原创性的内容(如贡献者编辑的描述性内容),如无例外,以[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)提供。 -- 对于引用、摘编或改编自萌娘百科、VCPedia的内容,以与原始协议(CC BY-NC-SA 3.0 CN)兼容的协议[CC BY-NC-SA 4.0协议](https://creativecommons.org/licenses/by-nc-sa/4.0/)提供,并注明原始协议 。 - > 根据原始协议第四条第2项内容,CC BY-NC-SA 4.0协议为与原始协议具有相同授权要素的后续版本(“可适用的协议”)。 +- 对于引用、摘编或改编自萌娘百科、VCPedia的内容,以与原始协议(CC BY-NC-SA 3.0 + CN)兼容的协议[CC BY-NC-SA 4.0协议](https://creativecommons.org/licenses/by-nc-sa/4.0/)提供,并注明原始协议 。 + > 根据原始协议第四条第2项内容,CC BY-NC-SA 4.0协议为与原始协议具有相同授权要素的后续版本(“可适用的协议”)。 - 中V档案馆文档使用[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)。 ### 软件代码 用于构建中V档案馆的软件代码在[AGPL 3.0](https://www.gnu.org/licenses/agpl-3.0.html)许可证下公开,参见[LICENSE](./LICENSE) - [^1]: 引用自[VCPedia](https://vcpedia.cn/%E9%A6%96%E9%A1%B5),于[知识共享 署名-非商业性使用-相同方式共享 3.0中国大陆 (CC BY-NC-SA 3.0 CN) 许可协议](https://creativecommons.org/licenses/by-nc-sa/3.0/cn/)下提供。 + [^2]: 翻译自[VocaDB](https://vocadb.net/),于[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)下提供。 -[^3]: 引用自[关于 - 天钿Daily](https://tdd.bunnyxt.com/about) \ No newline at end of file + +[^3]: 引用自[关于 - 天钿Daily](https://tdd.bunnyxt.com/about) diff --git a/components/Button.tsx b/components/Button.tsx deleted file mode 100644 index 6e868c5..0000000 --- a/components/Button.tsx +++ /dev/null @@ -1,12 +0,0 @@ -import { JSX } from "preact"; -import { IS_BROWSER } from "$fresh/runtime.ts"; - -export function Button(props: JSX.HTMLAttributes) { - return ( - -
{props.count}
- - - ); -} diff --git a/lib/db/allData.ts b/lib/db/allData.ts deleted file mode 100644 index 92c225b..0000000 --- a/lib/db/allData.ts +++ /dev/null @@ -1,61 +0,0 @@ -import { Client, Transaction } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { AllDataType } from "lib/db/schema.d.ts"; -import logger from "lib/log/logger.ts"; -import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; - -export async function videoExistsInAllData(client: Client, aid: number) { - return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM all_data WHERE aid = $1)`, [aid]) - .then((result) => result.rows[0].exists); -} - -export async function insertIntoAllData(client: Client, data: AllDataType) { - logger.log(`inserted ${data.aid}`, "db-all_data"); - return await client.queryObject( - `INSERT INTO all_data (aid, bvid, description, uid, tags, title, published_at) - VALUES ($1, $2, $3, $4, $5, $6, $7) - ON CONFLICT (aid) DO NOTHING`, - [data.aid, data.bvid, data.description, data.uid, data.tags, data.title, data.published_at], - ); -} - -export async function getLatestVideoTimestampFromAllData(client: Client) { - return await client.queryObject<{ published_at: string }>( - `SELECT published_at FROM all_data ORDER BY published_at DESC LIMIT 1`, - ) - .then((result) => { - const date = new Date(result.rows[0].published_at); - if (isNaN(date.getTime())) { - return null; - } - return date.getTime(); - }); -} - -export async function videoTagsIsNull(client: Client | Transaction, aid: number) { - return await client.queryObject<{ exists: boolean }>( - `SELECT EXISTS(SELECT 1 FROM all_data WHERE aid = $1 AND tags IS NULL)`, - [aid], - ).then((result) => result.rows[0].exists); -} - -export async function updateVideoTags(client: Client | Transaction, aid: number, tags: string[]) { - return await client.queryObject( - `UPDATE all_data SET tags = $1 WHERE aid = $2`, - [tags.join(","), aid], - ); -} - -export async function getNullVideoTagsList(client: Client) { - const queryResult = await client.queryObject<{ aid: number; published_at: string }>( - `SELECT aid, published_at FROM all_data WHERE tags IS NULL`, - ); - const rows = queryResult.rows; - return rows.map( - (row) => { - return { - aid: Number(row.aid), - published_at: parseTimestampFromPsql(row.published_at), - }; - }, - ); -} diff --git a/lib/db/init.ts b/lib/db/init.ts deleted file mode 100644 index ed4667d..0000000 --- a/lib/db/init.ts +++ /dev/null @@ -1,6 +0,0 @@ -import { Pool } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import {postgresConfig} from "lib/db/pgConfig.ts"; - -const pool = new Pool(postgresConfig, 32); - -export const db = pool; diff --git a/lib/db/redis.ts b/lib/db/redis.ts deleted file mode 100644 index 7e8152f..0000000 --- a/lib/db/redis.ts +++ /dev/null @@ -1,3 +0,0 @@ -import { Redis } from "ioredis"; - -export const redis = new Redis({ maxRetriesPerRequest: null }); \ No newline at end of file diff --git a/lib/db/schema.d.ts b/lib/db/schema.d.ts deleted file mode 100644 index db8c9a4..0000000 --- a/lib/db/schema.d.ts +++ /dev/null @@ -1,9 +0,0 @@ -export interface AllDataType { - aid: number; - bvid: string | null; - description: string | null; - uid: number | null; - tags: string | null; - title: string | null; - published_at: string | null; -} \ No newline at end of file diff --git a/lib/ml/SentenceTransformer/index.ts b/lib/ml/SentenceTransformer/index.ts deleted file mode 100644 index 3676f2a..0000000 --- a/lib/ml/SentenceTransformer/index.ts +++ /dev/null @@ -1,19 
+0,0 @@ -import { SentenceTransformer } from "./model.ts"; // Changed import path - -async function main() { - const sentenceTransformer = await SentenceTransformer.from_pretrained( - "mixedbread-ai/mxbai-embed-large-v1", - ); - const outputs = await sentenceTransformer.encode([ - "Hello world", - "How are you guys doing?", - "Today is Friday!", - ]); - - // @ts-ignore - console.log(outputs["last_hidden_state"]); - - return outputs; -} - -main(); // Keep main function call if you want this file to be runnable directly for testing. diff --git a/lib/ml/SentenceTransformer/model.ts b/lib/ml/SentenceTransformer/model.ts deleted file mode 100644 index 7d8b507..0000000 --- a/lib/ml/SentenceTransformer/model.ts +++ /dev/null @@ -1,40 +0,0 @@ -// lib/ml/sentence_transformer_model.ts -import { AutoModel, AutoTokenizer, PretrainedOptions } from "@huggingface/transformers"; - -export class SentenceTransformer { - constructor( - private readonly tokenizer: AutoTokenizer, - private readonly model: AutoModel, - ) {} - - static async from_pretrained( - modelName: string, - options?: PretrainedOptions, - ): Promise { - if (!options) { - options = { - progress_callback: undefined, - cache_dir: undefined, - local_files_only: false, - revision: "main", - }; - } - const tokenizer = await AutoTokenizer.from_pretrained(modelName, options); - const model = await AutoModel.from_pretrained(modelName, options); - - return new SentenceTransformer(tokenizer, model); - } - - async encode(sentences: string[]): Promise { // Changed return type to 'any' for now to match console.log output - //@ts-ignore - const modelInputs = await this.tokenizer(sentences, { - padding: true, - truncation: true, - }); - - //@ts-ignore - const outputs = await this.model(modelInputs); - - return outputs; - } -} diff --git a/lib/ml/SentenceTransformer/pooling.ts b/lib/ml/SentenceTransformer/pooling.ts deleted file mode 100644 index 762feb7..0000000 --- a/lib/ml/SentenceTransformer/pooling.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { Tensor } from "@huggingface/transformers"; -//@ts-ignore -import { Callable } from "@huggingface/transformers/src/utils/core.js"; // Keep as is for now, might need adjustment - -export interface PoolingConfig { - word_embedding_dimension: number; - pooling_mode_cls_token: boolean; - pooling_mode_mean_tokens: boolean; - pooling_mode_max_tokens: boolean; - pooling_mode_mean_sqrt_len_tokens: boolean; -} - -export interface PoolingInput { - token_embeddings: Tensor; - attention_mask: Tensor; -} - -export interface PoolingOutput { - sentence_embedding: Tensor; -} - -export class Pooling extends Callable { - constructor(private readonly config: PoolingConfig) { - super(); - } - - // async _call(inputs: any) { // Keep if pooling functionality is needed - // return this.forward(inputs); - // } - - // async forward(inputs: PoolingInput): PoolingOutput { // Keep if pooling functionality is needed - - // } -} \ No newline at end of file diff --git a/lib/ml/classifyVideo.ts b/lib/ml/classifyVideo.ts deleted file mode 100644 index 6d27e8b..0000000 --- a/lib/ml/classifyVideo.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { AutoModel, AutoTokenizer, Tensor } from '@huggingface/transformers'; - -const modelName = "alikia2x/jina-embedding-v3-m2v-1024"; - -const modelConfig = { - config: { model_type: 'model2vec' }, - dtype: 'fp32', - revision: 'refs/pr/1', - cache_dir: undefined, - local_files_only: true, -}; -const tokenizerConfig = { - revision: 'refs/pr/2' -}; - -const model = await AutoModel.from_pretrained(modelName, modelConfig); 
-const tokenizer = await AutoTokenizer.from_pretrained(modelName, tokenizerConfig); - -const texts = ['hello', 'hello world']; -const { input_ids } = await tokenizer(texts, { add_special_tokens: false, return_tensor: false }); - -const cumsum = arr => arr.reduce((acc, num, i) => [...acc, num + (acc[i - 1] || 0)], []); -const offsets = [0, ...cumsum(input_ids.slice(0, -1).map(x => x.length))]; - -const flattened_input_ids = input_ids.flat(); -const modelInputs = { - input_ids: new Tensor('int64', flattened_input_ids, [flattened_input_ids.length]), - offsets: new Tensor('int64', offsets, [offsets.length]) -}; - -const { embeddings } = await model(modelInputs); -console.log(embeddings.tolist()); // output matches python version \ No newline at end of file diff --git a/lib/mq/exec/getLatestVideos.ts b/lib/mq/exec/getLatestVideos.ts deleted file mode 100644 index 08bad1c..0000000 --- a/lib/mq/exec/getLatestVideos.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { Job } from "bullmq"; -import { insertLatestVideos } from "lib/task/insertLatestVideo.ts"; -import { LatestVideosQueue } from "lib/mq/index.ts"; -import { MINUTE } from "$std/datetime/constants.ts"; -import { db } from "lib/db/init.ts"; -import { truncate } from "lib/utils/truncate.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import logger from "lib/log/logger.ts"; -import { lockManager } from "lib/mq/lockManager.ts"; - -const delayMap = [5, 10, 15, 30, 60, 60]; - -const updateQueueInterval = async (failedCount: number, delay: number) => { - logger.log(`job:getLatestVideos added to queue, delay: ${(delay / MINUTE).toFixed(2)} minutes.`, "mq"); - await LatestVideosQueue.upsertJobScheduler("getLatestVideos", { - every: delay, - }, { - data: { - failedCount: failedCount, - }, - }); - return; -}; - -const executeTask = async (client: Client, failedCount: number) => { - const result = await insertLatestVideos(client); - failedCount = result !== 0 ? truncate(failedCount + 1, 0, 5) : 0; - if (failedCount !== 0) { - await updateQueueInterval(failedCount, delayMap[failedCount] * MINUTE); - } - return; -}; - -export const getLatestVideosWorker = async (job: Job) => { - if (await lockManager.isLocked("getLatestVideos")) { - logger.log("job:getLatestVideos is locked, skipping.", "mq"); - return; - } - - lockManager.acquireLock("getLatestVideos"); - - const failedCount = (job.data.failedCount ?? 
0) as number; - const client = await db.connect(); - - try { - await executeTask(client, failedCount); - } finally { - client.release(); - lockManager.releaseLock("getLatestVideos"); - } - return; -}; diff --git a/lib/mq/exec/getVideoTags.ts b/lib/mq/exec/getVideoTags.ts deleted file mode 100644 index 72c678e..0000000 --- a/lib/mq/exec/getVideoTags.ts +++ /dev/null @@ -1,99 +0,0 @@ -import { Job } from "bullmq"; -import { VideoTagsQueue } from "lib/mq/index.ts"; -import { DAY, HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; -import { db } from "lib/db/init.ts"; -import { truncate } from "lib/utils/truncate.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import logger from "lib/log/logger.ts"; -import { getNullVideoTagsList, updateVideoTags } from "lib/db/allData.ts"; -import { getVideoTags } from "lib/net/getVideoTags.ts"; -import { NetSchedulerError } from "lib/mq/scheduler.ts"; -import { WorkerError } from "src/worker.ts"; - -const delayMap = [0.5, 3, 5, 15, 30, 60]; -const getJobPriority = (diff: number) => { - let priority; - if (diff > 14 * DAY) { - priority = 10; - } else if (diff > 7 * DAY) { - priority = 7; - } else if (diff > DAY) { - priority = 5; - } else if (diff > 6 * HOUR) { - priority = 3; - } else if (diff > HOUR) { - priority = 2; - } else { - priority = 1; - } - return priority; -}; - -const executeTask = async (client: Client, aid: number, failedCount: number, job: Job) => { - try { - const result = await getVideoTags(aid); - if (!result) { - failedCount = truncate(failedCount + 1, 0, 5); - const delay = delayMap[failedCount] * MINUTE; - logger.log( - `job:getVideoTags added to queue, delay: ${delayMap[failedCount]} minutes.`, - "mq", - ); - await VideoTagsQueue.add("getVideoTags", { aid, failedCount }, { delay, priority: 6 - failedCount }); - return 1; - } - await updateVideoTags(client, aid, result); - logger.log(`Fetched tags for aid: ${aid}`, "task"); - return 0; - } catch (e) { - if (!(e instanceof NetSchedulerError)) { - throw new WorkerError( e, "task", "getVideoTags/fn:executeTask"); - } - const err = e as NetSchedulerError; - if (err.code === "NO_AVAILABLE_PROXY" || err.code === "PROXY_RATE_LIMITED") { - logger.warn(`No available proxy for fetching tags, delayed. aid: ${aid}`, "task"); - await VideoTagsQueue.add("getVideoTags", { aid, failedCount }, { - delay: 25 * SECOND * Math.random() + 5 * SECOND, - priority: job.priority, - }); - return 2; - } - throw new WorkerError(err, "task", "getVideoTags/fn:executeTask"); - } -}; - -export const getVideoTagsWorker = async (job: Job) => { - const failedCount = (job.data.failedCount ?? 
0) as number; - const client = await db.connect(); - const aid = job.data.aid; - if (!aid) { - return 3; - } - - const v = await executeTask(client, aid, failedCount, job); - client.release(); - return v; -}; - -export const getVideoTagsInitializer = async () => { - const client = await db.connect(); - const videos = await getNullVideoTagsList(client); - if (videos.length == 0) { - return 4; - } - const count = await VideoTagsQueue.getJobCounts("wait", "delayed", "active"); - const total = count.delayed + count.active + count.wait; - const max = 15; - const rest = truncate(max - total, 0, max); - - let i = 0; - for (const video of videos) { - if (i > rest) return 100 + i; - const aid = video.aid; - const timestamp = video.published_at; - const diff = Date.now() - timestamp; - await VideoTagsQueue.add("getVideoTags", { aid }, { priority: getJobPriority(diff) }); - i++; - } - return 0; -}; diff --git a/lib/mq/executors.ts b/lib/mq/executors.ts deleted file mode 100644 index 6af60b2..0000000 --- a/lib/mq/executors.ts +++ /dev/null @@ -1 +0,0 @@ -export * from "lib/mq/exec/getLatestVideos.ts"; \ No newline at end of file diff --git a/lib/mq/index.ts b/lib/mq/index.ts deleted file mode 100644 index 22511ab..0000000 --- a/lib/mq/index.ts +++ /dev/null @@ -1,5 +0,0 @@ -import { Queue } from "bullmq"; - -export const LatestVideosQueue = new Queue("latestVideos"); - -export const VideoTagsQueue = new Queue("videoTags"); diff --git a/lib/mq/init.ts b/lib/mq/init.ts deleted file mode 100644 index a79ba03..0000000 --- a/lib/mq/init.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { MINUTE, SECOND } from "$std/datetime/constants.ts"; -import { LatestVideosQueue, VideoTagsQueue } from "lib/mq/index.ts"; -import logger from "lib/log/logger.ts"; - -async function configGetLatestVideos() { - await LatestVideosQueue.upsertJobScheduler("getLatestVideos", { - every: 1 * MINUTE, - }); -} - -async function configGetVideosTags() { - await VideoTagsQueue.upsertJobScheduler("getVideosTags", { - every: 30 * SECOND, - immediately: true, - }); -} - -export async function initMQ() { - await configGetLatestVideos(); - await configGetVideosTags(); - logger.log("Message queue initialized."); -} diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts deleted file mode 100644 index 25e7705..0000000 --- a/lib/mq/scheduler.ts +++ /dev/null @@ -1,164 +0,0 @@ -import logger from "lib/log/logger.ts"; -import {RateLimiter} from "lib/mq/rateLimiter.ts"; -import {SlidingWindow} from "lib/mq/slidingWindow.ts"; -import {redis} from "lib/db/redis.ts"; -import Redis from "ioredis"; - -interface Proxy { - type: string; - task: string; - limiter?: RateLimiter; -} - -interface ProxiesMap { - [name: string]: Proxy; -} - -type NetSchedulerErrorCode = - | "NO_AVAILABLE_PROXY" - | "PROXY_RATE_LIMITED" - | "PROXY_NOT_FOUND" - | "FETCH_ERROR" - | "NOT_IMPLEMENTED"; - -export class NetSchedulerError extends Error { - public code: NetSchedulerErrorCode; - public rawError: unknown | undefined; - constructor(message: string, errorCode: NetSchedulerErrorCode, rawError?: unknown) { - super(message); - this.name = "NetSchedulerError"; - this.code = errorCode; - this.rawError = rawError; - } -} - -class NetScheduler { - private proxies: ProxiesMap = {}; - - addProxy(name: string, type: string, task: string): void { - this.proxies[name] = { type, task }; - } - - removeProxy(name: string): void { - delete this.proxies[name]; - } - - setProxyLimiter(name: string, limiter: RateLimiter): void { - this.proxies[name].limiter = limiter; - } - - /* - * Make a request 
to the specified URL with any available proxy - * @param {string} url - The URL to request. - * @param {string} method - The HTTP method to use for the request. Default is "GET". - * @returns {Promise} - A promise that resolves to the response body. - * @throws {NetSchedulerError} - The error will be thrown in following cases: - * - No available proxy currently: with error code NO_AVAILABLE_PROXY - * - Proxy is under rate limit: with error code PROXY_RATE_LIMITED - * - The native `fetch` function threw an error: with error code FETCH_ERROR - * - The proxy type is not supported: with error code NOT_IMPLEMENTED - */ - async request(url: string, task: string, method: string = "GET"): Promise { - // find a available proxy - const proxiesNames = Object.keys(this.proxies); - for (const proxyName of proxiesNames) { - const proxy = this.proxies[proxyName]; - if (proxy.task !== task) continue; - if (await this.getProxyAvailability(proxyName)) { - return await this.proxyRequest(url, proxyName, method); - } - } - throw new NetSchedulerError("No available proxy currently.", "NO_AVAILABLE_PROXY"); - } - - /* - * Make a request to the specified URL with the specified proxy - * @param {string} url - The URL to request. - * @param {string} proxyName - The name of the proxy to use. - * @param {string} method - The HTTP method to use for the request. Default is "GET". - * @param {boolean} force - If true, the request will be made even if the proxy is rate limited. Default is false. - * @returns {Promise} - A promise that resolves to the response body. - * @throws {NetSchedulerError} - The error will be thrown in following cases: - * - Proxy not found: with error code PROXY_NOT_FOUND - * - Proxy is under rate limit: with error code PROXY_RATE_LIMITED - * - The native `fetch` function threw an error: with error code FETCH_ERROR - * - The proxy type is not supported: with error code NOT_IMPLEMENTED - */ - async proxyRequest(url: string, proxyName: string, method: string = "GET", force: boolean = false): Promise { - const proxy = this.proxies[proxyName]; - if (!proxy) { - throw new NetSchedulerError(`Proxy "${proxyName}" not found`, "PROXY_NOT_FOUND"); - } - - if (!force && await this.getProxyAvailability(proxyName) === false) { - throw new NetSchedulerError(`Proxy "${proxyName}" is rate limited`, "PROXY_RATE_LIMITED"); - } - - if (proxy.limiter) { - try { - await proxy.limiter!.trigger(); - } catch (e) { - const error = e as Error; - if (e instanceof Redis.ReplyError) { - logger.error(error, "redis"); - } - logger.warn(`Unhandled error: ${error.message}`, "mq", "proxyRequest"); - } - } - - switch (proxy.type) { - case "native": - return await this.nativeRequest(url, method); - default: - throw new NetSchedulerError(`Proxy type ${proxy.type} not supported.`, "NOT_IMPLEMENTED"); - } - } - - private async getProxyAvailability(name: string): Promise { - try { - const proxyConfig = this.proxies[name]; - if (!proxyConfig || !proxyConfig.limiter) { - return true; - } - return await proxyConfig.limiter.getAvailability(); - } catch (e) { - const error = e as Error; - if (e instanceof Redis.ReplyError) { - logger.error(error, "redis"); - return false; - } - logger.warn(`Unhandled error: ${error.message}`, "mq", "getProxyAvailability"); - return false; - } - } - - private async nativeRequest(url: string, method: string): Promise { - try { - const response = await fetch(url, { method }); - return await response.json() as R; - } catch (e) { - throw new NetSchedulerError("Fetch error", "FETCH_ERROR", e); - } - } -} - -const 
netScheduler = new NetScheduler(); -netScheduler.addProxy("default", "native", "default"); -netScheduler.addProxy("tags-native", "native", "getVideoTags"); -const tagsRateLimiter = new RateLimiter("getVideoTags", [ - { - window: new SlidingWindow(redis, 1), - max: 3, - }, - { - window: new SlidingWindow(redis, 30), - max: 30, - }, - { - window: new SlidingWindow(redis, 2 * 60), - max: 50, - }, -]); -netScheduler.setProxyLimiter("tags-native", tagsRateLimiter); - -export default netScheduler; diff --git a/lib/net/bilibili.d.ts b/lib/net/bilibili.d.ts deleted file mode 100644 index a0f682d..0000000 --- a/lib/net/bilibili.d.ts +++ /dev/null @@ -1,117 +0,0 @@ -interface BaseResponse { - code: number; - message: string; - ttl: number; - data: T; -} - -export type VideoListResponse = BaseResponse; -export type VideoTagsResponse = BaseResponse; - -type VideoTagsData = VideoTags[]; - -interface VideoTags { - tag_id: number; - tag_name: string; - cover: string; - head_cover: string; - content: string; - short_content: string; - type: number; - state: number; - ctime: number; - count: { - view: number; - use: number; - atten: number; - } - is_atten: number; - likes: number; - hates: number; - attribute: number; - liked: number; - hated: number; - extra_attr: number; -} - -interface VideoListData { - archives: VideoListVideo[]; - page: { - num: number; - size: number; - count: number; - }; -} - -interface VideoListVideo { - aid: number; - videos: number; - tid: number; - tname: string; - copyright: number; - pic: string; - title: string; - pubdate: number; - ctime: number; - desc: string; - state: number; - duration: number; - mission_id?: number; - rights: { - bp: number; - elec: number; - download: number; - movie: number; - pay: number; - hd5: number; - no_reprint: number; - autoplay: number; - ugc_pay: number; - is_cooperation: number; - ugc_pay_preview: number; - no_background: number; - arc_pay: number; - pay_free_watch: number; - }, - owner: { - mid: number; - name: string; - face: string; - }, - stat: { - aid: number; - view: number; - danmaku: number; - reply: number; - favorite: number; - coin: number; - share: number; - now_rank: number; - his_rank: number; - like: number; - dislike: number; - vt: number; - vv: number; - }, - dynamic: string; - cid: number; - dimension: { - width: number; - height: number; - rotate: number; - }, - season_id?: number; - short_link_v2: string; - first_frame: string; - pub_location: string; - cover43: string; - tidv2: number; - tname_v2: string; - bvid: string; - season_type: number; - is_ogv: number; - ovg_info: string | null; - rcmd_season: string; - enable_vt: number; - ai_rcmd: null | string; -} diff --git a/lib/net/bisectVideoStartFrom.ts b/lib/net/bisectVideoStartFrom.ts deleted file mode 100644 index d663e6c..0000000 --- a/lib/net/bisectVideoStartFrom.ts +++ /dev/null @@ -1,88 +0,0 @@ -import { getLatestVideos } from "lib/net/getLatestVideos.ts"; -import { AllDataType } from "lib/db/schema.d.ts"; - -export async function getVideoPositionInNewList(timestamp: number): Promise { - const virtualPageSize = 50; - - let lowPage = 1; - let highPage = 1; - let foundUpper = false; - while (true) { - const ps = highPage < 2 ? 50 : 1 - const pn = highPage < 2 ? 1 : highPage * virtualPageSize; - const fetchTags = highPage < 2 ? 
true : false; - const videos = await getLatestVideos(pn, ps, 250, fetchTags); - if (!videos || videos.length === 0) { - break; - } - const lastVideo = videos[videos.length - 1]; - if (!lastVideo || !lastVideo.published_at) { - break; - } - const lastTime = Date.parse(lastVideo.published_at); - if (lastTime <= timestamp && highPage == 1) { - return videos; - } - else if (lastTime <= timestamp) { - foundUpper = true; - break; - } else { - lowPage = highPage; - highPage *= 2; - } - } - - if (!foundUpper) { - return null; - } - - let boundaryPage = highPage; - let lo = lowPage; - let hi = highPage; - while (lo <= hi) { - const mid = Math.floor((lo + hi) / 2); - const videos = await getLatestVideos(mid * virtualPageSize, 1, 250, false); - if (!videos) { - return null; - } - if (videos.length === 0) { - hi = mid - 1; - continue; - } - const lastVideo = videos[videos.length - 1]; - if (!lastVideo || !lastVideo.published_at) { - hi = mid - 1; - continue; - } - const lastTime = Date.parse(lastVideo.published_at); - if (lastTime > timestamp) { - lo = mid + 1; - } else { - boundaryPage = mid; - hi = mid - 1; - } - } - - const boundaryVideos = await getLatestVideos(boundaryPage, virtualPageSize, 250, false); - let indexInPage = 0; - if (boundaryVideos && boundaryVideos.length > 0) { - for (let i = 0; i < boundaryVideos.length; i++) { - const video = boundaryVideos[i]; - if (!video.published_at) { - continue; - } - const videoTime = Date.parse(video.published_at); - if (videoTime > timestamp) { - indexInPage++; - } else { - break; - } - } - } - - const count = (boundaryPage - 1) * virtualPageSize + indexInPage; - - const safetyMargin = 5; - - return count + safetyMargin; -} diff --git a/lib/net/getLatestVideos.ts b/lib/net/getLatestVideos.ts deleted file mode 100644 index 33b539c..0000000 --- a/lib/net/getLatestVideos.ts +++ /dev/null @@ -1,45 +0,0 @@ -import { VideoListResponse } from "lib/net/bilibili.d.ts"; -import { formatTimestampToPsql as formatPublishedAt } from "lib/utils/formatTimestampToPostgre.ts"; -import { AllDataType } from "lib/db/schema.d.ts"; -import logger from "lib/log/logger.ts"; -import { HOUR, SECOND } from "$std/datetime/constants.ts"; - -export async function getLatestVideos( - page: number = 1, - pageSize: number = 10, - sleepRate: number = 250, - fetchTags: boolean = true, -): Promise { - try { - const response = await fetch( - `https://api.bilibili.com/x/web-interface/newlist?rid=30&ps=${pageSize}&pn=${page}`, - ); - const data: VideoListResponse = await response.json(); - - if (data.code !== 0) { - logger.error(`Error fetching videos: ${data.message}`, "net", "getLatestVideos"); - return null; - } - - if (data.data.archives.length === 0) { - logger.verbose("No more videos found", "net", "getLatestVideos"); - return []; - } - - return data.data.archives.map((video) => { - const published_at = formatPublishedAt(video.pubdate * SECOND + 8 * HOUR); - return { - aid: video.aid, - bvid: video.bvid, - description: video.desc, - uid: video.owner.mid, - tags: null, - title: video.title, - published_at: published_at, - } as AllDataType; - }); - } catch (error) { - logger.error(error as Error, "net", "getLatestVideos"); - return null; - } -} diff --git a/lib/net/getVideoTags.ts b/lib/net/getVideoTags.ts deleted file mode 100644 index 4ec0af6..0000000 --- a/lib/net/getVideoTags.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { VideoTagsResponse } from "lib/net/bilibili.d.ts"; -import netScheduler, {NetSchedulerError} from "lib/mq/scheduler.ts"; -import logger from "lib/log/logger.ts"; - -/* - 
* Fetch the tags for a video - * @param {number} aid The video's aid - * @return {Promise} A promise, which resolves to an array of tags, - * or null if an `fetch` error occurred - * @throws {NetSchedulerError} If the request failed. - */ -export async function getVideoTags(aid: number): Promise { - try { - const url = `https://api.bilibili.com/x/tag/archive/tags?aid=${aid}`; - const data = await netScheduler.request(url, 'getVideoTags'); - if (data.code != 0) { - logger.error(`Error fetching tags for video ${aid}: ${data.message}`, 'net', 'getVideoTags'); - return []; - } - return data.data.map((tag) => tag.tag_name); - } - catch (e) { - const error = e as NetSchedulerError; - if (error.code == "FETCH_ERROR") { - const rawError = error.rawError! as Error; - rawError.message = `Error fetching tags for video ${aid}: ` + rawError.message; - logger.error(rawError, 'net', 'getVideoTags'); - return null; - } - else { - // Re-throw the error - throw e; - } - } -} diff --git a/lib/task/insertLatestVideo.ts b/lib/task/insertLatestVideo.ts deleted file mode 100644 index af932fb..0000000 --- a/lib/task/insertLatestVideo.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { getLatestVideos } from "lib/net/getLatestVideos.ts"; -import { getLatestVideoTimestampFromAllData, insertIntoAllData, videoExistsInAllData } from "lib/db/allData.ts"; -import { sleep } from "lib/utils/sleep.ts"; -import { getVideoPositionInNewList } from "lib/net/bisectVideoStartFrom.ts"; -import { SECOND } from "$std/datetime/constants.ts"; -import logger from "lib/log/logger.ts"; - -export async function insertLatestVideos( - client: Client, - pageSize: number = 10, - sleepRate: number = 250, - intervalRate: number = 4000, -): Promise { - const latestVideoTimestamp = await getLatestVideoTimestampFromAllData(client); - if (latestVideoTimestamp == null) { - logger.error("Cannot get latest video timestamp from current database.", "net", "fn:insertLatestVideos()"); - return null - } - logger.log(`Latest video in the database: ${new Date(latestVideoTimestamp).toISOString()}`, "net", "fn:insertLatestVideos()") - const videoIndex = await getVideoPositionInNewList(latestVideoTimestamp); - if (videoIndex == null) { - logger.error("Cannot locate the video through bisect.", "net", "fn:insertLatestVideos()"); - return null - } - if (typeof videoIndex == "object") { - for (const video of videoIndex) { - const videoExists = await videoExistsInAllData(client, video.aid); - if (!videoExists) { - insertIntoAllData(client, video); - } - } - return 0; - } - let page = Math.floor(videoIndex / pageSize) + 1; - let failCount = 0; - const insertedVideos = new Set(); - while (true) { - try { - const videos = await getLatestVideos(page, pageSize, sleepRate); - if (videos == null) { - failCount++; - if (failCount > 5) { - return null; - } - continue; - } - failCount = 0; - if (videos.length == 0) { - logger.verbose("No more videos found", "net", "fn:insertLatestVideos()"); - break; - } - for (const video of videos) { - const videoExists = await videoExistsInAllData(client, video.aid); - if (!videoExists) { - insertIntoAllData(client, video); - insertedVideos.add(video.aid); - } - } - logger.log(`Page ${page} crawled, total: ${insertedVideos.size} videos.`, "net", "fn:insertLatestVideos()"); - page--; - if (page < 1) { - return 0; - } - } catch (error) { - logger.error(error as Error, "net", "fn:insertLatestVideos()"); - failCount++; - if (failCount > 5) { - return null; - } - continue; - } 
finally { - await sleep(Math.random() * intervalRate + failCount * 3 * SECOND + SECOND); - } - } - return 0; -} diff --git a/main.ts b/main.ts deleted file mode 100644 index 675f529..0000000 --- a/main.ts +++ /dev/null @@ -1,13 +0,0 @@ -/// -/// -/// -/// -/// - -import "$std/dotenv/load.ts"; - -import { start } from "$fresh/server.ts"; -import manifest from "./fresh.gen.ts"; -import config from "./fresh.config.ts"; - -await start(manifest, config); diff --git a/filter/RunningLogs.txt b/ml/filter/RunningLogs.txt similarity index 63% rename from filter/RunningLogs.txt rename to ml/filter/RunningLogs.txt index 29ce991..65b0d04 100644 --- a/filter/RunningLogs.txt +++ b/ml/filter/RunningLogs.txt @@ -18,4 +18,13 @@ Note 0324: V3.5-test3 # 用回3.2的FC层试试 0331: V3.6-test3 # 3.5不太行,我试着调下超参 0335: V3.7-test3 # 3.6还行,再调超参试试看 -0414: V3.8-test3 # 3.7不行,从3.6的基础重新调 \ No newline at end of file +0414: V3.8-test3 # 3.7不行,从3.6的基础重新调 +1918: V3.9 +2308: V3.11 +2243: V3.11 # 256维嵌入 +2253: V3.11 # 1024维度嵌入(对比) +2337: V3.12 # 级联分类 +2350: V3.13 # V3.12, 换用普通交叉熵损失 +0012: V3.11 # 换用普通交叉熵损失 +0039: V3.11 # 级联分类,但使用两个独立模型 +0122: V3.15 # 删除author_info通道 \ No newline at end of file diff --git a/filter/checkpoint_conversion.py b/ml/filter/checkpoint_conversion.py similarity index 100% rename from filter/checkpoint_conversion.py rename to ml/filter/checkpoint_conversion.py diff --git a/filter/clean_dataset.py b/ml/filter/clean_dataset.py similarity index 100% rename from filter/clean_dataset.py rename to ml/filter/clean_dataset.py diff --git a/filter/dataset.py b/ml/filter/dataset.py similarity index 98% rename from filter/dataset.py rename to ml/filter/dataset.py index 7a4edc1..4f992b0 100644 --- a/filter/dataset.py +++ b/ml/filter/dataset.py @@ -103,8 +103,7 @@ class MultiChannelDataset(Dataset): texts = { 'title': example['title'], 'description': example['description'], - 'tags': tags_text, - 'author_info': example['author_info'] + 'tags': tags_text } return { diff --git a/filter/db_utils.py b/ml/filter/db_utils.py similarity index 100% rename from filter/db_utils.py rename to ml/filter/db_utils.py diff --git a/ml/filter/embedding.py b/ml/filter/embedding.py new file mode 100644 index 0000000..ccecc9a --- /dev/null +++ b/ml/filter/embedding.py @@ -0,0 +1,110 @@ +import numpy as np +import torch +from model2vec import StaticModel + + +def prepare_batch(batch_data, device="cpu"): + """ + 将输入的 batch_data 转换为模型所需的输入格式 [batch_size, num_channels, embedding_dim]。 + + 参数: + batch_data (dict): 输入的 batch 数据,格式为 { + "title": [text1, text2, ...], + "description": [text1, text2, ...], + "tags": [text1, text2, ...] + } + device (str): 模型运行的设备(如 "cpu" 或 "cuda")。 + + 返回: + torch.Tensor: 形状为 [batch_size, num_channels, embedding_dim] 的张量。 + """ + # 1. 对每个通道的文本分别编码 + channel_embeddings = [] + model = StaticModel.from_pretrained("./model/embedding_1024/") + for channel in ["title", "description", "tags"]: + texts = batch_data[channel] # 获取当前通道的文本列表 + embeddings = torch.from_numpy(model.encode(texts)).to(torch.float32).to(device) # 编码为 [batch_size, embedding_dim] + channel_embeddings.append(embeddings) + + # 2. 
将编码结果堆叠为 [batch_size, num_channels, embedding_dim] + batch_tensor = torch.stack(channel_embeddings, dim=1) # 在 dim=1 上堆叠 + return batch_tensor + +import onnxruntime as ort +from transformers import AutoTokenizer +from itertools import accumulate + +def prepare_batch_per_token(batch_data, max_length=1024): + """ + 将输入的 batch_data 转换为模型所需的输入格式 [batch_size, num_channels, seq_length, embedding_dim]。 + + 参数: + batch_data (dict): 输入的 batch 数据,格式为 { + "title": [text1, text2, ...], + "description": [text1, text2, ...], + "tags": [text1, text2, ...], + "author_info": [text1, text2, ...] + } + max_length (int): 最大序列长度。 + + 返回: + torch.Tensor: 形状为 [batch_size, num_channels, seq_length, embedding_dim] 的张量。 + """ + # 初始化 tokenizer 和 ONNX 模型 + tokenizer = AutoTokenizer.from_pretrained("alikia2x/jina-embedding-v3-m2v-1024") + session = ort.InferenceSession("./model/embedding_256/onnx/model.onnx") + + # 1. 对每个通道的文本分别编码 + channel_embeddings = [] + for channel in ["title", "description", "tags", "author_info"]: + texts = batch_data[channel] # 获取当前通道的文本列表 + + # Step 1: 生成 input_ids 和 offsets + # 对每个文本单独编码,保留原始 token 长度 + encoded_inputs = [tokenizer(text, truncation=True, max_length=max_length, return_tensors='np') for text in texts] + + # 提取每个文本的 input_ids 长度(考虑实际的 token 数量) + input_ids_lengths = [len(enc["input_ids"][0]) for enc in encoded_inputs] + + # 生成 offsets: [0, len1, len1+len2, ...] + offsets = list(accumulate([0] + input_ids_lengths[:-1])) # 累积和,排除最后一个长度 + + # 将所有 input_ids 展平为一维数组 + flattened_input_ids = np.concatenate([enc["input_ids"][0] for enc in encoded_inputs], axis=0).astype(np.int64) + + # Step 2: 构建 ONNX 输入 + inputs = { + "input_ids": ort.OrtValue.ortvalue_from_numpy(flattened_input_ids), + "offsets": ort.OrtValue.ortvalue_from_numpy(np.array(offsets, dtype=np.int64)) + } + + # Step 3: 运行 ONNX 模型 + embeddings = session.run(None, inputs)[0] # 假设输出名为 "embeddings" + + # Step 4: 将输出重塑为 [batch_size, seq_length, embedding_dim] + # 注意:这里假设 ONNX 输出的形状是 [total_tokens, embedding_dim] + # 需要根据实际序列长度重新分组 + batch_size = len(texts) + embeddings_split = np.split(embeddings, np.cumsum(input_ids_lengths[:-1])) + padded_embeddings = [] + for emb, seq_len in zip(embeddings_split, input_ids_lengths): + # 对每个序列填充到 max_length + if seq_len > max_length: + # 如果序列长度超过 max_length,截断 + emb = emb[:max_length] + pad_length = 0 + else: + # 否则填充到 max_length + pad_length = max_length - seq_len + + # 填充到 [max_length, embedding_dim] + padded = np.pad(emb, ((0, pad_length), (0, 0)), mode='constant') + padded_embeddings.append(padded) + + # 确保所有填充后的序列形状一致 + embeddings_tensor = torch.tensor(np.stack(padded_embeddings), dtype=torch.float32) + channel_embeddings.append(embeddings_tensor) + + # 2. 
将编码结果堆叠为 [batch_size, num_channels, seq_length, embedding_dim] + batch_tensor = torch.stack(channel_embeddings, dim=1) + return batch_tensor \ No newline at end of file diff --git a/ml/filter/embedding_range.py b/ml/filter/embedding_range.py new file mode 100644 index 0000000..1286400 --- /dev/null +++ b/ml/filter/embedding_range.py @@ -0,0 +1,54 @@ +import json +import torch +import random +from embedding import prepare_batch +from tqdm import tqdm +import numpy as np +import matplotlib.pyplot as plt + +file_path = './data/filter/model_predicted.jsonl' + +class Dataset: + def __init__(self, file_path): + all_examples = self.load_data(file_path) + self.examples = all_examples + + def load_data(self, file_path): + with open(file_path, 'r', encoding='utf-8') as f: + return [json.loads(line) for line in f] + + def __getitem__(self, idx): + end_idx = min((idx + 1) * self.batch_size, len(self.examples)) + texts = { + 'title': [ex['title'] for ex in self.examples[idx * self.batch_size:end_idx]], + 'description': [ex['description'] for ex in self.examples[idx * self.batch_size:end_idx]], + 'tags': [",".join(ex['tags']) for ex in self.examples[idx * self.batch_size:end_idx]], + 'author_info': [ex['author_info'] for ex in self.examples[idx * self.batch_size:end_idx]] + } + return texts + + def __len__(self): + return len(self.examples) + + def get_batch(self, idx, batch_size): + self.batch_size = batch_size + return self.__getitem__(idx) + +total = 600000 +batch_size = 512 +batch_num = total // batch_size +dataset = Dataset(file_path) +arr_len = batch_size * 4 * 1024 +sample_rate = 0.1 +sample_num = int(arr_len * sample_rate) + +data = np.array([]) +for i in tqdm(range(batch_num)): + batch = dataset.get_batch(i, batch_size) + batch = prepare_batch(batch, device="cpu") + arr = batch.flatten().numpy() + sampled = np.random.choice(arr.shape[0], size=sample_num, replace=False) + data = np.concatenate((data, arr[sampled]), axis=0) if data.size else arr[sampled] + if i % 10 == 0: + np.save('embedding_range.npy', data) +np.save('embedding_range.npy', data) \ No newline at end of file diff --git a/ml/filter/embedding_visualization.py b/ml/filter/embedding_visualization.py new file mode 100644 index 0000000..be6abad --- /dev/null +++ b/ml/filter/embedding_visualization.py @@ -0,0 +1,43 @@ +import numpy as np +import matplotlib.pyplot as plt + +# 加载数据 +data = np.load("1.npy") + +# 绘制直方图,获取频数 +n, bins, patches = plt.hist(data, bins=32, density=False, alpha=0.7, color='skyblue') + +# 计算数据总数 +total_data = len(data) + +# 将频数转换为频率 +frequencies = n / total_data + +# 计算统计信息 +max_val = np.max(data) +min_val = np.min(data) +std_dev = np.std(data) + +# 设置图形属性 +plt.title('Frequency Distribution Histogram') +plt.xlabel('Value') +plt.ylabel('Frequency') + +# 重新绘制直方图,使用频率作为高度 +plt.cla() # 清除当前坐标轴上的内容 +plt.bar([(bins[i] + bins[i+1])/2 for i in range(len(bins)-1)], frequencies, width=[bins[i+1]-bins[i] for i in range(len(bins)-1)], alpha=0.7, color='skyblue') + +# 在柱子上注明频率值 +for i in range(len(patches)): + plt.text(bins[i]+(bins[i+1]-bins[i])/2, frequencies[i], f'{frequencies[i]:.2e}', ha='center', va='bottom', fontsize=6) + +# 在图表一角显示统计信息 +stats_text = f"Max: {max_val:.6f}\nMin: {min_val:.6f}\nStd: {std_dev:.4e}" +plt.text(0.95, 0.95, stats_text, transform=plt.gca().transAxes, + ha='right', va='top', bbox=dict(facecolor='white', edgecolor='black', alpha=0.8)) + +# 设置 x 轴刻度对齐柱子边界 +plt.xticks(bins, fontsize = 6) + +# 显示图形 +plt.show() \ No newline at end of file diff --git a/filter/labeling_system.py 
b/ml/filter/labeling_system.py similarity index 99% rename from filter/labeling_system.py rename to ml/filter/labeling_system.py index 504e19c..caba2d9 100644 --- a/filter/labeling_system.py +++ b/ml/filter/labeling_system.py @@ -10,7 +10,7 @@ import tty import termios from sentence_transformers import SentenceTransformer from db_utils import fetch_entry_data, parse_entry_data -from modelV3_9 import VideoClassifierV3_9 +from modelV3_10 import VideoClassifierV3_10 class LabelingSystem: def __init__(self, mode='model_testing', database_path="./data/main.db", @@ -27,7 +27,7 @@ class LabelingSystem: self.model = None self.sentence_transformer = None if self.mode == 'model_testing': - self.model = VideoClassifierV3_9() + self.model = VideoClassifierV3_10() self.model.load_state_dict(torch.load(model_path)) self.model.eval() self.sentence_transformer = SentenceTransformer("Thaweewat/jina-embedding-v3-m2v-1024") diff --git a/filter/model.py b/ml/filter/model.py similarity index 100% rename from filter/model.py rename to ml/filter/model.py diff --git a/filter/modelV3_10.py b/ml/filter/modelV3_10.py similarity index 97% rename from filter/modelV3_10.py rename to ml/filter/modelV3_10.py index 909590b..9efd0e9 100644 --- a/filter/modelV3_10.py +++ b/ml/filter/modelV3_10.py @@ -3,13 +3,13 @@ import torch.nn as nn import torch.nn.functional as F class VideoClassifierV3_10(nn.Module): - def __init__(self, embedding_dim=1024, hidden_dim=648, output_dim=3): + def __init__(self, embedding_dim=1024, hidden_dim=648, output_dim=3, temperature=1.7): super().__init__() self.num_channels = 4 self.channel_names = ['title', 'description', 'tags', 'author_info'] # 可学习温度系数 - self.temperature = nn.Parameter(torch.tensor(1.7)) + self.temperature = nn.Parameter(torch.tensor(temperature)) # 带约束的通道权重(使用Sigmoid替代Softmax) self.channel_weights = nn.Parameter(torch.ones(self.num_channels)) diff --git a/ml/filter/modelV3_12.py b/ml/filter/modelV3_12.py new file mode 100644 index 0000000..49d5779 --- /dev/null +++ b/ml/filter/modelV3_12.py @@ -0,0 +1,79 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class VideoClassifierV3_12(nn.Module): + def __init__(self, embedding_dim=1024, hidden_dim=648): + super().__init__() + self.num_channels = 4 + self.channel_names = ['title', 'description', 'tags', 'author_info'] + + # 可学习温度系数 + self.temperature = nn.Parameter(torch.tensor(1.7)) + + # 带约束的通道权重(使用Sigmoid替代Softmax) + self.channel_weights = nn.Parameter(torch.ones(self.num_channels)) + + # 第一个二分类器:0 vs 1/2 + self.first_classifier = nn.Sequential( + nn.Linear(embedding_dim * self.num_channels, hidden_dim*2), + nn.BatchNorm1d(hidden_dim*2), + nn.Dropout(0.2), + nn.GELU(), + nn.Linear(hidden_dim*2, 2) # 输出为2类:0 vs 1/2 + ) + + # 第二个二分类器:1 vs 2 + self.second_classifier = nn.Sequential( + nn.Linear(embedding_dim * self.num_channels, hidden_dim*2), + nn.BatchNorm1d(hidden_dim*2), + nn.Dropout(0.2), + nn.GELU(), + nn.Linear(hidden_dim*2, 2) # 输出为2类:1 vs 2 + ) + + # 权重初始化 + self._init_weights() + + def _init_weights(self): + for layer in self.first_classifier: + if isinstance(layer, nn.Linear): + nn.init.kaiming_normal_(layer.weight, nonlinearity='relu') + nn.init.zeros_(layer.bias) + + for layer in self.second_classifier: + if isinstance(layer, nn.Linear): + nn.init.kaiming_normal_(layer.weight, nonlinearity='relu') + nn.init.zeros_(layer.bias) + + def forward(self, channel_features: torch.Tensor): + """ + 输入格式: [batch_size, num_channels, embedding_dim] + 输出格式: [batch_size, output_dim] + """ + # 自适应通道权重(Sigmoid约束) 
+ weights = torch.sigmoid(self.channel_weights) # [0,1]范围 + weighted_features = channel_features * weights.unsqueeze(0).unsqueeze(-1) + + # 特征拼接 + combined = weighted_features.view(weighted_features.size(0), -1) + + # 第一个二分类器:0 vs 1/2 + first_output = self.first_classifier(combined) + first_probs = F.softmax(first_output, dim=1) + + # 第二个二分类器:1 vs 2 + second_output = self.second_classifier(combined) + second_probs = F.softmax(second_output, dim=1) + + # 合并结果 + final_probs = torch.zeros(channel_features.size(0), 3).to(channel_features.device) + final_probs[:, 0] = first_probs[:, 0] # 类别0的概率 + final_probs[:, 1] = first_probs[:, 1] * second_probs[:, 0] # 类别1的概率 + final_probs[:, 2] = first_probs[:, 1] * second_probs[:, 1] # 类别2的概率 + + return final_probs + + def get_channel_weights(self): + """获取各通道权重(带温度调节)""" + return torch.softmax(self.channel_weights / self.temperature, dim=0).detach().cpu().numpy() diff --git a/filter/modelV3_9.py b/ml/filter/modelV3_15.py similarity index 78% rename from filter/modelV3_9.py rename to ml/filter/modelV3_15.py index 48bdc57..9e6be19 100644 --- a/filter/modelV3_9.py +++ b/ml/filter/modelV3_15.py @@ -2,14 +2,14 @@ import torch import torch.nn as nn import torch.nn.functional as F -class VideoClassifierV3_9(nn.Module): - def __init__(self, embedding_dim=1024, hidden_dim=648, output_dim=3): +class VideoClassifierV3_15(nn.Module): + def __init__(self, embedding_dim=1024, hidden_dim=648, output_dim=3, temperature=1.7): super().__init__() - self.num_channels = 4 - self.channel_names = ['title', 'description', 'tags', 'author_info'] + self.num_channels = 3 + self.channel_names = ['title', 'description', 'tags'] # 可学习温度系数 - self.temperature = nn.Parameter(torch.tensor(1.7)) + self.temperature = nn.Parameter(torch.tensor(temperature)) # 带约束的通道权重(使用Sigmoid替代Softmax) self.channel_weights = nn.Parameter(torch.ones(self.num_channels)) @@ -38,21 +38,11 @@ class VideoClassifierV3_9(nn.Module): nn.init.zeros_(layer.bias) - def forward(self, input_texts, sentence_transformer): - # 合并文本进行批量编码 - all_texts = [text for channel in self.channel_names for text in input_texts[channel]] - - # 冻结的文本编码 - with torch.no_grad(): - embeddings = torch.tensor( - sentence_transformer.encode(all_texts), - device=next(self.parameters()).device - ) - - # 分割并加权通道特征 - split_sizes = [len(input_texts[name]) for name in self.channel_names] - channel_features = torch.split(embeddings, split_sizes, dim=0) - channel_features = torch.stack(channel_features, dim=1) + def forward(self, channel_features: torch.Tensor): + """ + 输入格式: [batch_size, num_channels, embedding_dim] + 输出格式: [batch_size, output_dim] + """ # 自适应通道权重(Sigmoid约束) weights = torch.sigmoid(self.channel_weights) # [0,1]范围 diff --git a/ml/filter/modelV6_0.py b/ml/filter/modelV6_0.py new file mode 100644 index 0000000..32502fa --- /dev/null +++ b/ml/filter/modelV6_0.py @@ -0,0 +1,93 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class VideoClassifierV6_0(nn.Module): + def __init__(self, embedding_dim=256, seq_length=1024, hidden_dim=512, output_dim=3): + super().__init__() + self.num_channels = 4 + self.channel_names = ['title', 'description', 'tags', 'author_info'] + + # CNN特征提取层 + self.conv_layers = nn.Sequential( + # 第一层卷积 + nn.Conv2d(self.num_channels, 64, kernel_size=(3, 3), padding=1), + nn.BatchNorm2d(64), + nn.GELU(), + nn.MaxPool2d(kernel_size=(2, 2)), + + # 第二层卷积 + nn.Conv2d(64, 128, kernel_size=(3, 3), padding=1), + nn.BatchNorm2d(128), + nn.GELU(), + nn.MaxPool2d(kernel_size=(2, 2)), + + # 第三层卷积 + 
nn.Conv2d(128, 256, kernel_size=(3, 3), padding=1),
+            nn.BatchNorm2d(256),
+            nn.GELU(),
+
+            # 全局平均池化层
+            # 输出形状为 [batch_size, 256, 1, 1]
+            nn.AdaptiveAvgPool2d((1, 1))
+        )
+
+        # 全局池化后的特征维度固定为 256
+        self.feature_dim = 256
+
+        # 全连接层
+        self.fc = nn.Sequential(
+            nn.Linear(self.feature_dim, hidden_dim),
+            nn.BatchNorm1d(hidden_dim),
+            nn.Dropout(0.2),
+            nn.GELU(),
+            nn.Linear(hidden_dim, output_dim)
+        )
+
+        self._init_weights()
+
+    def _init_weights(self):
+        for module in self.modules():
+            if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
+                nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
+                if module.bias is not None:
+                    nn.init.zeros_(module.bias)
+
+    def forward(self, channel_features: torch.Tensor):
+        """
+        输入格式: [batch_size, num_channels, seq_length, embedding_dim]
+        输出格式: [batch_size, output_dim]
+        """
+        # CNN特征提取
+        conv_features = self.conv_layers(channel_features)
+
+        # 展平特征(全局池化后形状为 [batch_size, 256, 1, 1])
+        flat_features = conv_features.view(conv_features.size(0), -1)  # [batch_size, 256]
+
+        # 全连接层分类
+        return self.fc(flat_features)
+
+# 损失函数保持不变
+class AdaptiveRecallLoss(nn.Module):
+    def __init__(self, class_weights, alpha=0.8, gamma=2.0, fp_penalty=0.5):
+        super().__init__()
+        self.class_weights = class_weights
+        self.alpha = alpha
+        self.gamma = gamma
+        self.fp_penalty = fp_penalty
+
+    def forward(self, logits, targets):
+        ce_loss = F.cross_entropy(logits, targets, weight=self.class_weights, reduction='none')
+        pt = torch.exp(-ce_loss)
+        focal_loss = ((1 - pt) ** self.gamma) * ce_loss
+
+        class_mask = F.one_hot(targets, num_classes=len(self.class_weights))
+        class_weights = (self.alpha + (1 - self.alpha) * pt.unsqueeze(-1)) * class_mask
+        recall_loss = (class_weights * focal_loss.unsqueeze(-1)).sum(dim=1)
+
+        probs = F.softmax(logits, dim=1)
+        fp_mask = (targets != 0) & (torch.argmax(logits, dim=1) == 0)
+        fp_loss = self.fp_penalty * probs[:, 0][fp_mask].pow(2).sum()
+
+        total_loss = recall_loss.mean() + fp_loss / len(targets)
+        return total_loss
\ No newline at end of file
diff --git a/filter/onnx_export.py b/ml/filter/onnx_export.py
similarity index 76%
rename from filter/onnx_export.py
rename to ml/filter/onnx_export.py
index 6337ef3..848cda0 100644
--- a/filter/onnx_export.py
+++ b/ml/filter/onnx_export.py
@@ -1,16 +1,16 @@
 import torch
-from modelV3_10 import VideoClassifierV3_10
+from modelV3_15 import VideoClassifierV3_15
 
 
-def export_onnx(model_path="./filter/checkpoints/best_model_V3.10.pt",
-                onnx_path="./model/video_classifier_v3_10.onnx"):
+def export_onnx(model_path="./filter/checkpoints/best_model_V3.17.pt",
+                onnx_path="./model/video_classifier_v3_17.onnx"):
     # 初始化模型
-    model = VideoClassifierV3_10()
+    model = VideoClassifierV3_15()
     model.load_state_dict(torch.load(model_path))
     model.eval()
 
     # 创建符合输入规范的虚拟输入
-    dummy_input = torch.randn(1, 4, 1024)  # [batch=1, channels=4, embedding_dim=1024]
+    dummy_input = torch.randn(1, 3, 1024)  # [batch=1, channels=3, embedding_dim=1024]
 
     # 导出ONNX
     torch.onnx.export(
diff --git a/filter/predict.py b/ml/filter/predict.py
similarity index 100%
rename from filter/predict.py
rename to ml/filter/predict.py
diff --git a/ml/filter/quantize.py b/ml/filter/quantize.py
new file mode 100644
index 0000000..f694014
--- /dev/null
+++ b/ml/filter/quantize.py
@@ -0,0 +1,36 @@
+from safetensors import safe_open
+from safetensors.torch import save_file
+import torch
+
+# 配置路径
+model_path = "./model/embedding/model.safetensors"
+save_path = "./model/embedding/int8_model.safetensors"
+
+# 加载原始嵌入层
+with safe_open(model_path, 
framework="pt") as f: + embeddings_tensor = f.get_tensor("embeddings") + +# 计算极值 +min_val = torch.min(embeddings_tensor) +max_val = torch.max(embeddings_tensor) + +# 计算量化参数 +scale = (max_val - min_val) / 255 # int8 的范围是 256 个值(-128 到 127) + +# 将浮点数映射到 int8 范围 +int8_tensor = torch.round((embeddings_tensor - min_val) / scale).to(torch.int8) - 128 + +# 确保与原张量形状一致 +assert int8_tensor.shape == embeddings_tensor.shape + +# 保存映射后的 int8 张量 +save_file({"embeddings": int8_tensor}, save_path) + +# 输出反映射公式 +print("int8 反映射公式:") +m = min_val.item() +am = abs(min_val.item()) +sign = "-" if m < 0 else "+" +print(f"int8_tensor = (int8_value + 128) × {scale.item()} {sign} {am}") + +print("int8 映射完成!") \ No newline at end of file diff --git a/filter/tag.py b/ml/filter/tag.py similarity index 100% rename from filter/tag.py rename to ml/filter/tag.py diff --git a/filter/test.py b/ml/filter/test.py similarity index 87% rename from filter/test.py rename to ml/filter/test.py index 1554c98..4a29421 100644 --- a/filter/test.py +++ b/ml/filter/test.py @@ -1,7 +1,7 @@ from labeling_system import LabelingSystem DATABASE_PATH = "./data/main.db" -MODEL_PATH = "./filter/checkpoints/best_model_V3.9.pt" +MODEL_PATH = "./filter/checkpoints/best_model_V3.11.pt" OUTPUT_FILE = "./data/filter/real_test.jsonl" BATCH_SIZE = 50 diff --git a/filter/train.py b/ml/filter/train.py similarity index 93% rename from filter/train.py rename to ml/filter/train.py index ad0bc3d..dca219f 100644 --- a/filter/train.py +++ b/ml/filter/train.py @@ -4,17 +4,16 @@ import numpy as np from torch.utils.data import DataLoader import torch.optim as optim from dataset import MultiChannelDataset -from filter.modelV3_10 import VideoClassifierV3_10, AdaptiveRecallLoss -from sentence_transformers import SentenceTransformer +from filter.modelV3_15 import AdaptiveRecallLoss, VideoClassifierV3_15 from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report import os import torch -from torch.utils.tensorboard import SummaryWriter # 引入 TensorBoard +from torch.utils.tensorboard import SummaryWriter import time from embedding import prepare_batch +import torch.nn as nn -# 动态生成子目录名称 run_name = f"run_{time.strftime('%Y%m%d_%H%M')}" log_dir = os.path.join('./filter/runs', run_name) @@ -52,9 +51,8 @@ class_weights = torch.tensor( ) # 初始化模型和SentenceTransformer -sentence_transformer = SentenceTransformer("Thaweewat/jina-embedding-v3-m2v-1024") -model = VideoClassifierV3_10() -checkpoint_name = './filter/checkpoints/best_model_V3.11.pt' +model = VideoClassifierV3_15() +checkpoint_name = './filter/checkpoints/best_model_V3.17.pt' # 模型保存路径 os.makedirs('./filter/checkpoints', exist_ok=True) @@ -78,7 +76,7 @@ def evaluate(model, dataloader): with torch.no_grad(): for batch in dataloader: - batch_tensor = prepare_batch(batch['texts'], device="cpu") + batch_tensor = prepare_batch(batch['texts']) logits = model(batch_tensor) preds = torch.argmax(logits, dim=1) all_preds.extend(preds.cpu().numpy()) @@ -111,9 +109,8 @@ for epoch in range(num_epochs): for batch_idx, batch in enumerate(train_loader): optimizer.zero_grad() - batch_tensor = prepare_batch(batch['texts'], device="cpu") + batch_tensor = prepare_batch(batch['texts']) - # 传入文本字典和sentence_transformer logits = model(batch_tensor) loss = criterion(logits, batch['label']) diff --git a/lab/.gitignore b/ml/lab/.gitignore similarity index 100% rename from lab/.gitignore rename to ml/lab/.gitignore diff --git a/lab/align-pipeline.md b/ml/lab/align-pipeline.md similarity index 100% 
rename from lab/align-pipeline.md rename to ml/lab/align-pipeline.md diff --git a/lab/mmsAlignment/align2LRC.py b/ml/lab/mmsAlignment/align2LRC.py similarity index 100% rename from lab/mmsAlignment/align2LRC.py rename to ml/lab/mmsAlignment/align2LRC.py diff --git a/lab/mmsAlignment/alignWithMMS.py b/ml/lab/mmsAlignment/alignWithMMS.py similarity index 100% rename from lab/mmsAlignment/alignWithMMS.py rename to ml/lab/mmsAlignment/alignWithMMS.py diff --git a/lab/mmsAlignment/splitSong.py b/ml/lab/mmsAlignment/splitSong.py similarity index 100% rename from lab/mmsAlignment/splitSong.py rename to ml/lab/mmsAlignment/splitSong.py diff --git a/lab/utils/audio.py b/ml/lab/utils/audio.py similarity index 100% rename from lab/utils/audio.py rename to ml/lab/utils/audio.py diff --git a/lab/utils/cleanTempDir.py b/ml/lab/utils/cleanTempDir.py similarity index 100% rename from lab/utils/cleanTempDir.py rename to ml/lab/utils/cleanTempDir.py diff --git a/lab/utils/ttml.py b/ml/lab/utils/ttml.py similarity index 100% rename from lab/utils/ttml.py rename to ml/lab/utils/ttml.py diff --git a/lab/whisperAlignment/align2srt.py b/ml/lab/whisperAlignment/align2srt.py similarity index 100% rename from lab/whisperAlignment/align2srt.py rename to ml/lab/whisperAlignment/align2srt.py diff --git a/lab/whisperAlignment/alignWithGroup.py b/ml/lab/whisperAlignment/alignWithGroup.py similarity index 100% rename from lab/whisperAlignment/alignWithGroup.py rename to ml/lab/whisperAlignment/alignWithGroup.py diff --git a/lab/whisperAlignment/splitGroups.py b/ml/lab/whisperAlignment/splitGroups.py similarity index 100% rename from lab/whisperAlignment/splitGroups.py rename to ml/lab/whisperAlignment/splitGroups.py diff --git a/lab/whisperAlignment/srt2lrc.py b/ml/lab/whisperAlignment/srt2lrc.py similarity index 100% rename from lab/whisperAlignment/srt2lrc.py rename to ml/lab/whisperAlignment/srt2lrc.py diff --git a/ml/pred/count.py b/ml/pred/count.py new file mode 100644 index 0000000..5ed2d81 --- /dev/null +++ b/ml/pred/count.py @@ -0,0 +1,12 @@ +# iterate all json files in ./data/pred + +import os +import json + +count = 0 +for filename in os.listdir('./data/pred'): + if filename.endswith('.json'): + with open('./data/pred/' + filename, 'r') as f: + data = json.load(f) + count += len(data) +print(count) \ No newline at end of file diff --git a/ml/pred/crawler.py b/ml/pred/crawler.py new file mode 100644 index 0000000..53008d8 --- /dev/null +++ b/ml/pred/crawler.py @@ -0,0 +1,19 @@ +import os +import requests +import json +import time + +with open("./pred/2", "r") as fp: + raw = fp.readlines() + aids = [ int(x.strip()) for x in raw ] + +for aid in aids: + if os.path.exists(f"./data/pred/{aid}.json"): + continue + url = f"https://api.bunnyxt.com/tdd/v2/video/{aid}/record?last_count=5000" + r = requests.get(url) + data = r.json() + with open (f"./data/pred/{aid}.json", "w") as fp: + json.dump(data, fp, ensure_ascii=False, indent=4) + time.sleep(5) + print(aid) \ No newline at end of file diff --git a/ml/pred/dataset.py b/ml/pred/dataset.py new file mode 100644 index 0000000..9ed4846 --- /dev/null +++ b/ml/pred/dataset.py @@ -0,0 +1,178 @@ +import os +import json +import random +import bisect +import numpy as np +import pandas as pd +import torch +from torch.utils.data import Dataset +import datetime + +class VideoPlayDataset(Dataset): + def __init__(self, data_dir, publish_time_path, term='long', seed=42): + if seed is not None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + self.data_dir = 
data_dir + self.series_dict = self._load_and_process_data(publish_time_path) + self.valid_series = [s for s in self.series_dict.values() if len(s['abs_time']) > 1] + self.term = term + # Set time window based on term + self.time_window = 1000 * 24 * 3600 if term == 'long' else 7 * 24 * 3600 + MINUTE = 60 + HOUR = 3600 + DAY = 24 * HOUR + + if term == 'long': + self.feature_windows = [ + 1 * HOUR, + 6 * HOUR, + 1 *DAY, + 3 * DAY, + 7 * DAY, + 30 * DAY, + 100 * DAY + ] + else: + self.feature_windows = [ + ( 15 * MINUTE, 0 * MINUTE), + ( 40 * MINUTE, 0 * MINUTE), + ( 1 * HOUR, 0 * HOUR), + ( 2 * HOUR, 1 * HOUR), + ( 3 * HOUR, 2 * HOUR), + ( 3 * HOUR, 0 * HOUR), + #( 6 * HOUR, 3 * HOUR), + ( 6 * HOUR, 0 * HOUR), + (18 * HOUR, 12 * HOUR), + #( 1 * DAY, 6 * HOUR), + ( 1 * DAY, 0 * DAY), + #( 2 * DAY, 1 * DAY), + ( 3 * DAY, 0 * DAY), + #( 4 * DAY, 1 * DAY), + ( 7 * DAY, 0 * DAY) + ] + + def _extract_features(self, series, current_idx, target_idx): + current_time = series['abs_time'][current_idx] + current_play = series['play_count'][current_idx] + dt = datetime.datetime.fromtimestamp(current_time) + + if self.term == 'long': + time_features = [ + np.log2(max(current_time - series['create_time'], 1)) + ] + else: + time_features = [ + (dt.hour * 3600 + dt.minute * 60 + dt.second) / 86400, + (dt.weekday() * 24 + dt.hour) / 168, + np.log2(max(current_time - series['create_time'], 1)) + ] + + growth_features = [] + if self.term == 'long': + for window in self.feature_windows: + prev_time = current_time - window + prev_idx = self._get_nearest_value(series, prev_time, current_idx) + if prev_idx is not None: + time_diff = current_time - series['abs_time'][prev_idx] + play_diff = current_play - series['play_count'][prev_idx] + scaled_diff = play_diff / (time_diff / window) if time_diff > 0 else 0.0 + else: + scaled_diff = 0.0 + growth_features.append(np.log2(max(scaled_diff, 1))) + else: + for window_start, window_end in self.feature_windows: + prev_time_start = current_time - window_start + prev_time_end = current_time - window_end # window_end is typically 0 + prev_idx_start = self._get_nearest_value(series, prev_time_start, current_idx) + prev_idx_end = self._get_nearest_value(series, prev_time_end, current_idx) + if prev_idx_start is not None and prev_idx_end is not None: + time_diff = series['abs_time'][prev_idx_end] - series['abs_time'][prev_idx_start] + play_diff = series['play_count'][prev_idx_end] - series['play_count'][prev_idx_start] + scaled_diff = play_diff / (time_diff / (window_start - window_end)) if time_diff > 0 else 0.0 + else: + scaled_diff = 0.0 + growth_features.append(np.log2(max(scaled_diff, 1))) + + time_diff = series['abs_time'][target_idx] - current_time + return [np.log2(max(time_diff, 1))] + [np.log2(current_play + 1)] + growth_features + time_features + + def _load_and_process_data(self, publish_time_path): + publish_df = pd.read_csv(publish_time_path) + publish_df['published_at'] = pd.to_datetime(publish_df['published_at']) + publish_dict = dict(zip(publish_df['aid'], publish_df['published_at'])) + series_dict = {} + for filename in os.listdir(self.data_dir): + if not filename.endswith('.json'): + continue + with open(os.path.join(self.data_dir, filename), 'r') as f: + data = json.load(f) + if 'code' in data: + continue + for item in data: + aid = item['aid'] + published_time = pd.to_datetime(publish_dict[aid]).timestamp() + if aid not in series_dict: + series_dict[aid] = { + 'abs_time': [], + 'play_count': [], + 'create_time': published_time + } + 
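+                # Accumulate the raw snapshots per aid: 'added' timestamps and 'view' counts
+                # are appended below, while create_time comes from publish_time.csv, so every
+                # aid in the JSON data is assumed to have a row there (a missing aid would
+                # raise a KeyError in the publish_dict lookup above).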
series_dict[aid]['abs_time'].append(item['added']) + series_dict[aid]['play_count'].append(item['view']) + # Sort each series by absolute time + for aid in series_dict: + sorted_indices = sorted(range(len(series_dict[aid]['abs_time'])), + key=lambda k: series_dict[aid]['abs_time'][k]) + series_dict[aid]['abs_time'] = [series_dict[aid]['abs_time'][i] for i in sorted_indices] + series_dict[aid]['play_count'] = [series_dict[aid]['play_count'][i] for i in sorted_indices] + return series_dict + + def __len__(self): + return 100000 # Virtual length for sampling + + def _get_nearest_value(self, series, target_time, current_idx): + times = series['abs_time'] + pos = bisect.bisect_right(times, target_time, 0, current_idx + 1) + candidates = [] + if pos > 0: + candidates.append(pos - 1) + if pos <= current_idx: + candidates.append(pos) + if not candidates: + return None + closest_idx = min(candidates, key=lambda i: abs(times[i] - target_time)) + return closest_idx + + def __getitem__(self, _idx): + while True: + series = random.choice(self.valid_series) + if len(series['abs_time']) < 2: + continue + current_idx = random.randint(0, len(series['abs_time']) - 2) + current_time = series['abs_time'][current_idx] + max_target_time = current_time + self.time_window + candidate_indices = [] + for j in range(current_idx + 1, len(series['abs_time'])): + if series['abs_time'][j] > max_target_time: + break + candidate_indices.append(j) + if not candidate_indices: + continue + target_idx = random.choice(candidate_indices) + break + current_play = series['play_count'][current_idx] + target_play = series['play_count'][target_idx] + target_delta = max(target_play - current_play, 0) + return { + 'features': torch.FloatTensor(self._extract_features(series, current_idx, target_idx)), + 'target': torch.log2(torch.FloatTensor([target_delta]) + 1) + } + +def collate_fn(batch): + return { + 'features': torch.stack([x['features'] for x in batch]), + 'targets': torch.stack([x['target'] for x in batch]) + } \ No newline at end of file diff --git a/ml/pred/export_onnx.py b/ml/pred/export_onnx.py new file mode 100644 index 0000000..c7b4d59 --- /dev/null +++ b/ml/pred/export_onnx.py @@ -0,0 +1,28 @@ +import torch +import torch.onnx +from model import CompactPredictor + +def export_model(input_size, checkpoint_path, onnx_path): + model = CompactPredictor(input_size) + model.load_state_dict(torch.load(checkpoint_path)) + + dummy_input = torch.randn(1, input_size) + + model.eval() + + torch.onnx.export(model, # Model to be exported + dummy_input, # Model input + onnx_path, # Save path + export_params=True, # Whether to export model parameters + opset_version=11, # ONNX opset version + do_constant_folding=True, # Whether to perform constant folding optimization + input_names=['input'], # Input node name + output_names=['output'], # Output node name + dynamic_axes={'input': {0: 'batch_size'}, # Dynamic batch size + 'output': {0: 'batch_size'}}) + + print(f"ONNX model has been exported to: {onnx_path}") + +if __name__ == '__main__': + export_model(10, './pred/checkpoints/long_term.pt', 'long_term.onnx') + export_model(12, './pred/checkpoints/short_term.pt', 'short_term.onnx') diff --git a/ml/pred/inference.py b/ml/pred/inference.py new file mode 100644 index 0000000..cadb90f --- /dev/null +++ b/ml/pred/inference.py @@ -0,0 +1,32 @@ +import datetime +import numpy as np +from model import CompactPredictor +import torch + +def main(): + model = CompactPredictor(10).to('cpu', dtype=torch.float32) + 
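+    # Sanity-check loop for the long-term model: the checkpoint path and the hard-coded
+    # feature values below are example inputs (the 10 features appear to mirror
+    # VideoPlayDataset._extract_features with term='long'). Each iteration advances the
+    # simulated clock by 2 hours (hour = i / 0.5) and prints the predicted cumulative
+    # views plus the increment over the previous step.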
model.load_state_dict(torch.load('./pred/checkpoints/long_term.pt')) + model.eval() + # inference + initial = 997029 + last = initial + start_time = '2025-03-17 00:13:17' + for i in range(1, 120): + hour = i / 0.5 + sec = hour * 3600 + time_d = np.log2(sec) + data = [time_d, np.log2(initial+1), # time_delta, current_views + 6.111542, 8.404707, 10.071566, 11.55888, 12.457823,# grows_feat + 0.009225, 0.001318, 28.001814# time_feat + ] + np_arr = np.array([data]) + tensor = torch.from_numpy(np_arr).to('cpu', dtype=torch.float32) + output = model(tensor) + num = output.detach().numpy()[0][0] + views_pred = int(np.exp2(num)) + initial + current_time = datetime.datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S') + datetime.timedelta(hours=hour) + print(current_time.strftime('%m-%d %H:%M:%S'), views_pred, views_pred - last) + last = views_pred + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/ml/pred/model.py b/ml/pred/model.py new file mode 100644 index 0000000..191e3eb --- /dev/null +++ b/ml/pred/model.py @@ -0,0 +1,23 @@ +import torch.nn as nn + +class CompactPredictor(nn.Module): + def __init__(self, input_size): + super().__init__() + self.net = nn.Sequential( + nn.BatchNorm1d(input_size), + nn.Linear(input_size, 256), + nn.LeakyReLU(0.1), + nn.Dropout(0.3), + nn.Linear(256, 128), + nn.LeakyReLU(0.1), + nn.Dropout(0.2), + nn.Linear(128, 64), + nn.Tanh(), # Use Tanh to limit the output range + nn.Linear(64, 1) + ) + # Initialize the last layer to values close to zero + nn.init.uniform_(self.net[-1].weight, -0.01, 0.01) + nn.init.constant_(self.net[-1].bias, 0.0) + + def forward(self, x): + return self.net(x) diff --git a/ml/pred/train.py b/ml/pred/train.py new file mode 100644 index 0000000..b162163 --- /dev/null +++ b/ml/pred/train.py @@ -0,0 +1,114 @@ +import random +import time +import numpy as np +from torch.utils.tensorboard import SummaryWriter +from torch.utils.data import DataLoader +import torch +from dataset import VideoPlayDataset, collate_fn +from pred.model import CompactPredictor + +def asymmetricHuberLoss(delta=1.0, beta=1.3): + """ + 创建一个可调用的非对称 Huber 损失函数。 + + 参数: + delta (float): Huber 损失的 delta 参数。 + beta (float): 控制负误差惩罚的系数。 + + 返回: + callable: 可调用的损失函数。 + """ + def loss_function(input, target): + error = input - target + abs_error = torch.abs(error) + + linear_loss = abs_error - 0.5 * delta + quadratic_loss = 0.5 * error**2 + + loss = torch.where(abs_error < delta, quadratic_loss, linear_loss) + loss = torch.where(error < 0, beta * loss, loss) + + return torch.mean(loss) + + return loss_function + +def train(model, dataloader, device, epochs=100): + writer = SummaryWriter(f'./pred/runs/play_predictor_{time.strftime("%Y%m%d_%H%M")}') + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01) + scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-3, + total_steps=len(dataloader)*30) + # Huber loss + criterion = asymmetricHuberLoss(delta=1.0, beta=2.1) + + model.train() + global_step = 0 + for epoch in range(epochs): + total_loss = 0.0 + for batch_idx, batch in enumerate(dataloader): + features = batch['features'].to(device) + targets = batch['targets'].to(device) + + optimizer.zero_grad() + outputs = model(features) + loss = criterion(outputs, targets) + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + scheduler.step() + + total_loss += loss.item() + global_step += 1 + + if global_step % 100 == 0: + writer.add_scalar('Loss/train', loss.item(), global_step) + 
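+                # Also log the OneCycleLR learning rate so the schedule can be inspected in
+                # TensorBoard alongside the training loss.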
writer.add_scalar('LR', scheduler.get_last_lr()[0], global_step) + if batch_idx % 50 == 0: + # Monitor gradients + grad_norms = [ + torch.norm(p.grad).item() + for p in model.parameters() if p.grad is not None + ] + writer.add_scalar('Grad/Norm', sum(grad_norms)/len(grad_norms), global_step) + + # Monitor parameter values + param_means = [torch.mean(p.data).item() for p in model.parameters()] + writer.add_scalar('Params/Mean', sum(param_means)/len(param_means), global_step) + + samples_count = len(targets) + good = 0 + for r in range(samples_count): + r = random.randint(0, samples_count-1) + t = float(torch.exp2(targets[r])) - 1 + o = float(torch.exp2(outputs[r])) - 1 + d = features[r].cpu().numpy()[0] + speed = np.exp2(features[r].cpu().numpy()[8]) / 6 + time_diff = np.exp2(d) / 3600 + inc = speed * time_diff + model_error = abs(t - o) + reg_error = abs(inc - t) + if model_error < reg_error: + good += 1 + #print(f"{t:07.1f} | {o:07.1f} | {d:07.1f} | {inc:07.1f} | {good/samples_count*100:.1f}%") + writer.add_scalar('Train/WinRate', good/samples_count, global_step) + + print(f"Epoch {epoch+1} | Avg Loss: {total_loss/len(dataloader):.4f}") + + writer.close() + return model + +if __name__ == "__main__": + device = 'mps' + + # Initialize dataset and model + dataset = VideoPlayDataset('./data/pred', './data/pred/publish_time.csv', 'short') + dataloader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn) + + # Get feature dimension + sample = next(iter(dataloader)) + input_size = sample['features'].shape[1] + + model = CompactPredictor(input_size).to(device) + trained_model = train(model, dataloader, device, epochs=18) + + # Save model + torch.save(trained_model.state_dict(), f"./pred/checkpoints/model_{time.strftime('%Y%m%d_%H%M')}.pt") diff --git a/packages/backend/database.ts b/packages/backend/database.ts new file mode 100644 index 0000000..7eccea3 --- /dev/null +++ b/packages/backend/database.ts @@ -0,0 +1,20 @@ +import { type Client, Pool } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { postgresConfig } from "@core/db/pgConfig.ts"; +import { createMiddleware } from "hono/factory"; + +const pool = new Pool(postgresConfig, 4); + +export const db = pool; + +export const dbMiddleware = createMiddleware(async (c, next) => { + const connection = await pool.connect(); + c.set("db", connection); + await next(); + connection.release(); +}); + +declare module "hono" { + interface ContextVariableMap { + db: Client; + } +} diff --git a/packages/backend/deno.json b/packages/backend/deno.json new file mode 100644 index 0000000..54f0456 --- /dev/null +++ b/packages/backend/deno.json @@ -0,0 +1,17 @@ +{ + "name": "@cvsa/backend", + "imports": { + "hono": "jsr:@hono/hono@^4.7.5", + "zod": "npm:zod", + "yup": "npm:yup" + }, + "tasks": { + "dev": "deno serve --env-file=.env --allow-env --allow-net --watch main.ts", + "start": "deno serve --env-file=.env --allow-env --allow-net --host 127.0.0.1 main.ts" + }, + "compilerOptions": { + "jsx": "precompile", + "jsxImportSource": "hono/jsx" + }, + "exports": "./main.ts" +} diff --git a/packages/backend/main.ts b/packages/backend/main.ts new file mode 100644 index 0000000..39fdd9b --- /dev/null +++ b/packages/backend/main.ts @@ -0,0 +1,20 @@ +import { Hono } from "hono"; +import { dbMiddleware } from "./database.ts"; +import { rootHandler } from "./root.ts"; +import { getSnapshotsHanlder } from "./snapshots.ts"; + +export const app = new Hono(); + +app.use('/video/*', dbMiddleware); + +app.get("/", ...rootHandler); + 
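+// The root route returns project metadata plus a singer entry (birthday match, special,
+// or randomly picked); routes under /video/* additionally receive a pooled Postgres
+// connection through dbMiddleware.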
+app.get('/video/:id/snapshots', ...getSnapshotsHanlder); + +const fetch = app.fetch; + +export default { + fetch, +} satisfies Deno.ServeDefaultExport; + +export const VERSION = "0.2.0"; \ No newline at end of file diff --git a/packages/backend/root.ts b/packages/backend/root.ts new file mode 100644 index 0000000..3cb15fd --- /dev/null +++ b/packages/backend/root.ts @@ -0,0 +1,31 @@ +import { getSingerForBirthday, pickSinger, pickSpecialSinger, type Singer } from "./singers.ts"; +import { VERSION } from "./main.ts"; +import { createHandlers } from "./utils.ts"; + +export const rootHandler = createHandlers((c) => { + let singer: Singer | Singer[] | null = null; + const shouldShowSpecialSinger = Math.random() < 0.016; + if (getSingerForBirthday().length !== 0){ + singer = getSingerForBirthday(); + for (const s of singer) { + delete s.birthday; + s.message = `祝${s.name}生日快乐~` + } + } + else if (shouldShowSpecialSinger) { + singer = pickSpecialSinger(); + } + else { + singer = pickSinger(); + } + return c.json({ + "project": { + "name": "中V档案馆", + "motto": "一起唱吧,心中的歌!" + }, + "status": 200, + "version": VERSION, + "time": Date.now(), + "singer": singer + }) +}) \ No newline at end of file diff --git a/packages/backend/singers.ts b/packages/backend/singers.ts new file mode 100644 index 0000000..47963bb --- /dev/null +++ b/packages/backend/singers.ts @@ -0,0 +1,103 @@ +export const singers = [ + { + "name": "洛天依", + "color": "#66CCFF", + "birthday": "0712", + }, + { + "name": "言和", + "color": "#00FFCC", + "birthday": "0711", + }, + { + "name": "乐正绫", + "color": "#EE0000", + "birthday": "0412", + }, + { + "name": "乐正龙牙", + "color": "#006666", + "birthday": "1002", + }, + { + "name": "徵羽摩柯", + "color": "#0080FF", + "birthday": "1210", + }, + { + "name": "墨清弦", + "color": "#FFFF00", + "birthday": "0520", + }, + { + "name": "星尘", + "color": "#9999FF", + "birthday": "0812", + }, + { + "name": "心华", + "color": "#EE82EE", + "birthday": "0210", + }, + { + "name": "海伊", + "color": "#3399FF", + "birthday": "0722", + }, + { + "name": "苍穹", + "color": "#8BC0B5", + "birthday": "0520", + }, + { + "name": "赤羽", + "color": "#FF4004", + "birthday": "1126", + }, + { + "name": "诗岸", + "color": "#F6BE72", + "birthday": "0119", + }, + { + "name": "牧心", + "color": "#2A2859", + "birthday": "0807", + }, +]; + +export interface Singer { + name: string; + color?: string; + birthday?: string; + message?: string; +} + +export const specialSingers = [ + { + "name": "雅音宫羽", + "message": "你是我最真模样,从来不曾遗忘。", + }, + { + "name": "初音未来", + "message": "初始之音,响彻未来!", + }, +]; + +export const pickSinger = () => { + const index = Math.floor(Math.random() * singers.length); + return singers[index]; +}; + +export const pickSpecialSinger = () => { + const index = Math.floor(Math.random() * specialSingers.length); + return specialSingers[index]; +}; + +export const getSingerForBirthday = (): Singer[] => { + const today = new Date(); + const month = String(today.getMonth() + 1).padStart(2, "0"); + const day = String(today.getDate()).padStart(2, "0"); + const datestring = `${month}${day}`; + return singers.filter((singer) => singer.birthday === datestring); +}; diff --git a/packages/backend/snapshots.ts b/packages/backend/snapshots.ts new file mode 100644 index 0000000..1ee3216 --- /dev/null +++ b/packages/backend/snapshots.ts @@ -0,0 +1,89 @@ +import type { Context } from "hono"; +import { createHandlers } from "./utils.ts"; +import type { BlankEnv, BlankInput } from "hono/types"; +import { getVideoSnapshots, getVideoSnapshotsByBV } from 
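+// GET /video/:id/snapshots: the id may be a positive integer, an "av"-prefixed aid, or a
+// "BV" string; the ps/pn/offset/reverse query parameters control paging, as validated below.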
"@core/db/videoSnapshot.ts"; +import type { VideoSnapshotType } from "@core/db/schema.d.ts"; +import { boolean, mixed, number, object, ValidationError } from "yup"; + +const SnapshotQueryParamsSchema = object({ + ps: number().optional().positive(), + pn: number().optional().positive(), + offset: number().optional().positive(), + reverse: boolean().optional(), +}); + +const idSchema = mixed().test( + "is-valid-id", + 'id must be a string starting with "av" followed by digits, or "BV" followed by 10 alphanumeric characters, or a positive integer', + (value) => { + if (typeof value === "number") { + return Number.isInteger(value) && value > 0; + } + + if (typeof value === "string") { + if (value.startsWith("av")) { + const digitsOnly = value.substring(2); + return /^\d+$/.test(digitsOnly) && digitsOnly.length > 0; + } + + if (value.startsWith("BV")) { + const remainingChars = value.substring(2); + return /^[a-zA-Z0-9]{10}$/.test(remainingChars); + } + } + + return false; + }, +); + +type ContextType = Context; +export const getSnapshotsHanlder = createHandlers(async (c: ContextType) => { + const client = c.get("db"); + + try { + const idParam = await idSchema.validate(c.req.param("id")); + let videoId: number | string = idParam as string | number; + if (typeof videoId === "string" && videoId.startsWith("av")) { + videoId = videoId.slice(2); + } + const queryParams = await SnapshotQueryParamsSchema.validate(c.req.query()); + const { ps, pn, offset, reverse = false } = queryParams; + + let limit = 1000; + if (ps && ps > 1) { + limit = ps; + } + + let pageOrOffset = 1; + let mode: "page" | "offset" = "page"; + + if (pn && pn > 1) { + pageOrOffset = pn; + mode = "page"; + } else if (offset && offset > 1) { + pageOrOffset = offset; + mode = "offset"; + } + + let result: VideoSnapshotType[]; + + if (typeof videoId === "number") { + result = await getVideoSnapshots(client, videoId, limit, pageOrOffset, reverse, mode); + } else { + result = await getVideoSnapshotsByBV(client, videoId, limit, pageOrOffset, reverse, mode); + } + + const rows = result.map((row) => ({ + ...row, + aid: Number(row.aid), + })); + + return c.json(rows); + } catch (e) { + if (e instanceof ValidationError) { + return c.json({ message: "Invalid query parameters", errors: e.errors }, 400); + } else { + return c.json({ message: "Unhandled error", error: e }, 500); + } + } +}); diff --git a/packages/backend/utils.ts b/packages/backend/utils.ts new file mode 100644 index 0000000..9d06a7c --- /dev/null +++ b/packages/backend/utils.ts @@ -0,0 +1,5 @@ +import { createFactory } from 'hono/factory' + +const factory = createFactory(); + +export const createHandlers = factory.createHandlers; \ No newline at end of file diff --git a/lib/db/pgConfig.ts b/packages/core/db/pgConfig.ts similarity index 65% rename from lib/db/pgConfig.ts rename to packages/core/db/pgConfig.ts index 4c34ef4..cebba63 100644 --- a/lib/db/pgConfig.ts +++ b/packages/core/db/pgConfig.ts @@ -3,11 +3,12 @@ const requiredEnvVars = ["DB_HOST", "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_POR const unsetVars = requiredEnvVars.filter((key) => Deno.env.get(key) === undefined); if (unsetVars.length > 0) { - throw new Error(`Missing required environment variables: ${unsetVars.join(", ")}`); + throw new Error(`Missing required environment variables: ${unsetVars.join(", ")}`); } const databaseHost = Deno.env.get("DB_HOST")!; const databaseName = Deno.env.get("DB_NAME")!; +const databaseNameCred = Deno.env.get("DB_NAME_CRED")!; const databaseUser = Deno.env.get("DB_USER")!; const 
databasePassword = Deno.env.get("DB_PASSWORD")!; const databasePort = Deno.env.get("DB_PORT")!; @@ -18,4 +19,12 @@ export const postgresConfig = { database: databaseName, user: databaseUser, password: databasePassword, -}; \ No newline at end of file +}; + +export const postgresConfigCred = { + hostname: databaseHost, + port: parseInt(databasePort), + database: databaseNameCred, + user: databaseUser, + password: databasePassword, +}; diff --git a/packages/core/db/schema.d.ts b/packages/core/db/schema.d.ts new file mode 100644 index 0000000..a9a7296 --- /dev/null +++ b/packages/core/db/schema.d.ts @@ -0,0 +1,55 @@ +export interface AllDataType { + id: number; + aid: bigint; + bvid: string | null; + description: string | null; + uid: number | null; + tags: string | null; + title: string | null; + published_at: string | null; + duration: number; + created_at: string | null; +} + +export interface BiliUserType { + id: number; + uid: number; + username: string; + desc: string; + fans: number; +} + +export interface VideoSnapshotType { + id: number; + created_at: string; + views: number; + coins: number; + likes: number; + favorites: number; + shares: number; + danmakus: number; + aid: bigint; + replies: number; +} + +export interface LatestSnapshotType { + aid: bigint; + time: number; + views: number; + danmakus: number; + replies: number; + likes: number; + coins: number; + shares: number; + favorites: number; +} + +export interface SnapshotScheduleType { + id: number; + aid: bigint; + type?: string; + created_at: string; + started_at?: string; + finished_at?: string; + status: string; +} diff --git a/packages/core/db/videoSnapshot.ts b/packages/core/db/videoSnapshot.ts new file mode 100644 index 0000000..68f07ec --- /dev/null +++ b/packages/core/db/videoSnapshot.ts @@ -0,0 +1,33 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { VideoSnapshotType } from "@core/db/schema.d.ts"; + +export async function getVideoSnapshots(client: Client, aid: number, limit: number, pageOrOffset: number, reverse: boolean, mode: 'page' | 'offset' = 'page') { + const offset = mode === 'page' ? (pageOrOffset - 1) * limit : pageOrOffset; + const order = reverse ? 'ASC' : 'DESC'; + const query = ` + SELECT * + FROM video_snapshot + WHERE aid = $1 + ORDER BY created_at ${order} + LIMIT $2 + OFFSET $3 + `; + const queryResult = await client.queryObject(query, [aid, limit, offset]); + return queryResult.rows; +} + +export async function getVideoSnapshotsByBV(client: Client, bv: string, limit: number, pageOrOffset: number, reverse: boolean, mode: 'page' | 'offset' = 'page') { + const offset = mode === 'page' ? (pageOrOffset - 1) * limit : pageOrOffset; + const order = reverse ? 
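+	// Only 'ASC'/'DESC' (derived from the reverse flag) is interpolated into the SQL below;
+	// all user-supplied values are passed as bound parameters.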
'ASC' : 'DESC'; + const query = ` + SELECT vs.* + FROM video_snapshot vs + JOIN bilibili_metadata bm ON vs.aid = bm.aid + WHERE bm.bvid = $1 + ORDER BY vs.created_at ${order} + LIMIT $2 + OFFSET $3 + ` + const queryResult = await client.queryObject(query, [bv, limit, offset]); + return queryResult.rows; +} \ No newline at end of file diff --git a/packages/core/deno.json b/packages/core/deno.json new file mode 100644 index 0000000..e69de29 diff --git a/packages/crawler/db/allData.ts b/packages/crawler/db/allData.ts new file mode 100644 index 0000000..461cb69 --- /dev/null +++ b/packages/crawler/db/allData.ts @@ -0,0 +1,87 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { AllDataType, BiliUserType } from "db/schema.d.ts"; +import Akari from "ml/akari.ts"; + +export async function videoExistsInAllData(client: Client, aid: number) { + return await client.queryObject<{ exists: boolean }>( + `SELECT EXISTS(SELECT 1 FROM bilibili_metadata WHERE aid = $1)`, + [aid], + ) + .then((result) => result.rows[0].exists); +} + +export async function userExistsInBiliUsers(client: Client, uid: number) { + return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM bilibili_user WHERE uid = $1)`, [ + uid, + ]); +} + +export async function getUnlabelledVideos(client: Client) { + const queryResult = await client.queryObject<{ aid: number }>( + `SELECT a.aid FROM bilibili_metadata a LEFT JOIN labelling_result l ON a.aid = l.aid WHERE l.aid IS NULL`, + ); + return queryResult.rows.map((row) => row.aid); +} + +export async function insertVideoLabel(client: Client, aid: number, label: number) { + return await client.queryObject( + `INSERT INTO labelling_result (aid, label, model_version) VALUES ($1, $2, $3) ON CONFLICT (aid, model_version) DO NOTHING`, + [aid, label, Akari.getModelVersion()], + ); +} + +export async function getVideoInfoFromAllData(client: Client, aid: number) { + const queryResult = await client.queryObject( + `SELECT * FROM bilibili_metadata WHERE aid = $1`, + [aid], + ); + const row = queryResult.rows[0]; + let authorInfo = ""; + if (row.uid && await userExistsInBiliUsers(client, row.uid)) { + const q = await client.queryObject( + `SELECT * FROM bilibili_user WHERE uid = $1`, + [row.uid], + ); + const userRow = q.rows[0]; + if (userRow) { + authorInfo = userRow.desc; + } + } + return { + title: row.title, + description: row.description, + tags: row.tags, + author_info: authorInfo, + }; +} + +export async function getUnArchivedBiliUsers(client: Client) { + const queryResult = await client.queryObject<{ uid: number }>( + ` + SELECT ad.uid + FROM bilibili_metadata ad + LEFT JOIN bilibili_user bu ON ad.uid = bu.uid + WHERE bu.uid IS NULL; + `, + [], + ); + const rows = queryResult.rows; + return rows.map((row) => row.uid); +} + +export async function setBiliVideoStatus(client: Client, aid: number, status: number) { + return await client.queryObject( + `UPDATE bilibili_metadata SET status = $1 WHERE aid = $2`, + [status, aid], + ); +} + +export async function getBiliVideoStatus(client: Client, aid: number) { + const queryResult = await client.queryObject<{ status: number }>( + `SELECT status FROM bilibili_metadata WHERE aid = $1`, + [aid], + ); + const rows = queryResult.rows; + if (rows.length === 0) return 0; + return rows[0].status; +} diff --git a/packages/crawler/db/init.ts b/packages/crawler/db/init.ts new file mode 100644 index 0000000..33bc3c6 --- /dev/null +++ b/packages/crawler/db/init.ts @@ -0,0 +1,6 @@ +import { Pool } from 
"https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { postgresConfig } from "@core/db/pgConfig.ts"; + +const pool = new Pool(postgresConfig, 12); + +export const db = pool; diff --git a/packages/crawler/db/redis.ts b/packages/crawler/db/redis.ts new file mode 100644 index 0000000..51ac02c --- /dev/null +++ b/packages/crawler/db/redis.ts @@ -0,0 +1,3 @@ +import { Redis } from "ioredis"; + +export const redis = new Redis({ maxRetriesPerRequest: null }); diff --git a/packages/crawler/db/schema.d.ts b/packages/crawler/db/schema.d.ts new file mode 100644 index 0000000..d030308 --- /dev/null +++ b/packages/crawler/db/schema.d.ts @@ -0,0 +1,55 @@ +export interface AllDataType { + id: number; + aid: number; + bvid: string | null; + description: string | null; + uid: number | null; + tags: string | null; + title: string | null; + published_at: string | null; + duration: number; + created_at: string | null; +} + +export interface BiliUserType { + id: number; + uid: number; + username: string; + desc: string; + fans: number; +} + +export interface VideoSnapshotType { + id: number; + created_at: string; + views: number; + coins: number; + likes: number; + favorites: number; + shares: number; + danmakus: number; + aid: bigint; + replies: number; +} + +export interface LatestSnapshotType { + aid: number; + time: number; + views: number; + danmakus: number; + replies: number; + likes: number; + coins: number; + shares: number; + favorites: number; +} + +export interface SnapshotScheduleType { + id: number; + aid: number; + type?: string; + created_at: string; + started_at?: string; + finished_at?: string; + status: string; +} diff --git a/packages/crawler/db/snapshot.ts b/packages/crawler/db/snapshot.ts new file mode 100644 index 0000000..ef8009d --- /dev/null +++ b/packages/crawler/db/snapshot.ts @@ -0,0 +1,44 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { LatestSnapshotType } from "db/schema.d.ts"; + +export async function getVideosNearMilestone(client: Client) { + const queryResult = await client.queryObject(` + SELECT ls.* + FROM latest_video_snapshot ls + INNER JOIN + songs s ON ls.aid = s.aid + AND s.deleted = false + WHERE + s.deleted = false AND + (views >= 90000 AND views < 100000) OR + (views >= 900000 AND views < 1000000) OR + (views >= 9900000 AND views < 10000000) + `); + return queryResult.rows.map((row) => { + return { + ...row, + aid: Number(row.aid), + }; + }); +} + +export async function getLatestVideoSnapshot(client: Client, aid: number): Promise { + const queryResult = await client.queryObject( + ` + SELECT * + FROM latest_video_snapshot + WHERE aid = $1 + `, + [aid], + ); + if (queryResult.rows.length === 0) { + return null; + } + return queryResult.rows.map((row) => { + return { + ...row, + aid: Number(row.aid), + time: new Date(row.time).getTime(), + }; + })[0]; +} diff --git a/packages/crawler/db/snapshotSchedule.ts b/packages/crawler/db/snapshotSchedule.ts new file mode 100644 index 0000000..b8aec48 --- /dev/null +++ b/packages/crawler/db/snapshotSchedule.ts @@ -0,0 +1,320 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { formatTimestampToPsql } from "utils/formatTimestampToPostgre.ts"; +import { SnapshotScheduleType } from "./schema.d.ts"; +import logger from "log/logger.ts"; +import { MINUTE } from "$std/datetime/constants.ts"; +import { redis } from "db/redis.ts"; +import { Redis } from "ioredis"; + +const REDIS_KEY = "cvsa:snapshot_window_counts"; + +function getCurrentWindowIndex(): number { + const 
now = new Date(); + const minutesSinceMidnight = now.getHours() * 60 + now.getMinutes(); + const currentWindow = Math.floor(minutesSinceMidnight / 5); + return currentWindow; +} + +export async function refreshSnapshotWindowCounts(client: Client, redisClient: Redis) { + const now = new Date(); + const startTime = now.getTime(); + + const result = await client.queryObject<{ window_start: Date; count: number }>` + SELECT + date_trunc('hour', started_at) + + (EXTRACT(minute FROM started_at)::int / 5 * INTERVAL '5 minutes') AS window_start, + COUNT(*) AS count + FROM snapshot_schedule + WHERE started_at >= NOW() AND status = 'pending' AND started_at <= NOW() + INTERVAL '10 days' + GROUP BY 1 + ORDER BY window_start + `; + + await redisClient.del(REDIS_KEY); + + const currentWindow = getCurrentWindowIndex(); + + for (const row of result.rows) { + const targetOffset = Math.floor((row.window_start.getTime() - startTime) / (5 * MINUTE)); + const offset = currentWindow + targetOffset; + if (offset >= 0) { + await redisClient.hset(REDIS_KEY, offset.toString(), Number(row.count)); + } + } +} + +export async function initSnapshotWindowCounts(client: Client, redisClient: Redis) { + await refreshSnapshotWindowCounts(client, redisClient); + setInterval(async () => { + await refreshSnapshotWindowCounts(client, redisClient); + }, 5 * MINUTE); +} + +async function getWindowCount(redisClient: Redis, offset: number): Promise { + const count = await redisClient.hget(REDIS_KEY, offset.toString()); + return count ? parseInt(count, 10) : 0; +} + +export async function snapshotScheduleExists(client: Client, id: number) { + const res = await client.queryObject<{ id: number }>( + `SELECT id FROM snapshot_schedule WHERE id = $1`, + [id], + ); + return res.rows.length > 0; +} + +export async function videoHasActiveSchedule(client: Client, aid: number) { + const res = await client.queryObject<{ status: string }>( + `SELECT status FROM snapshot_schedule WHERE aid = $1 AND (status = 'pending' OR status = 'processing')`, + [aid], + ); + return res.rows.length > 0; +} + +export async function videoHasProcessingSchedule(client: Client, aid: number) { + const res = await client.queryObject<{ status: string }>( + `SELECT status FROM snapshot_schedule WHERE aid = $1 AND status = 'processing'`, + [aid], + ); + return res.rows.length > 0; +} + +export async function bulkGetVideosWithoutProcessingSchedules(client: Client, aids: number[]) { + const res = await client.queryObject<{ aid: number }>( + `SELECT aid FROM snapshot_schedule WHERE aid = ANY($1) AND status != 'processing' GROUP BY aid`, + [aids], + ); + return res.rows.map((row) => row.aid); +} + +interface Snapshot { + created_at: number; + views: number; +} + +export async function findClosestSnapshot( + client: Client, + aid: number, + targetTime: Date, +): Promise { + const query = ` + SELECT created_at, views + FROM video_snapshot + WHERE aid = $1 + ORDER BY ABS(EXTRACT(EPOCH FROM (created_at - $2::timestamptz))) + LIMIT 1 + `; + const result = await client.queryObject<{ created_at: string; views: number }>( + query, + [aid, targetTime.toISOString()], + ); + if (result.rows.length === 0) return null; + const row = result.rows[0]; + return { + created_at: new Date(row.created_at).getTime(), + views: row.views, + }; +} + +export async function findSnapshotBefore( + client: Client, + aid: number, + targetTime: Date, +): Promise { + const query = ` + SELECT created_at, views + FROM video_snapshot + WHERE aid = $1 + AND created_at <= $2::timestamptz + ORDER BY created_at 
DESC + LIMIT 1 + `; + const result = await client.queryObject<{ created_at: string; views: number }>( + query, + [aid, targetTime.toISOString()], + ); + if (result.rows.length === 0) return null; + const row = result.rows[0]; + return { + created_at: new Date(row.created_at).getTime(), + views: row.views, + }; +} + +export async function hasAtLeast2Snapshots(client: Client, aid: number) { + const res = await client.queryObject<{ count: number }>( + `SELECT COUNT(*) FROM video_snapshot WHERE aid = $1`, + [aid], + ); + return res.rows[0].count >= 2; +} + +export async function getLatestSnapshot(client: Client, aid: number): Promise { + const res = await client.queryObject<{ created_at: string; views: number }>( + `SELECT created_at, views FROM video_snapshot WHERE aid = $1 ORDER BY created_at DESC LIMIT 1`, + [aid], + ); + if (res.rows.length === 0) return null; + const row = res.rows[0]; + return { + created_at: new Date(row.created_at).getTime(), + views: row.views, + }; +} + +/* + * Returns the number of snapshot schedules within the specified range. + * @param client The database client. + * @param start The start time of the range. (Timestamp in milliseconds) + * @param end The end time of the range. (Timestamp in milliseconds) + */ +export async function getSnapshotScheduleCountWithinRange(client: Client, start: number, end: number) { + const startTimeString = formatTimestampToPsql(start); + const endTimeString = formatTimestampToPsql(end); + const query = ` + SELECT COUNT(*) FROM snapshot_schedule + WHERE started_at BETWEEN $1 AND $2 + AND status = 'pending' + `; + const res = await client.queryObject<{ count: number }>(query, [startTimeString, endTimeString]); + return res.rows[0].count; +} + +/* + * Creates a new snapshot schedule record. + * @param client The database client. + * @param aid The aid of the video. + * @param targetTime Scheduled time for snapshot. 
(Timestamp in milliseconds) + */ +export async function scheduleSnapshot( + client: Client, + aid: number, + type: string, + targetTime: number, + force: boolean = false, +) { + if (await videoHasActiveSchedule(client, aid) && !force) return; + let adjustedTime = new Date(targetTime); + if (type !== "milestone" && type !== "new") { + adjustedTime = await adjustSnapshotTime(new Date(targetTime), 1000, redis); + } + logger.log(`Scheduled snapshot for ${aid} at ${adjustedTime.toISOString()}`, "mq", "fn:scheduleSnapshot"); + return client.queryObject( + `INSERT INTO snapshot_schedule (aid, type, started_at) VALUES ($1, $2, $3)`, + [aid, type, adjustedTime.toISOString()], + ); +} + +export async function bulkScheduleSnapshot( + client: Client, + aids: number[], + type: string, + targetTime: number, + force: boolean = false, +) { + for (const aid of aids) { + await scheduleSnapshot(client, aid, type, targetTime, force); + } +} + +export async function adjustSnapshotTime( + expectedStartTime: Date, + allowedCounts: number = 1000, + redisClient: Redis, +): Promise { + const currentWindow = getCurrentWindowIndex(); + const targetOffset = Math.floor((expectedStartTime.getTime() - Date.now()) / (5 * MINUTE)) - 6; + + const initialOffset = currentWindow + Math.max(targetOffset, 0); + + let timePerIteration = 0; + const MAX_ITERATIONS = 2880; + let iters = 0; + const t = performance.now(); + for (let i = initialOffset; i < MAX_ITERATIONS; i++) { + iters++; + const offset = i; + const count = await getWindowCount(redisClient, offset); + + if (count < allowedCounts) { + await redisClient.hincrby(REDIS_KEY, offset.toString(), 1); + + const startPoint = new Date(); + startPoint.setHours(0, 0, 0, 0); + const startTime = startPoint.getTime(); + const windowStart = startTime + offset * 5 * MINUTE; + const randomDelay = Math.floor(Math.random() * 5 * MINUTE); + const delayedDate = new Date(windowStart + randomDelay); + const now = new Date(); + + if (delayedDate.getTime() < now.getTime()) { + const elapsed = performance.now() - t; + timePerIteration = elapsed / (i + 1); + logger.log(`${timePerIteration.toFixed(3)}ms * ${iters} iterations`, "perf", "fn:adjustSnapshotTime"); + return now; + } + const elapsed = performance.now() - t; + timePerIteration = elapsed / (i + 1); + logger.log(`${timePerIteration.toFixed(3)}ms * ${iters} iterations`, "perf", "fn:adjustSnapshotTime"); + return delayedDate; + } + } + const elapsed = performance.now() - t; + timePerIteration = elapsed / MAX_ITERATIONS; + logger.log(`${timePerIteration.toFixed(3)}ms * ${MAX_ITERATIONS} iterations`, "perf", "fn:adjustSnapshotTime"); + return expectedStartTime; +} + +export async function getSnapshotsInNextSecond(client: Client) { + const query = ` + SELECT * + FROM snapshot_schedule + WHERE started_at <= NOW() + INTERVAL '1 seconds' AND status = 'pending' AND type != 'normal' + ORDER BY + CASE + WHEN type = 'milestone' THEN 0 + ELSE 1 + END, + started_at + LIMIT 10; + `; + const res = await client.queryObject(query, []); + return res.rows; +} + +export async function getBulkSnapshotsInNextSecond(client: Client) { + const query = ` + SELECT * + FROM snapshot_schedule + WHERE started_at <= NOW() + INTERVAL '15 seconds' AND status = 'pending' AND type = 'normal' + ORDER BY started_at + LIMIT 1000; + `; + const res = await client.queryObject(query, []); + return res.rows; +} + +export async function setSnapshotStatus(client: Client, id: number, status: string) { + return await client.queryObject( + `UPDATE snapshot_schedule SET status = $2 WHERE 
id = $1`, + [id, status], + ); +} + +export async function bulkSetSnapshotStatus(client: Client, ids: number[], status: string) { + return await client.queryObject( + `UPDATE snapshot_schedule SET status = $2 WHERE id = ANY($1)`, + [ids, status], + ); +} + +export async function getVideosWithoutActiveSnapshotSchedule(client: Client) { + const query: string = ` + SELECT s.aid + FROM songs s + LEFT JOIN snapshot_schedule ss ON s.aid = ss.aid AND (ss.status = 'pending' OR ss.status = 'processing') + WHERE ss.aid IS NULL + `; + const res = await client.queryObject<{ aid: number }>(query, []); + return res.rows.map((r) => Number(r.aid)); +} diff --git a/packages/crawler/db/songs.ts b/packages/crawler/db/songs.ts new file mode 100644 index 0000000..1bfa002 --- /dev/null +++ b/packages/crawler/db/songs.ts @@ -0,0 +1,45 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { parseTimestampFromPsql } from "utils/formatTimestampToPostgre.ts"; + +export async function getNotCollectedSongs(client: Client) { + const queryResult = await client.queryObject<{ aid: number }>(` + SELECT lr.aid + FROM labelling_result lr + WHERE lr.label != 0 + AND NOT EXISTS ( + SELECT 1 + FROM songs s + WHERE s.aid = lr.aid + ); + `); + return queryResult.rows.map((row) => row.aid); +} + +export async function aidExistsInSongs(client: Client, aid: number) { + const queryResult = await client.queryObject<{ exists: boolean }>( + ` + SELECT EXISTS ( + SELECT 1 + FROM songs + WHERE aid = $1 + ); + `, + [aid], + ); + return queryResult.rows[0].exists; +} + +export async function getSongsPublihsedAt(client: Client, aid: number) { + const queryResult = await client.queryObject<{ published_at: string }>( + ` + SELECT published_at + FROM songs + WHERE aid = $1; + `, + [aid], + ); + if (queryResult.rows.length === 0) { + return null; + } + return parseTimestampFromPsql(queryResult.rows[0].published_at); +} diff --git a/packages/crawler/deno.json b/packages/crawler/deno.json new file mode 100644 index 0000000..7c10eae --- /dev/null +++ b/packages/crawler/deno.json @@ -0,0 +1,43 @@ +{ + "name": "@cvsa/crawler", + "tasks": { + "crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts", + "crawl-bili-aids": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/fetchAids.ts", + "check": "deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx", + "manifest": "deno task cli manifest $(pwd)", + "start": "deno run -A --watch=static/,routes/ dev.ts", + "build": "deno run -A dev.ts build", + "preview": "deno run -A main.ts", + "worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write --allow-run ./src/worker.ts", + "worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts", + "adder": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts", + "bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts", + "all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'", + "test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run" + }, + "lint": { + "rules": { + "tags": ["recommended"] + } + }, + "imports": { + "@std/assert": "jsr:@std/assert@1", + "$std/": "https://deno.land/std@0.216.0/", + "@huggingface/transformers": 
"npm:@huggingface/transformers@3.0.0", + "bullmq": "npm:bullmq", + "mq/": "./mq/", + "db/": "./db/", + "log/": "./log/", + "net/": "./net/", + "ml/": "./ml/", + "utils/": "./utils/", + "ioredis": "npm:ioredis", + "@bull-board/api": "npm:@bull-board/api", + "@bull-board/express": "npm:@bull-board/express", + "express": "npm:express", + "src/": "./src/", + "onnxruntime": "npm:onnxruntime-node@1.19.2", + "chalk": "npm:chalk" + }, + "exports": "./main.ts" +} diff --git a/lib/log/logger.ts b/packages/crawler/log/logger.ts similarity index 91% rename from lib/log/logger.ts rename to packages/crawler/log/logger.ts index aabbb9d..10a741a 100644 --- a/lib/log/logger.ts +++ b/packages/crawler/log/logger.ts @@ -1,6 +1,6 @@ import winston, { format, transports } from "npm:winston"; import { TransformableInfo } from "npm:logform"; -import chalk from "npm:chalk"; +import chalk from "chalk"; const customFormat = format.printf((info: TransformableInfo) => { const { timestamp, level, message, service, codePath, error } = info; @@ -52,7 +52,7 @@ const createTransport = (level: string, filename: string) => { }); }; -const verboseLogPath = Deno.env.get("LOG_VERBOSE") ?? "logs/verbose.log"; +const sillyLogPath = Deno.env.get("LOG_VERBOSE") ?? "logs/verbose.log"; const warnLogPath = Deno.env.get("LOG_WARN") ?? "logs/warn.log"; const errorLogPath = Deno.env.get("LOG_ERROR") ?? "logs/error.log"; @@ -68,13 +68,16 @@ const winstonLogger = winston.createLogger({ customFormat, ), }), - createTransport("verbose", verboseLogPath), + createTransport("silly", sillyLogPath), createTransport("warn", warnLogPath), createTransport("error", errorLogPath), ], }); const logger = { + silly: (message: string, service?: string, codePath?: string) => { + winstonLogger.silly(message, { service, codePath }); + }, verbose: (message: string, service?: string, codePath?: string) => { winstonLogger.verbose(message, { service, codePath }); }, diff --git a/lib/log/test.ts b/packages/crawler/log/test.ts similarity index 82% rename from lib/log/test.ts rename to packages/crawler/log/test.ts index 49deb8c..ee5953c 100644 --- a/lib/log/test.ts +++ b/packages/crawler/log/test.ts @@ -1,4 +1,4 @@ -import logger from "lib/log/logger.ts"; +import logger from "log/logger.ts"; logger.error(Error("test error"), "test service"); logger.debug(`some string`); @@ -9,4 +9,4 @@ logger.log("foo", "service"); logger.log("foo", "db", "insert.ts"); logger.warn("warn"); logger.error("error"); -logger.verbose("error"); \ No newline at end of file +logger.verbose("error"); diff --git a/packages/crawler/main.ts b/packages/crawler/main.ts new file mode 100644 index 0000000..838dc21 --- /dev/null +++ b/packages/crawler/main.ts @@ -0,0 +1,7 @@ +// DENO ASK ME TO EXPORT SOMETHING WHEN 'name' IS SPECIFIED +// AND IF I DON'T SPECIFY 'name', THE --filter FLAG IN `deno task` WON'T WORK. 
+// I DONT'T KNOW WHY +// SO HERE'S A PLACHOLDER EXPORT FOR DENO: +export const DENO = "FUCK YOU DENO"; +// Oh, maybe export the version is a good idea +export const VERSION = "1.0.17"; diff --git a/packages/crawler/ml/akari.ts b/packages/crawler/ml/akari.ts new file mode 100644 index 0000000..69a7a5d --- /dev/null +++ b/packages/crawler/ml/akari.ts @@ -0,0 +1,107 @@ +import { AIManager } from "ml/manager.ts"; +import * as ort from "onnxruntime"; +import logger from "log/logger.ts"; +import { WorkerError } from "mq/schema.ts"; +import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; + +const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024"; +const onnxClassifierPath = "../../model/akari/3.17.onnx"; +const onnxEmbeddingPath = "../../model/embedding/model.onnx"; + +class AkariProto extends AIManager { + private tokenizer: PreTrainedTokenizer | null = null; + private readonly modelVersion = "3.17"; + + constructor() { + super(); + this.models = { + "classifier": onnxClassifierPath, + "embedding": onnxEmbeddingPath, + }; + } + + public override async init(): Promise { + await super.init(); + await this.initJinaTokenizer(); + } + + private tokenizerInitialized(): boolean { + return this.tokenizer !== null; + } + + private getTokenizer(): PreTrainedTokenizer { + if (!this.tokenizerInitialized()) { + throw new Error("Tokenizer is not initialized. Call init() first."); + } + return this.tokenizer!; + } + + private async initJinaTokenizer(): Promise { + if (this.tokenizerInitialized()) { + return; + } + try { + this.tokenizer = await AutoTokenizer.from_pretrained(tokenizerModel); + logger.log("Tokenizer initialized", "ml"); + } catch (error) { + throw new WorkerError(error as Error, "ml", "fn:initTokenizer"); + } + } + + private async getJinaEmbeddings1024(texts: string[]): Promise { + const tokenizer = this.getTokenizer(); + const session = this.getModelSession("embedding"); + + const { input_ids } = await tokenizer(texts, { + add_special_tokens: false, + return_tensor: false, + }); + + const cumsum = (arr: number[]): number[] => + arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []); + + const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string) => x.length))]; + const flattened_input_ids = input_ids.flat(); + + const inputs = { + input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [ + flattened_input_ids.length, + ]), + offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]), + }; + + const { embeddings } = await session.run(inputs); + return Array.from(embeddings.data as Float32Array); + } + + private async runClassification(embeddings: number[]): Promise { + const session = this.getModelSession("classifier"); + const inputTensor = new ort.Tensor( + Float32Array.from(embeddings), + [1, 3, 1024], + ); + + const { logits } = await session.run({ channel_features: inputTensor }); + return this.softmax(logits.data as Float32Array); + } + + public async classifyVideo(title: string, description: string, tags: string, aid?: number): Promise { + const embeddings = await this.getJinaEmbeddings1024([ + title, + description, + tags, + ]); + const probabilities = await this.runClassification(embeddings); + if (aid) { + logger.log(`Prediction result for aid: ${aid}: [${probabilities.map((p) => p.toFixed(5))}]`, "ml"); + } + return probabilities.indexOf(Math.max(...probabilities)); + } + + public getModelVersion(): string { + return this.modelVersion; 
+ } +} + +const Akari = new AkariProto(); +export default Akari; diff --git a/packages/crawler/ml/benchmark.ts b/packages/crawler/ml/benchmark.ts new file mode 100644 index 0000000..3fc76ac --- /dev/null +++ b/packages/crawler/ml/benchmark.ts @@ -0,0 +1,179 @@ +import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; +import * as ort from "onnxruntime"; + +function softmax(logits: Float32Array): number[] { + const maxLogit = Math.max(...logits); + const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); + const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0); + return Array.from(exponents.map((exp) => exp / sumOfExponents)); +} + +// 配置参数 +const sentenceTransformerModelName = "alikia2x/jina-embedding-v3-m2v-1024"; +const onnxClassifierPath = "./model/video_classifier_v3_17.onnx"; +const onnxEmbeddingPath = "./model/embedding_original.onnx"; +const testDataPath = "./data/filter/test1.jsonl"; + +// 初始化会话 +const [sessionClassifier, sessionEmbedding] = await Promise.all([ + ort.InferenceSession.create(onnxClassifierPath), + ort.InferenceSession.create(onnxEmbeddingPath), +]); + +let tokenizer: PreTrainedTokenizer; + +// 初始化分词器 +async function loadTokenizer() { + const tokenizerConfig = { local_files_only: true }; + tokenizer = await AutoTokenizer.from_pretrained(sentenceTransformerModelName, tokenizerConfig); +} + +// 新的嵌入生成函数(使用ONNX) +async function getONNXEmbeddings(texts: string[], session: ort.InferenceSession): Promise { + const { input_ids } = await tokenizer(texts, { + add_special_tokens: false, + return_tensor: false, + }); + + // 构造输入参数 + const cumsum = (arr: number[]): number[] => + arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []); + + const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string) => x.length))]; + const flattened_input_ids = input_ids.flat(); + + // 准备ONNX输入 + const inputs = { + input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [ + flattened_input_ids.length, + ]), + offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]), + }; + + // 执行推理 + const { embeddings } = await session.run(inputs); + return Array.from(embeddings.data as Float32Array); +} + +// 分类推理函数 +async function runClassification(embeddings: number[]): Promise { + const inputTensor = new ort.Tensor( + Float32Array.from(embeddings), + [1, 3, 1024], + ); + + const { logits } = await sessionClassifier.run({ channel_features: inputTensor }); + return softmax(logits.data as Float32Array); +} + +// 指标计算函数 +function calculateMetrics(labels: number[], predictions: number[], elapsedTime: number): { + accuracy: number; + precision: number; + recall: number; + f1: number; + "Class 0 Prec": number; + speed: string; +} { + // 输出label和prediction不一样的index列表 + const arr = []; + for (let i = 0; i < labels.length; i++) { + if (labels[i] !== predictions[i] && predictions[i] == 0) { + arr.push([i + 1, labels[i], predictions[i]]); + } + } + console.log(arr); + // 初始化混淆矩阵 + const classCount = Math.max(...labels, ...predictions) + 1; + const matrix = Array.from({ length: classCount }, () => Array.from({ length: classCount }, () => 0)); + + // 填充矩阵 + labels.forEach((trueLabel, i) => { + matrix[trueLabel][predictions[i]]++; + }); + + // 计算各指标 + let totalTP = 0, totalFP = 0, totalFN = 0; + + for (let c = 0; c < classCount; c++) { + const TP = matrix[c][c]; + const FP = matrix.flatMap((row, i) => i === c ? 
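+		// False positives for class c: sum of column c of the confusion matrix excluding the
+		// diagonal, i.e. samples predicted as c whose true label is another class.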
[] : [row[c]]).reduce((a, b) => a + b, 0); + const FN = matrix[c].filter((_, i) => i !== c).reduce((a, b) => a + b, 0); + + totalTP += TP; + totalFP += FP; + totalFN += FN; + } + + const precision = totalTP / (totalTP + totalFP); + const recall = totalTP / (totalTP + totalFN); + const f1 = 2 * (precision * recall) / (precision + recall) || 0; + + // 计算Class 0 Precision + const class0TP = matrix[0][0]; + const class0FP = matrix.flatMap((row, i) => i === 0 ? [] : [row[0]]).reduce((a, b) => a + b, 0); + const class0Precision = class0TP / (class0TP + class0FP) || 0; + + return { + accuracy: labels.filter((l, i) => l === predictions[i]).length / labels.length, + precision, + recall, + f1, + speed: `${(labels.length / (elapsedTime / 1000)).toFixed(1)} samples/sec`, + "Class 0 Prec": class0Precision, + }; +} + +// 改造后的评估函数 +async function evaluateModel(session: ort.InferenceSession): Promise<{ + accuracy: number; + precision: number; + recall: number; + f1: number; + "Class 0 Prec": number; +}> { + const data = await Deno.readTextFile(testDataPath); + const samples = data.split("\n") + .map((line) => { + try { + return JSON.parse(line); + } catch { + return null; + } + }) + .filter(Boolean); + + const allPredictions: number[] = []; + const allLabels: number[] = []; + + const t = new Date().getTime(); + for (const sample of samples) { + try { + const embeddings = await getONNXEmbeddings([ + sample.title, + sample.description, + sample.tags.join(","), + ], session); + + const probabilities = await runClassification(embeddings); + allPredictions.push(probabilities.indexOf(Math.max(...probabilities))); + allLabels.push(sample.label); + } catch (error) { + console.error("Processing error:", error); + } + } + const elapsed = new Date().getTime() - t; + + return calculateMetrics(allLabels, allPredictions, elapsed); +} + +// 主函数 +async function main() { + await loadTokenizer(); + + const metrics = await evaluateModel(sessionEmbedding); + console.log("Model Metrics:"); + console.table(metrics); +} + +await main(); diff --git a/packages/crawler/ml/manager.ts b/packages/crawler/ml/manager.ts new file mode 100644 index 0000000..42f783e --- /dev/null +++ b/packages/crawler/ml/manager.ts @@ -0,0 +1,37 @@ +import * as ort from "onnxruntime"; +import logger from "log/logger.ts"; +import { WorkerError } from "mq/schema.ts"; + +export class AIManager { + public sessions: { [key: string]: ort.InferenceSession } = {}; + public models: { [key: string]: string } = {}; + + constructor() { + } + + public async init() { + const modelKeys = Object.keys(this.models); + for (const key of modelKeys) { + try { + this.sessions[key] = await ort.InferenceSession.create(this.models[key]); + logger.log(`Model ${key} initialized`, "ml"); + } catch (error) { + throw new WorkerError(error as Error, "ml", "fn:init"); + } + } + } + + public getModelSession(key: string): ort.InferenceSession { + if (this.sessions[key] === undefined) { + throw new WorkerError(new Error(`Model ${key} not found / not initialized.`), "ml", "fn:getModelSession"); + } + return this.sessions[key]; + } + + public softmax(logits: Float32Array): number[] { + const maxLogit = Math.max(...logits); + const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); + const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0); + return Array.from(exponents.map((exp) => exp / sumOfExponents)); + } +} diff --git a/packages/crawler/ml/quant_benchmark.ts b/packages/crawler/ml/quant_benchmark.ts new file mode 100644 index 0000000..aab6308 --- /dev/null +++ 
b/packages/crawler/ml/quant_benchmark.ts @@ -0,0 +1,171 @@ +import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; +import * as ort from "onnxruntime"; + +function softmax(logits: Float32Array): number[] { + const maxLogit = Math.max(...logits); + const exponents = logits.map((logit) => Math.exp(logit - maxLogit)); + const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0); + return Array.from(exponents.map((exp) => exp / sumOfExponents)); +} + +// 配置参数 +const sentenceTransformerModelName = "alikia2x/jina-embedding-v3-m2v-1024"; +const onnxClassifierPath = "./model/video_classifier_v3_11.onnx"; +const onnxEmbeddingOriginalPath = "./model/embedding_original.onnx"; +const onnxEmbeddingQuantizedPath = "./model/embedding_original.onnx"; + +// 初始化会话 +const [sessionClassifier, sessionEmbeddingOriginal, sessionEmbeddingQuantized] = await Promise.all([ + ort.InferenceSession.create(onnxClassifierPath), + ort.InferenceSession.create(onnxEmbeddingOriginalPath), + ort.InferenceSession.create(onnxEmbeddingQuantizedPath), +]); + +let tokenizer: PreTrainedTokenizer; + +// 初始化分词器 +async function loadTokenizer() { + const tokenizerConfig = { local_files_only: true }; + tokenizer = await AutoTokenizer.from_pretrained(sentenceTransformerModelName, tokenizerConfig); +} + +// 新的嵌入生成函数(使用ONNX) +async function getONNXEmbeddings(texts: string[], session: ort.InferenceSession): Promise { + const { input_ids } = await tokenizer(texts, { + add_special_tokens: false, + return_tensor: false, + }); + + // 构造输入参数 + const cumsum = (arr: number[]): number[] => + arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []); + + const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string) => x.length))]; + const flattened_input_ids = input_ids.flat(); + + // 准备ONNX输入 + const inputs = { + input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [ + flattened_input_ids.length, + ]), + offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]), + }; + + // 执行推理 + const { embeddings } = await session.run(inputs); + return Array.from(embeddings.data as Float32Array); +} + +// 分类推理函数 +async function runClassification(embeddings: number[]): Promise { + const inputTensor = new ort.Tensor( + Float32Array.from(embeddings), + [1, 4, 1024], + ); + + const { logits } = await sessionClassifier.run({ channel_features: inputTensor }); + return softmax(logits.data as Float32Array); +} + +// 指标计算函数 +function calculateMetrics(labels: number[], predictions: number[], elapsedTime: number): { + accuracy: number; + precision: number; + recall: number; + f1: number; + speed: string; +} { + // 初始化混淆矩阵 + const classCount = Math.max(...labels, ...predictions) + 1; + const matrix = Array.from({ length: classCount }, () => Array.from({ length: classCount }, () => 0)); + + // 填充矩阵 + labels.forEach((trueLabel, i) => { + matrix[trueLabel][predictions[i]]++; + }); + + // 计算各指标 + let totalTP = 0, totalFP = 0, totalFN = 0; + + for (let c = 0; c < classCount; c++) { + const TP = matrix[c][c]; + const FP = matrix.flatMap((row, i) => i === c ? 
[] : [row[c]]).reduce((a, b) => a + b, 0); + const FN = matrix[c].filter((_, i) => i !== c).reduce((a, b) => a + b, 0); + + totalTP += TP; + totalFP += FP; + totalFN += FN; + } + + const precision = totalTP / (totalTP + totalFP); + const recall = totalTP / (totalTP + totalFN); + const f1 = 2 * (precision * recall) / (precision + recall) || 0; + + return { + accuracy: labels.filter((l, i) => l === predictions[i]).length / labels.length, + precision, + recall, + f1, + speed: `${(labels.length / (elapsedTime / 1000)).toFixed(1)} samples/sec`, + }; +} + +// 改造后的评估函数 +async function evaluateModel(session: ort.InferenceSession): Promise<{ + accuracy: number; + precision: number; + recall: number; + f1: number; +}> { + const data = await Deno.readTextFile("./data/filter/test1.jsonl"); + const samples = data.split("\n") + .map((line) => { + try { + return JSON.parse(line); + } catch { + return null; + } + }) + .filter(Boolean); + + const allPredictions: number[] = []; + const allLabels: number[] = []; + + const t = new Date().getTime(); + for (const sample of samples) { + try { + const embeddings = await getONNXEmbeddings([ + sample.title, + sample.description, + sample.tags.join(","), + sample.author_info, + ], session); + + const probabilities = await runClassification(embeddings); + allPredictions.push(probabilities.indexOf(Math.max(...probabilities))); + allLabels.push(sample.label); + } catch (error) { + console.error("Processing error:", error); + } + } + const elapsed = new Date().getTime() - t; + + return calculateMetrics(allLabels, allPredictions, elapsed); +} + +// 主函数 +async function main() { + await loadTokenizer(); + + // 评估原始模型 + const originalMetrics = await evaluateModel(sessionEmbeddingOriginal); + console.log("Original Model Metrics:"); + console.table(originalMetrics); + + // 评估量化模型 + const quantizedMetrics = await evaluateModel(sessionEmbeddingQuantized); + console.log("Quantized Model Metrics:"); + console.table(quantizedMetrics); +} + +await main(); diff --git a/packages/crawler/mq/exec/classifyVideo.ts b/packages/crawler/mq/exec/classifyVideo.ts new file mode 100644 index 0000000..c813b7b --- /dev/null +++ b/packages/crawler/mq/exec/classifyVideo.ts @@ -0,0 +1,70 @@ +import { Job } from "bullmq"; +import { db } from "db/init.ts"; +import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel } from "db/allData.ts"; +import Akari from "ml/akari.ts"; +import { ClassifyVideoQueue } from "mq/index.ts"; +import logger from "log/logger.ts"; +import { lockManager } from "mq/lockManager.ts"; +import { aidExistsInSongs } from "db/songs.ts"; +import { insertIntoSongs } from "mq/task/collectSongs.ts"; +import { scheduleSnapshot } from "db/snapshotSchedule.ts"; +import { MINUTE } from "$std/datetime/constants.ts"; + +export const classifyVideoWorker = async (job: Job) => { + const client = await db.connect(); + const aid = job.data.aid; + if (!aid) { + return 3; + } + + const videoInfo = await getVideoInfoFromAllData(client, aid); + const title = videoInfo.title?.trim() || "untitled"; + const description = videoInfo.description?.trim() || "N/A"; + const tags = videoInfo.tags?.trim() || "empty"; + const label = await Akari.classifyVideo(title, description, tags, aid); + if (label == -1) { + logger.warn(`Failed to classify video ${aid}`, "ml"); + } + await insertVideoLabel(client, aid, label); + + const exists = await aidExistsInSongs(client, aid); + if (!exists && label !== 0) { + await scheduleSnapshot(client, aid, "new", Date.now() + 10 * MINUTE, true); + await 
insertIntoSongs(client, aid); + } + + client.release(); + + await job.updateData({ + ...job.data, + label: label, + }); + + return 0; +}; + +export const classifyVideosWorker = async () => { + if (await lockManager.isLocked("classifyVideos")) { + logger.log("job:classifyVideos is locked, skipping.", "mq"); + return; + } + + await lockManager.acquireLock("classifyVideos"); + + const client = await db.connect(); + const videos = await getUnlabelledVideos(client); + logger.log(`Found ${videos.length} unlabelled videos`); + client.release(); + + let i = 0; + for (const aid of videos) { + if (i > 200) { + await lockManager.releaseLock("classifyVideos"); + return 10000 + i; + } + await ClassifyVideoQueue.add("classifyVideo", { aid: Number(aid) }); + i++; + } + await lockManager.releaseLock("classifyVideos"); + return 0; +}; diff --git a/packages/crawler/mq/exec/getLatestVideos.ts b/packages/crawler/mq/exec/getLatestVideos.ts new file mode 100644 index 0000000..7a19738 --- /dev/null +++ b/packages/crawler/mq/exec/getLatestVideos.ts @@ -0,0 +1,37 @@ +import { Job } from "bullmq"; +import { queueLatestVideos } from "mq/task/queueLatestVideo.ts"; +import { db } from "db/init.ts"; +import { insertVideoInfo } from "mq/task/getVideoDetails.ts"; +import { collectSongs } from "mq/task/collectSongs.ts"; + +export const getLatestVideosWorker = async (_job: Job): Promise => { + const client = await db.connect(); + try { + await queueLatestVideos(client); + } finally { + client.release(); + } +}; + +export const collectSongsWorker = async (_job: Job): Promise => { + const client = await db.connect(); + try { + await collectSongs(client); + } finally { + client.release(); + } +}; + +export const getVideoInfoWorker = async (job: Job): Promise => { + const client = await db.connect(); + try { + const aid = job.data.aid; + if (!aid) { + return 3; + } + await insertVideoInfo(client, aid); + return 0; + } finally { + client.release(); + } +}; diff --git a/packages/crawler/mq/exec/snapshotTick.ts b/packages/crawler/mq/exec/snapshotTick.ts new file mode 100644 index 0000000..32e7b23 --- /dev/null +++ b/packages/crawler/mq/exec/snapshotTick.ts @@ -0,0 +1,396 @@ +import { Job } from "bullmq"; +import { db } from "db/init.ts"; +import { getLatestVideoSnapshot, getVideosNearMilestone } from "db/snapshot.ts"; +import { + bulkGetVideosWithoutProcessingSchedules, + bulkScheduleSnapshot, + bulkSetSnapshotStatus, + findClosestSnapshot, + findSnapshotBefore, + getBulkSnapshotsInNextSecond, + getLatestSnapshot, + getSnapshotsInNextSecond, + getVideosWithoutActiveSnapshotSchedule, + hasAtLeast2Snapshots, + scheduleSnapshot, + setSnapshotStatus, + snapshotScheduleExists, + videoHasProcessingSchedule, +} from "db/snapshotSchedule.ts"; +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { HOUR, MINUTE, SECOND, WEEK } from "$std/datetime/constants.ts"; +import logger from "log/logger.ts"; +import { SnapshotQueue } from "mq/index.ts"; +import { insertVideoSnapshot } from "mq/task/getVideoStats.ts"; +import { NetSchedulerError } from "mq/scheduler.ts"; +import { getBiliVideoStatus, setBiliVideoStatus } from "db/allData.ts"; +import { truncate } from "utils/truncate.ts"; +import { lockManager } from "mq/lockManager.ts"; +import { getSongsPublihsedAt } from "db/songs.ts"; +import { bulkGetVideoStats } from "net/bulkGetVideoStats.ts"; + +const priorityMap: { [key: string]: number } = { + "milestone": 1, + "normal": 3, +}; + +const snapshotTypeToTaskMap: { [key: string]: string } = { + "milestone": 
"snapshotMilestoneVideo", + "normal": "snapshotVideo", + "new": "snapshotMilestoneVideo", +}; + +export const bulkSnapshotTickWorker = async (_job: Job) => { + const client = await db.connect(); + try { + const schedules = await getBulkSnapshotsInNextSecond(client); + const count = schedules.length; + const groups = Math.ceil(count / 30); + for (let i = 0; i < groups; i++) { + const group = schedules.slice(i * 30, (i + 1) * 30); + const aids = group.map((schedule) => Number(schedule.aid)); + const filteredAids = await bulkGetVideosWithoutProcessingSchedules(client, aids); + if (filteredAids.length === 0) continue; + await bulkSetSnapshotStatus(client, filteredAids, "processing"); + const dataMap: { [key: number]: number } = {}; + for (const schedule of group) { + const id = Number(schedule.id); + dataMap[id] = Number(schedule.aid); + } + await SnapshotQueue.add("bulkSnapshotVideo", { + map: dataMap, + }, { priority: 3 }); + } + } catch (e) { + logger.error(e as Error); + } finally { + client.release(); + } +}; + +export const snapshotTickWorker = async (_job: Job) => { + const client = await db.connect(); + try { + const schedules = await getSnapshotsInNextSecond(client); + for (const schedule of schedules) { + if (await videoHasProcessingSchedule(client, Number(schedule.aid))) { + return `ALREADY_PROCESSING`; + } + let priority = 3; + if (schedule.type && priorityMap[schedule.type]) { + priority = priorityMap[schedule.type]; + } + const aid = Number(schedule.aid); + await setSnapshotStatus(client, schedule.id, "processing"); + await SnapshotQueue.add("snapshotVideo", { + aid: aid, + id: Number(schedule.id), + type: schedule.type ?? "normal", + }, { priority }); + } + } catch (e) { + logger.error(e as Error); + } finally { + client.release(); + } +}; + +export const closetMilestone = (views: number) => { + if (views < 100000) return 100000; + if (views < 1000000) return 1000000; + return 10000000; +}; + +const log = (value: number, base: number = 10) => Math.log(value) / Math.log(base); + +/* + * Returns the minimum ETA in hours for the next snapshot + * @param client - Postgres client + * @param aid - aid of the video + * @returns ETA in hours + */ +export const getAdjustedShortTermETA = async (client: Client, aid: number) => { + const latestSnapshot = await getLatestSnapshot(client, aid); + // Immediately dispatch a snapshot if there is no snapshot yet + if (!latestSnapshot) return 0; + const snapshotsEnough = await hasAtLeast2Snapshots(client, aid); + if (!snapshotsEnough) return 0; + + const currentTimestamp = new Date().getTime(); + const timeIntervals = [3 * MINUTE, 20 * MINUTE, 1 * HOUR, 3 * HOUR, 6 * HOUR, 72 * HOUR]; + const DELTA = 0.00001; + let minETAHours = Infinity; + + for (const timeInterval of timeIntervals) { + const date = new Date(currentTimestamp - timeInterval); + const snapshot = await findClosestSnapshot(client, aid, date); + if (!snapshot) continue; + const hoursDiff = (latestSnapshot.created_at - snapshot.created_at) / HOUR; + const viewsDiff = latestSnapshot.views - snapshot.views; + if (viewsDiff <= 0) continue; + const speed = viewsDiff / (hoursDiff + DELTA); + const target = closetMilestone(latestSnapshot.views); + const viewsToIncrease = target - latestSnapshot.views; + const eta = viewsToIncrease / (speed + DELTA); + let factor = log(2.97 / log(viewsToIncrease + 1), 1.14); + factor = truncate(factor, 3, 100); + const adjustedETA = eta / factor; + if (adjustedETA < minETAHours) { + minETAHours = adjustedETA; + } + } + + if (isNaN(minETAHours)) { + minETAHours 
= Infinity; + } + + return minETAHours; +}; + +export const collectMilestoneSnapshotsWorker = async (_job: Job) => { + const client = await db.connect(); + try { + const videos = await getVideosNearMilestone(client); + for (const video of videos) { + const aid = Number(video.aid); + const eta = await getAdjustedShortTermETA(client, aid); + if (eta > 72) continue; + const now = Date.now(); + const scheduledNextSnapshotDelay = eta * HOUR; + const maxInterval = 4 * HOUR; + const minInterval = 1 * SECOND; + const delay = truncate(scheduledNextSnapshotDelay, minInterval, maxInterval); + const targetTime = now + delay; + await scheduleSnapshot(client, aid, "milestone", targetTime); + } + } catch (e) { + logger.error(e as Error, "mq", "fn:collectMilestoneSnapshotsWorker"); + } finally { + client.release(); + } +}; + +const getRegularSnapshotInterval = async (client: Client, aid: number) => { + const now = Date.now(); + const date = new Date(now - 24 * HOUR); + let oldSnapshot = await findSnapshotBefore(client, aid, date); + if (!oldSnapshot) oldSnapshot = await findClosestSnapshot(client, aid, date); + const latestSnapshot = await getLatestSnapshot(client, aid); + if (!oldSnapshot || !latestSnapshot) return 0; + if (oldSnapshot.created_at === latestSnapshot.created_at) return 0; + const hoursDiff = (latestSnapshot.created_at - oldSnapshot.created_at) / HOUR; + if (hoursDiff < 8) return 24; + const viewsDiff = latestSnapshot.views - oldSnapshot.views; + if (viewsDiff === 0) return 72; + const speedPerDay = viewsDiff / (hoursDiff + 0.001) * 24; + if (speedPerDay < 6) return 36; + if (speedPerDay < 120) return 24; + if (speedPerDay < 320) return 12; + return 6; +}; + +export const regularSnapshotsWorker = async (_job: Job) => { + const client = await db.connect(); + const startedAt = Date.now(); + if (await lockManager.isLocked("dispatchRegularSnapshots")) { + logger.log("dispatchRegularSnapshots is already running", "mq"); + client.release(); + return; + } + await lockManager.acquireLock("dispatchRegularSnapshots", 30 * 60); + try { + const aids = await getVideosWithoutActiveSnapshotSchedule(client); + for (const rawAid of aids) { + const aid = Number(rawAid); + const latestSnapshot = await getLatestVideoSnapshot(client, aid); + const now = Date.now(); + const lastSnapshotedAt = latestSnapshot?.time ?? 
now; + const interval = await getRegularSnapshotInterval(client, aid); + logger.log(`Scheduled regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); + const targetTime = truncate(lastSnapshotedAt + interval * HOUR, now + 1, now + 100000 * WEEK); + await scheduleSnapshot(client, aid, "normal", targetTime); + if (now - startedAt > 25 * MINUTE) { + return; + } + } + } catch (e) { + logger.error(e as Error, "mq", "fn:regularSnapshotsWorker"); + } finally { + lockManager.releaseLock("dispatchRegularSnapshots"); + client.release(); + } +}; + +export const takeBulkSnapshotForVideosWorker = async (job: Job) => { + const dataMap: { [key: number]: number } = job.data.map; + const ids = Object.keys(dataMap).map((id) => Number(id)); + const aidsToFetch: number[] = []; + const client = await db.connect(); + try { + for (const id of ids) { + const aid = Number(dataMap[id]); + const exists = await snapshotScheduleExists(client, id); + if (!exists) { + continue; + } + aidsToFetch.push(aid); + } + const data = await bulkGetVideoStats(aidsToFetch); + if (typeof data === "number") { + await bulkSetSnapshotStatus(client, ids, "failed"); + await bulkScheduleSnapshot(client, aidsToFetch, "normal", Date.now() + 15 * SECOND); + return `GET_BILI_STATUS_${data}`; + } + for (const video of data) { + const aid = video.id; + const stat = video.cnt_info; + const views = stat.play; + const danmakus = stat.danmaku; + const replies = stat.reply; + const likes = stat.thumb_up; + const coins = stat.coin; + const shares = stat.share; + const favorites = stat.collect; + const query: string = ` + INSERT INTO video_snapshot (aid, views, danmakus, replies, likes, coins, shares, favorites) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + `; + await client.queryObject( + query, + [aid, views, danmakus, replies, likes, coins, shares, favorites], + ); + + logger.log(`Taken snapshot for video ${aid} in bulk.`, "net", "fn:takeBulkSnapshotForVideosWorker"); + } + await bulkSetSnapshotStatus(client, ids, "completed"); + for (const aid of aidsToFetch) { + const interval = await getRegularSnapshotInterval(client, aid); + logger.log(`Scheduled regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); + await scheduleSnapshot(client, aid, "normal", Date.now() + interval * HOUR); + } + return `DONE`; + } catch (e) { + if (e instanceof NetSchedulerError && e.code === "NO_PROXY_AVAILABLE") { + logger.warn( + `No available proxy for bulk request now.`, + "mq", + "fn:takeBulkSnapshotForVideosWorker", + ); + await bulkSetSnapshotStatus(client, ids, "completed"); + await bulkScheduleSnapshot(client, aidsToFetch, "normal", Date.now() + 2 * MINUTE); + return; + } + logger.error(e as Error, "mq", "fn:takeBulkSnapshotForVideosWorker"); + await bulkSetSnapshotStatus(client, ids, "failed"); + } finally { + client.release(); + } +}; + +export const takeSnapshotForVideoWorker = async (job: Job) => { + const id = job.data.id; + const aid = Number(job.data.aid); + const type = job.data.type; + const task = snapshotTypeToTaskMap[type] ?? "snapshotVideo"; + const client = await db.connect(); + const retryInterval = type === "milestone" ? 
5 * SECOND : 2 * MINUTE; + const exists = await snapshotScheduleExists(client, id); + if (!exists) { + client.release(); + return; + } + const status = await getBiliVideoStatus(client, aid); + if (status !== 0) { + client.release(); + return `REFUSE_WORKING_BILI_STATUS_${status}`; + } + try { + await setSnapshotStatus(client, id, "processing"); + const stat = await insertVideoSnapshot(client, aid, task); + if (typeof stat === "number") { + await setBiliVideoStatus(client, aid, stat); + await setSnapshotStatus(client, id, "completed"); + return `GET_BILI_STATUS_${stat}`; + } + await setSnapshotStatus(client, id, "completed"); + if (type === "normal") { + const interval = await getRegularSnapshotInterval(client, aid); + logger.log(`Scheduled regular snapshot for aid ${aid} in ${interval} hours.`, "mq"); + await scheduleSnapshot(client, aid, type, Date.now() + interval * HOUR); + return `DONE`; + } else if (type === "new") { + const publihsedAt = await getSongsPublihsedAt(client, aid); + const timeSincePublished = stat.time - publihsedAt!; + const viewsPerHour = stat.views / timeSincePublished * HOUR; + if (timeSincePublished > 48 * HOUR) { + return `DONE`; + } + if (timeSincePublished > 2 * HOUR && viewsPerHour < 10) { + return `DONE`; + } + let intervalMins = 240; + if (viewsPerHour > 50) { + intervalMins = 120; + } + if (viewsPerHour > 100) { + intervalMins = 60; + } + if (viewsPerHour > 1000) { + intervalMins = 15; + } + await scheduleSnapshot(client, aid, type, Date.now() + intervalMins * MINUTE, true); + } + if (type !== "milestone") return `DONE`; + const eta = await getAdjustedShortTermETA(client, aid); + if (eta > 72) return "ETA_TOO_LONG"; + const now = Date.now(); + const targetTime = now + eta * HOUR; + await scheduleSnapshot(client, aid, type, targetTime); + return `DONE`; + } catch (e) { + if (e instanceof NetSchedulerError && e.code === "NO_PROXY_AVAILABLE") { + logger.warn( + `No available proxy for aid ${job.data.aid}.`, + "mq", + "fn:takeSnapshotForVideoWorker", + ); + await setSnapshotStatus(client, id, "completed"); + await scheduleSnapshot(client, aid, type, Date.now() + retryInterval); + return; + } + logger.error(e as Error, "mq", "fn:takeSnapshotForVideoWorker"); + await setSnapshotStatus(client, id, "failed"); + } finally { + client.release(); + } +}; + +export const scheduleCleanupWorker = async (_job: Job) => { + const client = await db.connect(); + try { + const query = ` + SELECT id, aid, type + FROM snapshot_schedule + WHERE status IN ('pending', 'processing') + AND started_at < NOW() - INTERVAL '30 minutes' + `; + const { rows } = await client.queryObject<{ id: bigint; aid: bigint; type: string }>(query); + if (rows.length === 0) return; + for (const row of rows) { + const id = Number(row.id); + const aid = Number(row.aid); + const type = row.type; + await setSnapshotStatus(client, id, "timeout"); + await scheduleSnapshot(client, aid, type, Date.now() + 10 * SECOND); + logger.log( + `Schedule ${id} has no response received for 5 minutes, rescheduled.`, + "mq", + "fn:scheduleCleanupWorker", + ); + } + } catch (e) { + logger.error(e as Error, "mq", "fn:scheduleCleanupWorker"); + } finally { + client.release(); + } +}; diff --git a/packages/crawler/mq/executors.ts b/packages/crawler/mq/executors.ts new file mode 100644 index 0000000..1e486e1 --- /dev/null +++ b/packages/crawler/mq/executors.ts @@ -0,0 +1 @@ +export * from "mq/exec/getLatestVideos.ts"; diff --git a/packages/crawler/mq/index.ts b/packages/crawler/mq/index.ts new file mode 100644 index 
0000000..ef8b0f2 --- /dev/null +++ b/packages/crawler/mq/index.ts @@ -0,0 +1,7 @@ +import { Queue } from "bullmq"; + +export const LatestVideosQueue = new Queue("latestVideos"); + +export const ClassifyVideoQueue = new Queue("classifyVideo"); + +export const SnapshotQueue = new Queue("snapshot"); diff --git a/packages/crawler/mq/init.ts b/packages/crawler/mq/init.ts new file mode 100644 index 0000000..4a302d1 --- /dev/null +++ b/packages/crawler/mq/init.ts @@ -0,0 +1,67 @@ +import { MINUTE, SECOND } from "$std/datetime/constants.ts"; +import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "mq/index.ts"; +import logger from "log/logger.ts"; +import { initSnapshotWindowCounts } from "db/snapshotSchedule.ts"; +import { db } from "db/init.ts"; +import { redis } from "db/redis.ts"; + +export async function initMQ() { + const client = await db.connect(); + try { + await initSnapshotWindowCounts(client, redis); + + await LatestVideosQueue.upsertJobScheduler("getLatestVideos", { + every: 1 * MINUTE, + immediately: true, + }); + + await ClassifyVideoQueue.upsertJobScheduler("classifyVideos", { + every: 5 * MINUTE, + immediately: true, + }); + + await LatestVideosQueue.upsertJobScheduler("collectSongs", { + every: 3 * MINUTE, + immediately: true, + }); + + await SnapshotQueue.upsertJobScheduler("snapshotTick", { + every: 1 * SECOND, + immediately: true, + }, { + opts: { + removeOnComplete: 1, + removeOnFail: 1, + }, + }); + + await SnapshotQueue.upsertJobScheduler("bulkSnapshotTick", { + every: 15 * SECOND, + immediately: true, + }, { + opts: { + removeOnComplete: 1, + removeOnFail: 1, + }, + }); + + await SnapshotQueue.upsertJobScheduler("collectMilestoneSnapshots", { + every: 5 * MINUTE, + immediately: true, + }); + + await SnapshotQueue.upsertJobScheduler("dispatchRegularSnapshots", { + every: 30 * MINUTE, + immediately: true, + }); + + await SnapshotQueue.upsertJobScheduler("scheduleCleanup", { + every: 30 * MINUTE, + immediately: true, + }); + + logger.log("Message queue initialized."); + } finally { + client.release(); + } +} diff --git a/lib/mq/lockManager.ts b/packages/crawler/mq/lockManager.ts similarity index 89% rename from lib/mq/lockManager.ts rename to packages/crawler/mq/lockManager.ts index 0aa989e..e0c7f8a 100644 --- a/lib/mq/lockManager.ts +++ b/packages/crawler/mq/lockManager.ts @@ -1,5 +1,5 @@ import { Redis } from "ioredis"; -import { redis } from "lib/db/redis.ts"; +import { redis } from "db/redis.ts"; class LockManager { private redis: Redis; @@ -23,12 +23,12 @@ class LockManager { const result = await this.redis.set(key, "locked", "NX"); if (result !== "OK") { - return false; + return false; } - if (timeout) { - await this.redis.expire(key, timeout); - } - return true; + if (timeout) { + await this.redis.expire(key, timeout); + } + return true; } /* diff --git a/lib/mq/rateLimiter.ts b/packages/crawler/mq/rateLimiter.ts similarity index 87% rename from lib/mq/rateLimiter.ts rename to packages/crawler/mq/rateLimiter.ts index 41a2f4f..aba7c3e 100644 --- a/lib/mq/rateLimiter.ts +++ b/packages/crawler/mq/rateLimiter.ts @@ -1,4 +1,4 @@ -import { SlidingWindow } from "lib/mq/slidingWindow.ts"; +import { SlidingWindow } from "mq/slidingWindow.ts"; export interface RateLimiterConfig { window: SlidingWindow; @@ -7,7 +7,7 @@ export interface RateLimiterConfig { export class RateLimiter { private readonly configs: RateLimiterConfig[]; - private readonly configEventNames: string[]; + private readonly configEventNames: string[]; /* * @param name The name of the rate limiter 
@@ -17,7 +17,7 @@ export class RateLimiter { */ constructor(name: string, configs: RateLimiterConfig[]) { this.configs = configs; - this.configEventNames = configs.map((_, index) => `${name}_config_${index}`); + this.configEventNames = configs.map((_, index) => `${name}_config_${index}`); } /* @@ -53,4 +53,4 @@ export class RateLimiter { await config.window.clear(eventName); } } -} \ No newline at end of file +} diff --git a/packages/crawler/mq/scheduler.ts b/packages/crawler/mq/scheduler.ts new file mode 100644 index 0000000..0e8c036 --- /dev/null +++ b/packages/crawler/mq/scheduler.ts @@ -0,0 +1,404 @@ +import logger from "log/logger.ts"; +import { RateLimiter, RateLimiterConfig } from "mq/rateLimiter.ts"; +import { SlidingWindow } from "mq/slidingWindow.ts"; +import { redis } from "db/redis.ts"; +import Redis from "ioredis"; +import { SECOND } from "$std/datetime/constants.ts"; + +interface Proxy { + type: string; + data: string; +} + +interface Task { + provider: string; + proxies: string[] | "all"; +} + +interface ProxiesMap { + [name: string]: Proxy; +} + +type NetSchedulerErrorCode = + | "NO_PROXY_AVAILABLE" + | "PROXY_RATE_LIMITED" + | "PROXY_NOT_FOUND" + | "FETCH_ERROR" + | "NOT_IMPLEMENTED" + | "ALICLOUD_PROXY_ERR"; + +export class NetSchedulerError extends Error { + public code: NetSchedulerErrorCode; + public rawError: unknown | undefined; + constructor(message: string, errorCode: NetSchedulerErrorCode, rawError?: unknown) { + super(message); + this.name = "NetSchedulerError"; + this.code = errorCode; + this.rawError = rawError; + } +} + +type LimiterMap = { + [name: string]: RateLimiter; +}; + +type OptionalLimiterMap = { + [name: string]: RateLimiter | null; +}; + +type TaskMap = { + [name: string]: Task; +}; + +function shuffleArray(array: T[]): T[] { + const newArray = [...array]; // Create a shallow copy to avoid in-place modification + for (let i = newArray.length - 1; i > 0; i--) { + const j = Math.floor(Math.random() * (i + 1)); + [newArray[i], newArray[j]] = [newArray[j], newArray[i]]; // Swap elements + } + return newArray; +} + +class NetScheduler { + private proxies: ProxiesMap = {}; + private providerLimiters: LimiterMap = {}; + private proxyLimiters: OptionalLimiterMap = {}; + private tasks: TaskMap = {}; + + addProxy(proxyName: string, type: string, data: string): void { + this.proxies[proxyName] = { type, data }; + } + + removeProxy(proxyName: string): void { + if (!this.proxies[proxyName]) { + throw new Error(`Proxy ${proxyName} not found`); + } + delete this.proxies[proxyName]; + // Clean up associated limiters + this.cleanupProxyLimiters(proxyName); + } + + private cleanupProxyLimiters(proxyName: string): void { + for (const limiterId in this.proxyLimiters) { + if (limiterId.startsWith(`proxy-${proxyName}`)) { + delete this.proxyLimiters[limiterId]; + } + } + } + + addTask(taskName: string, provider: string, proxies: string[] | "all"): void { + this.tasks[taskName] = { provider, proxies }; + } + + getTaskProxies(taskName: string): string[] { + if (!this.tasks[taskName]) { + return []; + } + if (this.tasks[taskName].proxies === "all") { + return Object.keys(this.proxies); + } + return this.tasks[taskName].proxies; + } + + setTaskLimiter(taskName: string, config: RateLimiterConfig[] | null): void { + const proxies = this.getTaskProxies(taskName); + for (const proxyName of proxies) { + const limiterId = "proxy-" + proxyName + "-" + taskName; + this.proxyLimiters[limiterId] = config ? 
new RateLimiter(limiterId, config) : null; + } + } + + async triggerLimiter(task: string, proxy: string): Promise { + const limiterId = "proxy-" + proxy + "-" + task; + const providerLimiterId = "provider-" + proxy + "-" + this.tasks[task].provider; + try { + await this.proxyLimiters[limiterId]?.trigger(); + await this.providerLimiters[providerLimiterId]?.trigger(); + } catch (e) { + const error = e as Error; + if (e instanceof Redis.ReplyError) { + logger.error(error, "redis"); + } + logger.warn(`Unhandled error: ${error.message}`, "mq", "proxyRequest"); + } + } + + setProviderLimiter(providerName: string, config: RateLimiterConfig[]): void { + let bindProxies: string[] = []; + for (const taskName in this.tasks) { + if (this.tasks[taskName].provider !== providerName) continue; + const proxies = this.getTaskProxies(taskName); + bindProxies = bindProxies.concat(proxies); + } + for (const proxyName of bindProxies) { + const limiterId = "provider-" + proxyName + "-" + providerName; + this.providerLimiters[limiterId] = new RateLimiter(limiterId, config); + } + } + + /* + * Make a request to the specified URL with any available proxy + * @param {string} url - The URL to request. + * @param {string} method - The HTTP method to use for the request. Default is "GET". + * @returns {Promise} - A promise that resolves to the response body. + * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + * - The proxy type is not supported: with error code `NOT_IMPLEMENTED` + */ + async request(url: string, task: string, method: string = "GET"): Promise { + // find a available proxy + const proxiesNames = this.getTaskProxies(task); + for (const proxyName of shuffleArray(proxiesNames)) { + if (await this.getProxyAvailability(proxyName, task)) { + return await this.proxyRequest(url, proxyName, task, method); + } + } + throw new NetSchedulerError("No proxy is available currently.", "NO_PROXY_AVAILABLE"); + } + + /* + * Make a request to the specified URL with the specified proxy + * @param {string} url - The URL to request. + * @param {string} proxyName - The name of the proxy to use. + * @param {string} task - The name of the task to use. + * @param {string} method - The HTTP method to use for the request. Default is "GET". + * @param {boolean} force - If true, the request will be made even if the proxy is rate limited. Default is false. + * @returns {Promise} - A promise that resolves to the response body. 
+ * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - Proxy not found: with error code `PROXY_NOT_FOUND` + * - Proxy is under rate limit: with error code `PROXY_RATE_LIMITED` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + * - The proxy type is not supported: with error code `NOT_IMPLEMENTED` + */ + async proxyRequest( + url: string, + proxyName: string, + task: string, + method: string = "GET", + force: boolean = false, + ): Promise { + const proxy = this.proxies[proxyName]; + if (!proxy) { + throw new NetSchedulerError(`Proxy "${proxyName}" not found`, "PROXY_NOT_FOUND"); + } + + if (!force) { + const isAvailable = await this.getProxyAvailability(proxyName, task); + const limiter = "proxy-" + proxyName + "-" + task; + if (!isAvailable) { + throw new NetSchedulerError(`Proxy "${limiter}" is rate limited`, "PROXY_RATE_LIMITED"); + } + } + + const result = await this.makeRequest(url, proxy, method); + await this.triggerLimiter(task, proxyName); + return result; + } + + private async makeRequest(url: string, proxy: Proxy, method: string): Promise { + switch (proxy.type) { + case "native": + return await this.nativeRequest(url, method); + case "alicloud-fc": + return await this.alicloudFcRequest(url, proxy.data); + default: + throw new NetSchedulerError(`Proxy type ${proxy.type} not supported`, "NOT_IMPLEMENTED"); + } + } + + private async getProxyAvailability(proxyName: string, taskName: string): Promise { + try { + const task = this.tasks[taskName]; + const provider = task.provider; + const proxyLimiterId = "proxy-" + proxyName + "-" + task; + const providerLimiterId = "provider-" + proxyName + "-" + provider; + if (!this.proxyLimiters[proxyLimiterId]) { + const providerLimiter = this.providerLimiters[providerLimiterId]; + const providerAvailable = await providerLimiter.getAvailability(); + return providerAvailable; + } + const proxyLimiter = this.proxyLimiters[proxyLimiterId]; + const providerLimiter = this.providerLimiters[providerLimiterId]; + const providerAvailable = await providerLimiter.getAvailability(); + const proxyAvailable = await proxyLimiter.getAvailability(); + return providerAvailable && proxyAvailable; + } catch (e) { + const error = e as Error; + if (e instanceof Redis.ReplyError) { + logger.error(error, "redis"); + return false; + } + logger.error(error, "mq", "getProxyAvailability"); + return false; + } + } + + private async nativeRequest(url: string, method: string): Promise { + try { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 10 * SECOND); + + const response = await fetch(url, { + method, + signal: controller.signal, + }); + + clearTimeout(timeout); + + return await response.json() as R; + } catch (e) { + throw new NetSchedulerError("Fetch error", "FETCH_ERROR", e); + } + } + + private async alicloudFcRequest(url: string, region: string): Promise { + try { + const decoder = new TextDecoder(); + const output = await new Deno.Command("aliyun", { + args: [ + "fc", + "POST", + `/2023-03-30/functions/proxy-${region}/invocations`, + "--qualifier", + "LATEST", + "--header", + "Content-Type=application/json;x-fc-invocation-type=Sync;x-fc-log-type=None;", + "--body", + JSON.stringify({ url: url }), + "--retry-count", + "5", + "--read-timeout", + "30", + "--connect-timeout", + "10", + "--profile", + `CVSA-${region}`, + ], + }).output(); + const out = decoder.decode(output.stdout); + 
const rawData = JSON.parse(out); + if (rawData.statusCode !== 200) { + throw new NetSchedulerError( + `Error proxying ${url} to ali-fc region ${region}, code: ${rawData.statusCode}.`, + "ALICLOUD_PROXY_ERR", + ); + } else { + return JSON.parse(JSON.parse(rawData.body)) as R; + } + } catch (e) { + logger.error(e as Error, "net", "fn:alicloudFcRequest"); + throw new NetSchedulerError(`Unhandled error: Cannot proxy ${url} to ali-fc.`, "ALICLOUD_PROXY_ERR", e); + } + } +} + +const netScheduler = new NetScheduler(); +const videoInfoRateLimiterConfig: RateLimiterConfig[] = [ + { + window: new SlidingWindow(redis, 0.3), + max: 1, + }, + { + window: new SlidingWindow(redis, 3), + max: 5, + }, + { + window: new SlidingWindow(redis, 30), + max: 30, + }, + { + window: new SlidingWindow(redis, 2 * 60), + max: 50, + }, +]; +const biliLimiterConfig: RateLimiterConfig[] = [ + { + window: new SlidingWindow(redis, 1), + max: 6, + }, + { + window: new SlidingWindow(redis, 5), + max: 20, + }, + { + window: new SlidingWindow(redis, 30), + max: 100, + }, + { + window: new SlidingWindow(redis, 5 * 60), + max: 200, + }, +]; + +const bili_test = [...biliLimiterConfig]; +bili_test[0].max = 10; +bili_test[1].max = 36; +bili_test[2].max = 150; +bili_test[3].max = 1000; + +const bili_strict = [...biliLimiterConfig]; +bili_strict[0].max = 1; +bili_strict[1].max = 4; +bili_strict[2].max = 12; +bili_strict[3].max = 100; + +/* +Execution order for setup: + +1. addProxy(proxyName, type, data): + - Must be called first. Registers proxies in the system, making them available for tasks. + - Define all proxies before proceeding to define tasks or set up limiters. +2. addTask(taskName, provider, proxies): + - Call after addProxy. Defines tasks and associates them with providers and proxies. + - Relies on proxies being already added. + - Must be called before setting task-specific or provider-specific limiters. +3. setTaskLimiter(taskName, config): + - Call after addProxy and addTask. Configures rate limiters specifically for tasks and their associated proxies. + - Depends on tasks and proxies being defined to apply limiters correctly. +4. setProviderLimiter(providerName, config): + - Call after addProxy and addTask. + - It sets rate limiters at the provider level, affecting all proxies used by tasks of that provider. + - Depends on tasks and proxies being defined to identify which proxies to apply provider-level limiters to. + +In summary: addProxy -> addTask -> (setTaskLimiter and/or setProviderLimiter). +The order of setTaskLimiter and setProviderLimiter relative to each other is flexible, +but both should come after addProxy and addTask to ensure proper setup and dependencies are met. 
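+
+For illustration only, a minimal sketch of that order ("proxy-example", "exampleTask" and
+"exampleProvider" are hypothetical names, not ones registered below):
+
+  netScheduler.addProxy("proxy-example", "native", "");
+  netScheduler.addTask("exampleTask", "exampleProvider", ["proxy-example"]);
+  netScheduler.setTaskLimiter("exampleTask", [{ window: new SlidingWindow(redis, 1), max: 2 }]);
+  netScheduler.setProviderLimiter("exampleProvider", [{ window: new SlidingWindow(redis, 1), max: 5 }]);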
+*/ + +const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"]; +netScheduler.addProxy("native", "native", ""); +for (const region of regions) { + netScheduler.addProxy(`alicloud-${region}`, "alicloud-fc", region); +} +netScheduler.addTask("getVideoInfo", "bilibili", "all"); +netScheduler.addTask("getLatestVideos", "bilibili", "all"); +netScheduler.addTask("snapshotMilestoneVideo", "bilibili", regions.map((region) => `alicloud-${region}`)); +netScheduler.addTask("snapshotVideo", "bili_test", [ + "alicloud-qingdao", + "alicloud-shanghai", + "alicloud-zhangjiakou", + "alicloud-chengdu", + "alicloud-shenzhen", + "alicloud-hohhot", +]); +netScheduler.addTask("bulkSnapshot", "bili_strict", [ + "alicloud-qingdao", + "alicloud-shanghai", + "alicloud-zhangjiakou", + "alicloud-chengdu", + "alicloud-shenzhen", + "alicloud-hohhot", +]); +netScheduler.setTaskLimiter("getVideoInfo", videoInfoRateLimiterConfig); +netScheduler.setTaskLimiter("getLatestVideos", null); +netScheduler.setTaskLimiter("snapshotMilestoneVideo", null); +netScheduler.setTaskLimiter("snapshotVideo", null); +netScheduler.setTaskLimiter("bulkSnapshot", null); +netScheduler.setProviderLimiter("bilibili", biliLimiterConfig); +netScheduler.setProviderLimiter("bili_test", bili_test); +netScheduler.setProviderLimiter("bili_strict", bili_strict); + +export default netScheduler; diff --git a/packages/crawler/mq/schema.ts b/packages/crawler/mq/schema.ts new file mode 100644 index 0000000..07e4033 --- /dev/null +++ b/packages/crawler/mq/schema.ts @@ -0,0 +1,12 @@ +export class WorkerError extends Error { + public service?: string; + public codePath?: string; + public rawError: Error; + constructor(rawError: Error, service?: string, codePath?: string) { + super(rawError.message); + this.name = "WorkerFailure"; + this.codePath = codePath; + this.service = service; + this.rawError = rawError; + } +} diff --git a/lib/mq/slidingWindow.ts b/packages/crawler/mq/slidingWindow.ts similarity index 99% rename from lib/mq/slidingWindow.ts rename to packages/crawler/mq/slidingWindow.ts index 049a9f0..499528f 100644 --- a/lib/mq/slidingWindow.ts +++ b/packages/crawler/mq/slidingWindow.ts @@ -21,7 +21,7 @@ export class SlidingWindow { async event(eventName: string): Promise { const now = Date.now(); const key = `cvsa:sliding_window:${eventName}`; - + const uniqueMember = `${now}-${Math.random()}`; // Add current timestamp to an ordered set await this.redis.zadd(key, now, uniqueMember); diff --git a/packages/crawler/mq/task/collectSongs.ts b/packages/crawler/mq/task/collectSongs.ts new file mode 100644 index 0000000..389ca06 --- /dev/null +++ b/packages/crawler/mq/task/collectSongs.ts @@ -0,0 +1,31 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { aidExistsInSongs, getNotCollectedSongs } from "db/songs.ts"; +import logger from "log/logger.ts"; +import { scheduleSnapshot } from "db/snapshotSchedule.ts"; +import { MINUTE } from "$std/datetime/constants.ts"; + +export async function collectSongs(client: Client) { + const aids = await getNotCollectedSongs(client); + for (const aid of aids) { + const exists = await aidExistsInSongs(client, aid); + if (exists) continue; + await insertIntoSongs(client, aid); + await scheduleSnapshot(client, aid, "new", Date.now() + 10 * MINUTE, true); + logger.log(`Video ${aid} was added into the songs table.`, "mq", "fn:collectSongs"); + } +} + +export async function insertIntoSongs(client: Client, aid: number) { + await client.queryObject( + 
` + INSERT INTO songs (aid, published_at, duration) + VALUES ( + $1, + (SELECT published_at FROM bilibili_metadata WHERE aid = $1), + (SELECT duration FROM bilibili_metadata WHERE aid = $1) + ) + ON CONFLICT DO NOTHING + `, + [aid], + ); +} diff --git a/packages/crawler/mq/task/getVideoDetails.ts b/packages/crawler/mq/task/getVideoDetails.ts new file mode 100644 index 0000000..ee015fd --- /dev/null +++ b/packages/crawler/mq/task/getVideoDetails.ts @@ -0,0 +1,47 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { getVideoDetails } from "net/getVideoDetails.ts"; +import { formatTimestampToPsql } from "utils/formatTimestampToPostgre.ts"; +import logger from "log/logger.ts"; +import { ClassifyVideoQueue } from "mq/index.ts"; +import { userExistsInBiliUsers, videoExistsInAllData } from "db/allData.ts"; +import { HOUR, SECOND } from "$std/datetime/constants.ts"; + +export async function insertVideoInfo(client: Client, aid: number) { + const videoExists = await videoExistsInAllData(client, aid); + if (videoExists) { + return; + } + const data = await getVideoDetails(aid); + if (data === null) { + return null; + } + const bvid = data.View.bvid; + const desc = data.View.desc; + const uid = data.View.owner.mid; + const tags = data.Tags + .filter((tag) => !["old_channel", "topic"].indexOf(tag.tag_type)) + .map((tag) => tag.tag_name).join(","); + const title = data.View.title; + const published_at = formatTimestampToPsql(data.View.pubdate * SECOND + 8 * HOUR); + const duration = data.View.duration; + const cover = data.View.pic; + await client.queryObject( + `INSERT INTO bilibili_metadata (aid, bvid, description, uid, tags, title, published_at, duration, cover_url) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)`, + [aid, bvid, desc, uid, tags, title, published_at, duration, cover], + ); + const userExists = await userExistsInBiliUsers(client, aid); + if (!userExists) { + await client.queryObject( + `INSERT INTO bilibili_user (uid, username, "desc", fans) VALUES ($1, $2, $3, $4)`, + [uid, data.View.owner.name, data.Card.card.sign, data.Card.follower], + ); + } else { + await client.queryObject( + `UPDATE bilibili_user SET fans = $1 WHERE uid = $2`, + [data.Card.follower, uid], + ); + } + logger.log(`Inserted video metadata for aid: ${aid}`, "mq"); + await ClassifyVideoQueue.add("classifyVideo", { aid }); +} diff --git a/packages/crawler/mq/task/getVideoStats.ts b/packages/crawler/mq/task/getVideoStats.ts new file mode 100644 index 0000000..34b6c42 --- /dev/null +++ b/packages/crawler/mq/task/getVideoStats.ts @@ -0,0 +1,58 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { getVideoInfo } from "net/getVideoInfo.ts"; +import { LatestSnapshotType } from "db/schema.d.ts"; +import logger from "log/logger.ts"; + +/* + * Fetch video stats from bilibili API and insert into database + * @returns {Promise} + * A number indicating the status code when receiving non-0 status code from bilibili, + * otherwise an VideoSnapshot object containing the video stats + * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + */ +export async function insertVideoSnapshot( + client: Client, + aid: number, + task: string, +): Promise { + const data = await getVideoInfo(aid, task); + if (typeof data == "number") { + 
return data; + } + const time = new Date().getTime(); + const views = data.stat.view; + const danmakus = data.stat.danmaku; + const replies = data.stat.reply; + const likes = data.stat.like; + const coins = data.stat.coin; + const shares = data.stat.share; + const favorites = data.stat.favorite; + + const query: string = ` + INSERT INTO video_snapshot (aid, views, danmakus, replies, likes, coins, shares, favorites) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8) + `; + await client.queryObject( + query, + [aid, views, danmakus, replies, likes, coins, shares, favorites], + ); + + logger.log(`Taken snapshot for video ${aid}.`, "net", "fn:insertVideoSnapshot"); + + const snapshot: LatestSnapshotType = { + aid, + views, + danmakus, + replies, + likes, + coins, + shares, + favorites, + time, + }; + + return snapshot; +} diff --git a/packages/crawler/mq/task/queueLatestVideo.ts b/packages/crawler/mq/task/queueLatestVideo.ts new file mode 100644 index 0000000..d8b3993 --- /dev/null +++ b/packages/crawler/mq/task/queueLatestVideo.ts @@ -0,0 +1,56 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { getLatestVideoAids } from "net/getLatestVideoAids.ts"; +import { videoExistsInAllData } from "db/allData.ts"; +import { sleep } from "utils/sleep.ts"; +import { SECOND } from "$std/datetime/constants.ts"; +import logger from "log/logger.ts"; +import { LatestVideosQueue } from "mq/index.ts"; + +export async function queueLatestVideos( + client: Client, +): Promise { + let page = 1; + let i = 0; + const videosFound = new Set(); + while (true) { + const pageSize = page == 1 ? 10 : 30; + const aids = await getLatestVideoAids(page, pageSize); + if (aids.length == 0) { + logger.verbose("No more videos found", "net", "fn:insertLatestVideos()"); + break; + } + let allExists = true; + let delay = 0; + for (const aid of aids) { + const videoExists = await videoExistsInAllData(client, aid); + if (videoExists) { + continue; + } + await LatestVideosQueue.add("getVideoInfo", { aid }, { + delay, + attempts: 100, + backoff: { + type: "fixed", + delay: SECOND * 5, + }, + }); + videosFound.add(aid); + allExists = false; + delay += Math.random() * SECOND * 1.5; + } + i += aids.length; + logger.log( + `Page ${page} crawled, total: ${videosFound.size}/${i} videos added/observed.`, + "net", + "fn:queueLatestVideos()", + ); + if (allExists) { + return 0; + } + page++; + const randomTime = Math.random() * 4000; + const delta = SECOND; + await sleep(randomTime + delta); + } + return 0; +} diff --git a/packages/crawler/net/bilibili.d.ts b/packages/crawler/net/bilibili.d.ts new file mode 100644 index 0000000..4327f51 --- /dev/null +++ b/packages/crawler/net/bilibili.d.ts @@ -0,0 +1,263 @@ +interface BaseResponse { + code: number; + message: string; + ttl: number; + data: T; +} + +export type VideoListResponse = BaseResponse; +export type VideoDetailsResponse = BaseResponse; +export type VideoTagsResponse = BaseResponse; +export type VideoInfoResponse = BaseResponse; +export type MediaListInfoResponse = BaseResponse; + +export type MediaListInfoData = MediaListInfoItem[]; + +export interface MediaListInfoItem { + attr: number; + bvid: string; + id: number; + cnt_info: { + coin: number; + collect: number; + danmaku: number; + play: number; + reply: number; + share: number; + thumb_up: number; + }; +} + +interface VideoInfoData { + bvid: string; + aid: number; + copyright: number; + pic: string; + title: string; + pubdate: number; + ctime: number; + desc: string; + desc_v2: string; + state: number; + 
duration: number; + owner: { + mid: number; + name: string; + face: string; + }; + stat: VideoStats; +} + +interface VideoDetailsData { + View: { + bvid: string; + aid: number; + videos: number; + tid: number; + tid_v2: number; + tname: string; + tname_v2: string; + copyright: number; + pic: string; + title: string; + pubdate: number; + ctime: number; + desc: string; + desc_v2: string; + state: number; + duration: number; + mission_id: number; + rights: VideoRights; + owner: { + mid: number; + name: string; + face: string; + }; + stat: VideoStats; + argue_info: { + argue_msg: string; + argue_type: number; + argue_link: string; + }; + dynamic: ""; + cid: number; + dimension: VideoDimension; + pages: VideoPage[]; + subtitle: { + allow_submit: number; + list: VideoSubTitle[]; + }; + staff: VideoStaff[]; + }; + Card: { + card: { + mid: number; + name: string; + sex: string; + face: string; + fans: number; + attention: number; + friend: number; + sign: string; + level_info: { + current_level: number; + }; + }; + archive_count: number; + article_count: number; + follower: number; + like_num: number; + }; + Tags: VideoTagsLite[]; +} + +interface VideoTagsLite { + tag_id: number; + tag_name: string; + music_id: string; + tag_type: string; + jump_url: string; +} + +type VideoTagsData = VideoTags[]; + +type VideoStaff = { + mid: number; + title: string; + name: string; + face: string; + follower: number; +}; + +type VideoSubTitle = { + id: number; + lan: string; + lan_doc: string; + is_lock: number; + subtitle_url: string; + type: number; + id_str: string; + ai_type: number; + ai_status: number; +}; + +type VideoDimension = { + width: number; + height: number; + rotate: number; +}; + +interface VideoPage { + cid: number; + page: number; + from: string; + part: string; + duration: number; + vid: string; + weblink: string; + dimension: VideoDimension; + first_frame: string; +} + +interface VideoTags { + tag_id: number; + tag_name: string; + cover: string; + head_cover: string; + content: string; + short_content: string; + type: number; + state: number; + ctime: number; + count: { + view: number; + use: number; + atten: number; + }; + is_atten: number; + likes: number; + hates: number; + attribute: number; + liked: number; + hated: number; + extra_attr: number; +} + +interface VideoListData { + archives: VideoListVideo[]; + page: { + num: number; + size: number; + count: number; + }; +} + +type VideoRights = { + bp: number; + elec: number; + download: number; + movie: number; + pay: number; + hd5: number; + no_reprint: number; + autoplay: number; + ugc_pay: number; + is_cooperation: number; + ugc_pay_preview: number; + no_background: number; + arc_pay: number; + pay_free_watch: number; +}; + +type VideoStats = { + aid: number; + view: number; + danmaku: number; + reply: number; + favorite: number; + coin: number; + share: number; + now_rank: number; + his_rank: number; + like: number; +}; + +interface VideoListVideo { + aid: number; + videos: number; + tid: number; + tname: string; + copyright: number; + pic: string; + title: string; + pubdate: number; + ctime: number; + desc: string; + state: number; + duration: number; + mission_id?: number; + rights: VideoRights; + owner: { + mid: number; + name: string; + face: string; + }; + stat: VideoStats; + dynamic: string; + cid: number; + dimension: VideoDimension; + season_id?: number; + short_link_v2: string; + first_frame: string; + pub_location: string; + cover43: string; + tidv2: number; + tname_v2: string; + bvid: string; + season_type: number; + 
is_ogv: number; + ovg_info: string | null; + rcmd_season: string; + enable_vt: number; + ai_rcmd: null | string; +} diff --git a/packages/crawler/net/bulkGetVideoStats.ts b/packages/crawler/net/bulkGetVideoStats.ts new file mode 100644 index 0000000..3ed518c --- /dev/null +++ b/packages/crawler/net/bulkGetVideoStats.ts @@ -0,0 +1,27 @@ +import netScheduler from "mq/scheduler.ts"; +import { MediaListInfoData, MediaListInfoResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; + +/* + * Bulk fetch video metadata from bilibili API + * @param {number[]} aids - The aid list to fetch + * @returns {Promise} MediaListInfoData or the error code returned by bilibili API + * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + */ +export async function bulkGetVideoStats(aids: number[]): Promise { + const baseURL = `https://api.bilibili.com/medialist/gateway/base/resource/infos?resources=`; + let url = baseURL; + for (const aid of aids) { + url += `${aid}:2,`; + } + const data = await netScheduler.request(url, "bulkSnapshot"); + const errMessage = `Error fetching metadata for aid list: ${aids.join(",")}:`; + if (data.code !== 0) { + logger.error(errMessage + data.code + "-" + data.message, "net", "fn:getVideoInfo"); + return data.code; + } + return data.data; +} diff --git a/packages/crawler/net/getLatestVideoAids.ts b/packages/crawler/net/getLatestVideoAids.ts new file mode 100644 index 0000000..7dacd46 --- /dev/null +++ b/packages/crawler/net/getLatestVideoAids.ts @@ -0,0 +1,21 @@ +import { VideoListResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; +import netScheduler from "mq/scheduler.ts"; + +export async function getLatestVideoAids(page: number = 1, pageSize: number = 10): Promise { + const startFrom = 1 + pageSize * (page - 1); + const endTo = pageSize * page; + const range = `${startFrom}-${endTo}`; + const errMessage = `Error fetching latest aid for ${range}:`; + const url = `https://api.bilibili.com/x/web-interface/newlist?rid=30&ps=${pageSize}&pn=${page}`; + const data = await netScheduler.request(url, "getLatestVideos"); + if (data.code != 0) { + logger.error(errMessage + data.message, "net", "getLastestVideos"); + return []; + } + if (data.data.archives.length === 0) { + logger.verbose("No more videos found", "net", "getLatestVideos"); + return []; + } + return data.data.archives.map((video) => video.aid); +} diff --git a/packages/crawler/net/getVideoDetails.ts b/packages/crawler/net/getVideoDetails.ts new file mode 100644 index 0000000..d6d52c1 --- /dev/null +++ b/packages/crawler/net/getVideoDetails.ts @@ -0,0 +1,14 @@ +import netScheduler from "mq/scheduler.ts"; +import { VideoDetailsData, VideoDetailsResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; + +export async function getVideoDetails(aid: number): Promise { + const url = `https://api.bilibili.com/x/web-interface/view/detail?aid=${aid}`; + const data = await netScheduler.request(url, "getVideoInfo"); + const errMessage = `Error fetching metadata for ${aid}:`; + if (data.code !== 0) { + logger.error(errMessage + data.code + "-" + data.message, "net", "fn:getVideoInfo"); + return null; + } + return data.data; +} diff --git a/packages/crawler/net/getVideoInfo.ts b/packages/crawler/net/getVideoInfo.ts new file mode 
100644 index 0000000..0533c53 --- /dev/null +++ b/packages/crawler/net/getVideoInfo.ts @@ -0,0 +1,27 @@ +import netScheduler from "mq/scheduler.ts"; +import { VideoInfoData, VideoInfoResponse } from "net/bilibili.d.ts"; +import logger from "log/logger.ts"; + +/* + * Fetch video metadata from bilibili API + * @param {number} aid - The video's aid + * @param {string} task - The task name used in scheduler. It can be one of the following: + * - snapshotVideo + * - getVideoInfo + * - snapshotMilestoneVideo + * @returns {Promise} VideoInfoData or the error code returned by bilibili API + * @throws {NetSchedulerError} - The error will be thrown in following cases: + * - No proxy is available currently: with error code `NO_PROXY_AVAILABLE` + * - The native `fetch` function threw an error: with error code `FETCH_ERROR` + * - The alicloud-fc threw an error: with error code `ALICLOUD_FC_ERROR` + */ +export async function getVideoInfo(aid: number, task: string): Promise { + const url = `https://api.bilibili.com/x/web-interface/view?aid=${aid}`; + const data = await netScheduler.request(url, task); + const errMessage = `Error fetching metadata for ${aid}:`; + if (data.code !== 0) { + logger.error(errMessage + data.code + "-" + data.message, "net", "fn:getVideoInfo"); + return data.code; + } + return data.data; +} diff --git a/src/bullui.ts b/packages/crawler/src/bullui.ts similarity index 73% rename from src/bullui.ts rename to packages/crawler/src/bullui.ts index 9aab14b..5765540 100644 --- a/src/bullui.ts +++ b/packages/crawler/src/bullui.ts @@ -2,13 +2,17 @@ import express from "express"; import { createBullBoard } from "@bull-board/api"; import { BullMQAdapter } from "@bull-board/api/bullMQAdapter.js"; import { ExpressAdapter } from "@bull-board/express"; -import { LatestVideosQueue, VideoTagsQueue } from "lib/mq/index.ts"; +import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "mq/index.ts"; const serverAdapter = new ExpressAdapter(); serverAdapter.setBasePath("/"); createBullBoard({ - queues: [new BullMQAdapter(LatestVideosQueue), new BullMQAdapter(VideoTagsQueue)], + queues: [ + new BullMQAdapter(LatestVideosQueue), + new BullMQAdapter(ClassifyVideoQueue), + new BullMQAdapter(SnapshotQueue), + ], serverAdapter: serverAdapter, }); @@ -16,8 +20,6 @@ const app = express(); app.use("/", serverAdapter.getRouter()); -// other configurations of your server - app.listen(3000, () => { console.log("Running on 3000..."); console.log("For the UI, open http://localhost:3000/"); diff --git a/packages/crawler/src/filterWorker.ts b/packages/crawler/src/filterWorker.ts new file mode 100644 index 0000000..cb336c4 --- /dev/null +++ b/packages/crawler/src/filterWorker.ts @@ -0,0 +1,49 @@ +import { ConnectionOptions, Job, Worker } from "bullmq"; +import { redis } from "db/redis.ts"; +import logger from "log/logger.ts"; +import { classifyVideosWorker, classifyVideoWorker } from "mq/exec/classifyVideo.ts"; +import { WorkerError } from "mq/schema.ts"; +import { lockManager } from "mq/lockManager.ts"; +import Akari from "ml/akari.ts"; + +Deno.addSignalListener("SIGINT", async () => { + logger.log("SIGINT Received: Shutting down workers...", "mq"); + await filterWorker.close(true); + Deno.exit(); +}); + +Deno.addSignalListener("SIGTERM", async () => { + logger.log("SIGTERM Received: Shutting down workers...", "mq"); + await filterWorker.close(true); + Deno.exit(); +}); + +Akari.init(); + +const filterWorker = new Worker( + "classifyVideo", + async (job: Job) => { + switch (job.name) { + case 
"classifyVideo": + return await classifyVideoWorker(job); + case "classifyVideos": + return await classifyVideosWorker(); + default: + break; + } + }, + { connection: redis as ConnectionOptions, concurrency: 2, removeOnComplete: { count: 1000 } }, +); + +filterWorker.on("active", () => { + logger.log("Worker (filter) activated.", "mq"); +}); + +filterWorker.on("error", (err) => { + const e = err as WorkerError; + logger.error(e.rawError, e.service, e.codePath); +}); + +filterWorker.on("closed", async () => { + await lockManager.releaseLock("classifyVideos"); +}); diff --git a/packages/crawler/src/jobAdder.ts b/packages/crawler/src/jobAdder.ts new file mode 100644 index 0000000..3aefd24 --- /dev/null +++ b/packages/crawler/src/jobAdder.ts @@ -0,0 +1,3 @@ +import { initMQ } from "mq/init.ts"; + +await initMQ(); diff --git a/packages/crawler/src/worker.ts b/packages/crawler/src/worker.ts new file mode 100644 index 0000000..6507d63 --- /dev/null +++ b/packages/crawler/src/worker.ts @@ -0,0 +1,109 @@ +import { ConnectionOptions, Job, Worker } from "bullmq"; +import { collectSongsWorker, getLatestVideosWorker } from "mq/executors.ts"; +import { redis } from "db/redis.ts"; +import logger from "log/logger.ts"; +import { lockManager } from "mq/lockManager.ts"; +import { WorkerError } from "mq/schema.ts"; +import { getVideoInfoWorker } from "mq/exec/getLatestVideos.ts"; +import { + bulkSnapshotTickWorker, + collectMilestoneSnapshotsWorker, + regularSnapshotsWorker, + scheduleCleanupWorker, + snapshotTickWorker, + takeBulkSnapshotForVideosWorker, + takeSnapshotForVideoWorker, +} from "mq/exec/snapshotTick.ts"; + +Deno.addSignalListener("SIGINT", async () => { + logger.log("SIGINT Received: Shutting down workers...", "mq"); + await latestVideoWorker.close(true); + await snapshotWorker.close(true); + Deno.exit(); +}); + +Deno.addSignalListener("SIGTERM", async () => { + logger.log("SIGTERM Received: Shutting down workers...", "mq"); + await latestVideoWorker.close(true); + await snapshotWorker.close(true); + Deno.exit(); +}); + +const latestVideoWorker = new Worker( + "latestVideos", + async (job: Job) => { + switch (job.name) { + case "getLatestVideos": + await getLatestVideosWorker(job); + break; + case "getVideoInfo": + await getVideoInfoWorker(job); + break; + case "collectSongs": + await collectSongsWorker(job); + break; + default: + break; + } + }, + { + connection: redis as ConnectionOptions, + concurrency: 6, + removeOnComplete: { count: 1440 }, + removeOnFail: { count: 0 }, + }, +); + +latestVideoWorker.on("active", () => { + logger.log("Worker (latestVideos) activated.", "mq"); +}); + +latestVideoWorker.on("error", (err) => { + const e = err as WorkerError; + logger.error(e.rawError, e.service, e.codePath); +}); + +latestVideoWorker.on("closed", async () => { + await lockManager.releaseLock("getLatestVideos"); +}); + +const snapshotWorker = new Worker( + "snapshot", + async (job: Job) => { + switch (job.name) { + case "snapshotVideo": + await takeSnapshotForVideoWorker(job); + break; + case "snapshotTick": + await snapshotTickWorker(job); + break; + case "collectMilestoneSnapshots": + await collectMilestoneSnapshotsWorker(job); + break; + case "dispatchRegularSnapshots": + await regularSnapshotsWorker(job); + break; + case "scheduleCleanup": + await scheduleCleanupWorker(job); + break; + case "bulkSnapshotVideo": + await takeBulkSnapshotForVideosWorker(job); + break; + case "bulkSnapshotTick": + await bulkSnapshotTickWorker(job); + break; + default: + break; + } + }, + { connection: redis as 
ConnectionOptions, concurrency: 50, removeOnComplete: { count: 2000 } }, +); + +snapshotWorker.on("error", (err) => { + const e = err as WorkerError; + logger.error(e.rawError, e.service, e.codePath); +}); + +snapshotWorker.on("closed", async () => { + await lockManager.releaseLock("dispatchRegularSnapshots"); +}); diff --git a/packages/crawler/utils/formatSeconds.ts b/packages/crawler/utils/formatSeconds.ts new file mode 100644 index 0000000..491dfd6 --- /dev/null +++ b/packages/crawler/utils/formatSeconds.ts @@ -0,0 +1,9 @@ +export const formatSeconds = (seconds: number) => { + if (seconds < 60) { + return `${seconds.toFixed(1)}s`; + } + if (seconds < 3600) { + return `${Math.floor(seconds / 60)}m${(seconds % 60).toFixed(1)}s`; + } + return `${Math.floor(seconds / 3600)}h ${((seconds % 3600) / 60).toFixed(2)}m`; +}; diff --git a/lib/utils/formatTimestampToPostgre.ts b/packages/crawler/utils/formatTimestampToPostgre.ts similarity index 100% rename from lib/utils/formatTimestampToPostgre.ts rename to packages/crawler/utils/formatTimestampToPostgre.ts diff --git a/lib/utils/sleep.ts b/packages/crawler/utils/sleep.ts similarity index 98% rename from lib/utils/sleep.ts rename to packages/crawler/utils/sleep.ts index 3a5dcb9..63e382d 100644 --- a/lib/utils/sleep.ts +++ b/packages/crawler/utils/sleep.ts @@ -1,3 +1,3 @@ export async function sleep(ms: number) { await new Promise((resolve) => setTimeout(resolve, ms)); -} \ No newline at end of file +} diff --git a/lib/utils/truncate.ts b/packages/crawler/utils/truncate.ts similarity index 58% rename from lib/utils/truncate.ts rename to packages/crawler/utils/truncate.ts index 677978d..3d5800d 100644 --- a/lib/utils/truncate.ts +++ b/packages/crawler/utils/truncate.ts @@ -1,3 +1,3 @@ export function truncate(num: number, min: number, max: number) { - return Math.max(min, Math.min(num, max)) -} \ No newline at end of file + return Math.max(min, Math.min(num, max)); +} diff --git a/packages/frontend/.gitignore b/packages/frontend/.gitignore new file mode 100644 index 0000000..016b59e --- /dev/null +++ b/packages/frontend/.gitignore @@ -0,0 +1,24 @@ +# build output +dist/ + +# generated types +.astro/ + +# dependencies +node_modules/ + +# logs +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* + +# environment variables +.env +.env.production + +# macOS-specific files +.DS_Store + +# jetbrains setting folder +.idea/ diff --git a/packages/frontend/astro.config.mjs b/packages/frontend/astro.config.mjs new file mode 100644 index 0000000..79ca8e0 --- /dev/null +++ b/packages/frontend/astro.config.mjs @@ -0,0 +1,24 @@ +// @ts-check +import { defineConfig } from "astro/config"; +import tailwind from "@astrojs/tailwind"; + +// https://astro.build/config +import tsconfigPaths from "vite-tsconfig-paths"; +import node from "@astrojs/node"; +import svelte from "@astrojs/svelte"; + +export default defineConfig({ + output: "server", + adapter: node({ + mode: "standalone", + }), + integrations: [tailwind(), svelte()], + vite: { + server: { + fs: { + allow: [".", "../../"], + }, + }, + plugins: [tsconfigPaths()], + }, +}); diff --git a/packages/frontend/deno.json b/packages/frontend/deno.json new file mode 100644 index 0000000..675c374 --- /dev/null +++ b/packages/frontend/deno.json @@ -0,0 +1,8 @@ +{ + "name": "@cvsa/frontend", + "tasks": { + "preview": "bun run astro preview", + "build": "bun run astro build" + }, + "exports": "./main.ts" +} diff --git a/packages/frontend/main.ts b/packages/frontend/main.ts new file mode 100644 index 
0000000..3f50c75 --- /dev/null +++ b/packages/frontend/main.ts @@ -0,0 +1 @@ +export const VERSION = "1.2.6"; diff --git a/packages/frontend/package.json b/packages/frontend/package.json new file mode 100644 index 0000000..d94c869 --- /dev/null +++ b/packages/frontend/package.json @@ -0,0 +1,23 @@ +{ + "name": "frontend", + "type": "module", + "version": "0.0.1", + "scripts": { + "dev": "astro dev", + "build": "astro build", + "preview": "astro preview", + "astro": "astro" + }, + "dependencies": { + "@astrojs/tailwind": "^6.0.2", + "astro": "^5.5.5", + "autoprefixer": "^10.4.21", + "pg": "^8.11.11", + "postcss": "^8.5.3", + "tailwindcss": "^3.0.24", + "vite-tsconfig-paths": "^5.1.4" + }, + "devDependencies": { + "@types/pg": "^8.11.11" + } +} diff --git a/packages/frontend/public/favicon.svg b/packages/frontend/public/favicon.svg new file mode 100644 index 0000000..f157bd1 --- /dev/null +++ b/packages/frontend/public/favicon.svg @@ -0,0 +1,9 @@ + + + + diff --git a/packages/frontend/src/assets/TitleBar Mobile Dark.svg b/packages/frontend/src/assets/TitleBar Mobile Dark.svg new file mode 100644 index 0000000..3b1a7e5 --- /dev/null +++ b/packages/frontend/src/assets/TitleBar Mobile Dark.svg @@ -0,0 +1,7 @@ + + + Created with Pixso. + + + + diff --git a/packages/frontend/src/assets/TitleBar Mobile Light.svg b/packages/frontend/src/assets/TitleBar Mobile Light.svg new file mode 100644 index 0000000..365e6a4 --- /dev/null +++ b/packages/frontend/src/assets/TitleBar Mobile Light.svg @@ -0,0 +1,7 @@ + + + Created with Pixso. + + + + diff --git a/packages/frontend/src/assets/TitleBar-Dark.svg b/packages/frontend/src/assets/TitleBar-Dark.svg new file mode 100644 index 0000000..02f0398 --- /dev/null +++ b/packages/frontend/src/assets/TitleBar-Dark.svg @@ -0,0 +1,15 @@ + + + Created with Pixso. + + + + + + + + + + + + diff --git a/packages/frontend/src/assets/TitleBar-Light.svg b/packages/frontend/src/assets/TitleBar-Light.svg new file mode 100644 index 0000000..92482e0 --- /dev/null +++ b/packages/frontend/src/assets/TitleBar-Light.svg @@ -0,0 +1,15 @@ + + + Created with Pixso. + + + + + + + + + + + + diff --git a/packages/frontend/src/assets/header-logo-dark.svg b/packages/frontend/src/assets/header-logo-dark.svg new file mode 100644 index 0000000..002540f --- /dev/null +++ b/packages/frontend/src/assets/header-logo-dark.svg @@ -0,0 +1,15 @@ + + + Created with Pixso. + + + + + + + + + + + + diff --git a/packages/frontend/src/assets/header-logo-light.svg b/packages/frontend/src/assets/header-logo-light.svg new file mode 100644 index 0000000..acc0c15 --- /dev/null +++ b/packages/frontend/src/assets/header-logo-light.svg @@ -0,0 +1,15 @@ + + + Created with Pixso. + + + + + + + + + + + + diff --git a/packages/frontend/src/assets/标题-浅色.svg b/packages/frontend/src/assets/标题-浅色.svg new file mode 100644 index 0000000..77e3e1e --- /dev/null +++ b/packages/frontend/src/assets/标题-浅色.svg @@ -0,0 +1,15 @@ + + + Created with Pixso. + + + + + + + + + + + + diff --git a/packages/frontend/src/assets/标题-深色.svg b/packages/frontend/src/assets/标题-深色.svg new file mode 100644 index 0000000..a24984d --- /dev/null +++ b/packages/frontend/src/assets/标题-深色.svg @@ -0,0 +1,15 @@ + + + Created with Pixso. + + + + + + + + + + + + diff --git a/packages/frontend/src/components/CloseIcon.svelte b/packages/frontend/src/components/CloseIcon.svelte new file mode 100644 index 0000000..cb98947 --- /dev/null +++ b/packages/frontend/src/components/CloseIcon.svelte @@ -0,0 +1,12 @@ + + +
+ + + +
diff --git a/packages/frontend/src/components/DarkModeImage.svelte b/packages/frontend/src/components/DarkModeImage.svelte new file mode 100644 index 0000000..aa4caeb --- /dev/null +++ b/packages/frontend/src/components/DarkModeImage.svelte @@ -0,0 +1,41 @@ + + + \ No newline at end of file diff --git a/packages/frontend/src/components/MenuIcon.svelte b/packages/frontend/src/components/MenuIcon.svelte new file mode 100644 index 0000000..1c74e68 --- /dev/null +++ b/packages/frontend/src/components/MenuIcon.svelte @@ -0,0 +1,20 @@ + + +
+ + + Created with Pixso. + + + + + + + + + + + +
diff --git a/packages/frontend/src/components/SearchBox.svelte b/packages/frontend/src/components/SearchBox.svelte new file mode 100644 index 0000000..52d8a1a --- /dev/null +++ b/packages/frontend/src/components/SearchBox.svelte @@ -0,0 +1,44 @@ + + + +
+ +
+ + diff --git a/packages/frontend/src/components/SearchIcon.svelte b/packages/frontend/src/components/SearchIcon.svelte new file mode 100644 index 0000000..d23c8be --- /dev/null +++ b/packages/frontend/src/components/SearchIcon.svelte @@ -0,0 +1,12 @@ + + +
+ + + +
\ No newline at end of file diff --git a/packages/frontend/src/components/TitleBar.astro b/packages/frontend/src/components/TitleBar.astro new file mode 100644 index 0000000..aa0963e --- /dev/null +++ b/packages/frontend/src/components/TitleBar.astro @@ -0,0 +1,30 @@ +--- +import astroLogoLight from "@assets/标题-浅色.svg"; +import astroLogoDark from "@assets/标题-深色.svg"; +import DarkModeImage from "@components/DarkModeImage.svelte"; +import SearchBox from "@components/SearchBox.svelte"; +import TitleBarMobile from "@components/TitleBarMobile.svelte"; +--- + + + + diff --git a/packages/frontend/src/components/TitleBarMobile.svelte b/packages/frontend/src/components/TitleBarMobile.svelte new file mode 100644 index 0000000..5a8754b --- /dev/null +++ b/packages/frontend/src/components/TitleBarMobile.svelte @@ -0,0 +1,47 @@ + + +
+ {#if !showSearchBox} + +
+ + + +
+ {/if} + {#if showSearchBox} + + {/if} + +
diff --git a/packages/frontend/src/components/Welcome.astro b/packages/frontend/src/components/Welcome.astro new file mode 100644 index 0000000..04130ae --- /dev/null +++ b/packages/frontend/src/components/Welcome.astro @@ -0,0 +1,10 @@ +--- +import TitleBar from "@components/TitleBar.astro"; +--- + + + +
+

正在施工中……

+

在搜索栏输入BV号或AV号,可以查询目前数据库收集到的信息~

+
diff --git a/packages/frontend/src/content/about.md b/packages/frontend/src/content/about.md new file mode 100644 index 0000000..8c2506a --- /dev/null +++ b/packages/frontend/src/content/about.md @@ -0,0 +1,60 @@ +# 关于「中V档案馆」 + +「中V档案馆」是一个旨在收录与展示「中文歌声合成作品」及有关信息的网站。 + +## 创建背景与关联工作 + +纵观整个互联网,对于「中文歌声合成」或「中文虚拟歌手」(常简称为中V或VC)相关信息进行较为系统、全面地整理收集的主要有以下几个网站: + +- [萌娘百科](https://zh.moegirl.org.cn/): + 收录了大量中V歌曲及歌姬的信息,呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VCPedia](https://vcpedia.cn/): + 由原萌娘百科中文歌声合成编辑团队的部分成员搭建,专属于中文歌声合成相关内容的信息集成站点[^1],呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VocaDB](https://vocadb.net/): 一个围绕 Vocaloid、UTAU 和其他歌声合成器的协作数据库,其中包含艺术家、唱片、PV + 等[^2],其中包含大量中文歌声合成作品。 +- [天钿Daily](https://tdd.bunnyxt.com/):一个VC相关数据交流与分享的网站。致力于VC相关数据交流,定期抓取VC相关数据,选取有意义的纬度展示。[^3] + +上述网站中,或多或少存在一些不足,例如: + +- 萌娘百科、VCPedia受限于传统维基,绝大多数内容依赖人工编辑。 +- VocaDB基于结构化数据库构建,由此可以依赖程序生成一些信息,但**条目收录**仍然完全依赖人工完成。 +- VocaDB主要专注于元数据展示,少有关于歌曲、作者等的描述性的文字,也缺乏描述性的背景信息。 +- 天钿Daily只展示歌曲的统计数据及历史趋势,没有关于歌曲其它信息的收集。 + +因此,**中V档案馆**吸取前人经验,克服上述网站的不足,希望做到: + +- 歌曲收录(指发现歌曲并创建条目)的完全自动化 +- 歌曲元信息提取的高度自动化 +- 歌曲统计数据收集的完全自动化 +- 在程序辅助的同时欢迎并鼓励贡献者参与编辑(主要为描述性内容)或纠错 +- 在适当的许可声明下,引用来自上述源的数据,使内容更加全面、丰富。 + +## 技术架构 + +参见[CVSA文档](https://cvsa.gitbook.io/)。 + +## 开放许可 + +受本文以[CC BY-NC-SA 4.0协议](https://creativecommons.org/licenses/by-nc-sa/4.0/)提供。 + +### 数据库 + +中V档案馆使用[PostgreSQL](https://postgresql.org)作为数据库,我们承诺定期导出数据库转储 (dump) +文件并公开,其内容遵从以下协议或条款: + +- 数据库中的事实性数据,根据适用法律,不构成受版权保护的内容。中V档案馆放弃一切可能的权利([CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/))。 +- 对于数据库中有原创性的内容(如贡献者编辑的描述性内容),如无例外,以[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)提供。 +- 对于引用、摘编或改编自萌娘百科、VCPedia的内容,以与原始协议(CC BY-NC-SA 3.0 + CN)兼容的协议[CC BY-NC-SA 4.0协议](https://creativecommons.org/licenses/by-nc-sa/4.0/)提供,并注明原始协议 。 + > 根据原始协议第四条第2项内容,CC BY-NC-SA 4.0协议为与原始协议具有相同授权要素的后续版本(“可适用的协议”)。 +- 中V档案馆文档使用[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)。 + +### 软件代码 + +用于构建中V档案馆的软件代码在[AGPL 3.0](https://www.gnu.org/licenses/agpl-3.0.html)许可证下公开,参见[LICENSE](./LICENSE) + +[^1]: 引用自[VCPedia](https://vcpedia.cn/%E9%A6%96%E9%A1%B5),于[知识共享 署名-非商业性使用-相同方式共享 3.0中国大陆 (CC BY-NC-SA 3.0 CN) 许可协议](https://creativecommons.org/licenses/by-nc-sa/3.0/cn/)下提供。 + +[^2]: 翻译自[VocaDB](https://vocadb.net/),于[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)下提供。 + +[^3]: 引用自[关于 - 天钿Daily](https://tdd.bunnyxt.com/about) diff --git a/packages/frontend/src/layouts/Layout.astro b/packages/frontend/src/layouts/Layout.astro new file mode 100644 index 0000000..e33cf4b --- /dev/null +++ b/packages/frontend/src/layouts/Layout.astro @@ -0,0 +1,15 @@ +--- +import "../styles/global.css"; +--- + + + + + + + CVSA 前端 + + + + + diff --git a/packages/frontend/src/pages/404.astro b/packages/frontend/src/pages/404.astro new file mode 100644 index 0000000..2410b5c --- /dev/null +++ b/packages/frontend/src/pages/404.astro @@ -0,0 +1,10 @@ +--- +import Layout from '@layouts/Layout.astro'; +--- + + +
+

404

+

咦……页面去哪里了(゚Д゚≡゚д゚)!?

+
+
diff --git a/packages/frontend/src/pages/about.astro b/packages/frontend/src/pages/about.astro new file mode 100644 index 0000000..8f50a8e --- /dev/null +++ b/packages/frontend/src/pages/about.astro @@ -0,0 +1,15 @@ +--- +import TitleBar from "@components/TitleBar.astro"; +import Layout from '@layouts/Layout.astro'; +import {Content as AboutContent} from '../content/about.md'; +import "../styles/content.css"; +--- + + + +
+
+ +
+
+
\ No newline at end of file diff --git a/packages/frontend/src/pages/index.astro b/packages/frontend/src/pages/index.astro new file mode 100644 index 0000000..2dffcbe --- /dev/null +++ b/packages/frontend/src/pages/index.astro @@ -0,0 +1,8 @@ +--- +import Welcome from '@components/Welcome.astro'; +import Layout from '@layouts/Layout.astro'; +--- + + + + diff --git a/packages/frontend/src/pages/song/[id]/info.astro b/packages/frontend/src/pages/song/[id]/info.astro new file mode 100644 index 0000000..f6fd40b --- /dev/null +++ b/packages/frontend/src/pages/song/[id]/info.astro @@ -0,0 +1,200 @@ +--- +import Layout from "@layouts/Layout.astro"; +import TitleBar from "@components/TitleBar.astro"; +import pg from "pg"; +import { format } from 'date-fns'; +import { zhCN } from 'date-fns/locale'; + +const databaseHost = process.env.DB_HOST +const databaseName = process.env.DB_NAME +const databaseUser = process.env.DB_USER +const databasePassword = process.env.DB_PASSWORD +const databasePort = process.env.DB_PORT + +const postgresConfig = { + hostname: databaseHost, + port: parseInt(databasePort!), + database: databaseName, + user: databaseUser, + password: databasePassword, +}; + +// 路由参数 +const { id } = Astro.params; +const { Client } = pg; +const client = new Client(postgresConfig); +await client.connect(); + +// 数据库查询函数 +async function getVideoMetadata(aid: number) { + const res = await client.query("SELECT * FROM bilibili_metadata WHERE aid = $1", [aid]); + if (res.rows.length <= 0) { + return null; + } + const row = res.rows[0]; + if (row) { + return row; + } + return {}; +} + +async function getVideoSnapshots(aid: number) { + const res = await client.query("SELECT * FROM video_snapshot WHERE aid = $1 ORDER BY created_at DESC", [ + aid, + ]); + if (res.rows.length <= 0) { + return null; + } + return res.rows; +} + +async function getAidFromBV(bv: string) { + const res = await client.query("SELECT aid FROM bilibili_metadata WHERE bvid = $1", [bv]); + if (res.rows.length <= 0) { + return null; + } + const row = res.rows[0]; + if (row && row.aid) { + return Number(row.aid); + } + return null; +} + +async function getVideoAid(id: string) { + if (id.startsWith("av")) { + return parseInt(id.slice(2)); + } else if (id.startsWith("BV")) { + return getAidFromBV(id); + } + return parseInt(id); +} + +// 获取数据 +if (!id) { + Astro.response.status = 404; + client.end(); + return new Response(null, { status: 404 }); +} +const aid = await getVideoAid(id); +if (!aid || isNaN(aid)) { + Astro.response.status = 404; + client.end(); + return new Response(null, { status: 404 }); +} +const videoInfo = await getVideoMetadata(aid); +const snapshots = await getVideoSnapshots(aid); +client.end(); + +interface Snapshot { + created_at: Date; + views: number; + danmakus: number; + replies: number; + coins: number; + likes: number; + favorites: number; + shares: number; + id: number; +} +--- + + + +
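The frontmatter above resolves the `[id]` route parameter in three forms — an `av`-prefixed aid, a `BV` id (converted to an aid via the `bilibili_metadata` lookup in `getAidFromBV`), or a bare numeric aid — and the page responds with 404 when resolution fails. A minimal standalone sketch of that resolution behavior (not part of the diff); `lookupAidByBvid` here is a hypothetical stub standing in for the real PostgreSQL query:

// Illustrative sketch only — mirrors the getVideoAid logic in the frontmatter above.
async function lookupAidByBvid(bvid: string): Promise<number | null> {
	void bvid; // placeholder for `SELECT aid FROM bilibili_metadata WHERE bvid = $1`
	return null;
}

async function resolveAid(id: string): Promise<number | null> {
	if (id.startsWith("BV")) return await lookupAidByBvid(id); // BV ids require a metadata lookup
	const numeric = id.startsWith("av") ? id.slice(2) : id; // accept "av170001" or a bare "170001"
	const aid = parseInt(numeric);
	return Number.isNaN(aid) ? null : aid; // the page responds 404 when this is null
}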
+
+

视频信息: av{aid}

+ +
+

基本信息

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ID{videoInfo?.id}
AID{videoInfo?.aid}
BVID{videoInfo?.bvid}
标题{videoInfo?.title}
描述{videoInfo?.description}
UID{videoInfo?.uid}
标签{videoInfo?.tags}
发布时间{videoInfo?.published_at ? format(new Date(videoInfo.published_at), 'yyyy-MM-dd HH:mm:ss', { locale: zhCN }) : '-'}
时长 (秒){videoInfo?.duration}
创建时间{videoInfo?.created_at ? format(new Date(videoInfo.created_at), 'yyyy-MM-dd HH:mm:ss', { locale: zhCN }) : '-'}
封面{videoInfo?.cover_url ? videoInfo.cover_url : '-'}
+
+
+ +
+

历史统计数据

+ {snapshots && snapshots.length > 0 ? ( +
+ + + + + + + + + + + + + + + {snapshots.map((snapshot: Snapshot) => ( + + + + + + + + + + + ))} + +
创建时间观看硬币点赞收藏分享弹幕评论
{format(new Date(snapshot.created_at), 'yyyy-MM-dd HH:mm:ss', { locale: zhCN })}{snapshot.views}{snapshot.coins}{snapshot.likes}{snapshot.favorites}{snapshot.shares}{snapshot.danmakus}{snapshot.replies}
+
+ ) : ( +

暂无历史数据。

+ )} +
+
+
+
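Both tables above render timestamps with date-fns' `format`, the `zhCN` locale, and the `'yyyy-MM-dd HH:mm:ss'` pattern. A small usage sketch of that call shape (the timestamp value is arbitrary, chosen only for illustration):

import { format } from "date-fns";
import { zhCN } from "date-fns/locale";

// Arbitrary example timestamp.
const createdAt = new Date("2025-03-01T08:30:00+08:00");

// Same call shape as the table cells above.
console.log(format(createdAt, "yyyy-MM-dd HH:mm:ss", { locale: zhCN }));
// -> "2025-03-01 08:30:00" when the local time zone is UTC+8 (format uses local time)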
diff --git a/packages/frontend/src/styles/content.css b/packages/frontend/src/styles/content.css new file mode 100644 index 0000000..6d23e15 --- /dev/null +++ b/packages/frontend/src/styles/content.css @@ -0,0 +1,76 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + +.content { + @apply text-gray-800 dark:text-zinc-100; + + h1, + h2, + h3, + h4 { + @apply font-bold text-gray-900 dark:text-white my-4; + } + + h1 { + @apply text-3xl; + } + + h2 { + @apply text-2xl; + } + + h3 { + @apply text-xl; + } + + h4 { + @apply text-lg; + } + + p { + @apply my-4; + } + + a { + @apply text-blue-500 hover:text-blue-700 dark:hover:text-blue-400 underline; + } + + ul, + ol { + @apply list-disc list-inside my-4; + } + + li { + @apply my-2; + } + + blockquote { + @apply border-l-4 border-gray-300 pl-4 italic my-4; + } + + code { + @apply bg-gray-100 text-gray-800 rounded px-1 duration-300; + } + + pre { + @apply bg-gray-100 p-4 rounded overflow-x-auto my-4 duration-300 h-0; + } + + table { + @apply w-full border-collapse my-4; + } + + th, + td { + @apply border border-gray-300 p-2; + } + + th { + @apply bg-gray-200 font-bold; + } + ul li p, + ol li p { + @apply inline; + } +} diff --git a/static/styles.css b/packages/frontend/src/styles/global.css similarity index 100% rename from static/styles.css rename to packages/frontend/src/styles/global.css diff --git a/packages/frontend/tailwind.config.js b/packages/frontend/tailwind.config.js new file mode 100644 index 0000000..b34cf45 --- /dev/null +++ b/packages/frontend/tailwind.config.js @@ -0,0 +1,8 @@ +/** @type {import('tailwindcss').Config} */ +module.exports = { + content: ["./src/**/*.{astro,html,js,jsx,md,mdx,svelte,ts,tsx,vue}"], + theme: { + extend: {}, + }, + plugins: [], +}; diff --git a/packages/frontend/tsconfig.json b/packages/frontend/tsconfig.json new file mode 100644 index 0000000..b8575e7 --- /dev/null +++ b/packages/frontend/tsconfig.json @@ -0,0 +1,16 @@ +{ + "extends": "astro/tsconfigs/strict", + "include": [".astro/types.d.ts", "**/*"], + "exclude": ["dist"], + "compilerOptions": { + "baseUrl": ".", + "paths": { + "@components/*": ["src/components/*"], + "@layouts/*": ["src/layouts/*"], + "@utils/*": ["src/utils/*"], + "@assets/*": ["src/assets/*"], + "@styles": ["src/styles/*"], + "@core/*": ["../core/*"] + } + } +} diff --git a/routes/_404.tsx b/routes/_404.tsx deleted file mode 100644 index 4628eeb..0000000 --- a/routes/_404.tsx +++ /dev/null @@ -1,27 +0,0 @@ -import { Head } from "$fresh/runtime.ts"; - -export default function Error404() { - return ( - <> - - 404 - Page not found - -
-
- the Fresh logo: a sliced lemon dripping with juice -

404 - Page not found

-

- The page you were looking for doesn't exist. -

- Go back home -
-
- - ); -} diff --git a/routes/_app.tsx b/routes/_app.tsx deleted file mode 100644 index a44414e..0000000 --- a/routes/_app.tsx +++ /dev/null @@ -1,16 +0,0 @@ -import { type PageProps } from "$fresh/server.ts"; -export default function App({ Component }: PageProps) { - return ( - - - - - cvsa - - - - - - - ); -} diff --git a/routes/api/joke.ts b/routes/api/joke.ts deleted file mode 100644 index 68b0ebe..0000000 --- a/routes/api/joke.ts +++ /dev/null @@ -1,21 +0,0 @@ -import { FreshContext } from "$fresh/server.ts"; - -// Jokes courtesy of https://punsandoneliners.com/randomness/programmer-jokes/ -const JOKES = [ - "Why do Java developers often wear glasses? They can't C#.", - "A SQL query walks into a bar, goes up to two tables and says “can I join you?”", - "Wasn't hard to crack Forrest Gump's password. 1forrest1.", - "I love pressing the F5 key. It's refreshing.", - "Called IT support and a chap from Australia came to fix my network connection. I asked “Do you come from a LAN down under?”", - "There are 10 types of people in the world. Those who understand binary and those who don't.", - "Why are assembly programmers often wet? They work below C level.", - "My favourite computer based band is the Black IPs.", - "What programme do you use to predict the music tastes of former US presidential candidates? An Al Gore Rhythm.", - "An SEO expert walked into a bar, pub, inn, tavern, hostelry, public house.", -]; - -export const handler = (_req: Request, _ctx: FreshContext): Response => { - const randomIndex = Math.floor(Math.random() * JOKES.length); - const body = JOKES[randomIndex]; - return new Response(body); -}; diff --git a/routes/greet/[name].tsx b/routes/greet/[name].tsx deleted file mode 100644 index a7a5fe1..0000000 --- a/routes/greet/[name].tsx +++ /dev/null @@ -1,5 +0,0 @@ -import { PageProps } from "$fresh/server.ts"; - -export default function Greet(props: PageProps) { - return
Hello {props.params.name}
; -} diff --git a/routes/index.tsx b/routes/index.tsx deleted file mode 100644 index 67a22a7..0000000 --- a/routes/index.tsx +++ /dev/null @@ -1,25 +0,0 @@ -import { useSignal } from "@preact/signals"; -import Counter from "../islands/Counter.tsx"; - -export default function Home() { - const count = useSignal(3); - return ( -
-
- the Fresh logo: a sliced lemon dripping with juice -

Welcome to Fresh

-

- Try updating this message in the - ./routes/index.tsx file, and refresh. -

- -
-
- ); -} diff --git a/src/db/raw/fetchAids.ts b/src/db/raw/fetchAids.ts index 10770d5..01b27d1 100644 --- a/src/db/raw/fetchAids.ts +++ b/src/db/raw/fetchAids.ts @@ -14,14 +14,20 @@ const db = new Database(DATABASE_PATH, { int64: true }); // 设置日志 async function setupLogging() { await ensureDir(LOG_DIR); - const logStream = await Deno.open(LOG_FILE, { write: true, create: true, append: true }); + const logStream = await Deno.open(LOG_FILE, { + write: true, + create: true, + append: true, + }); const redirectConsole = // deno-lint-ignore no-explicit-any (originalConsole: (...args: any[]) => void) => // deno-lint-ignore no-explicit-any (...args: any[]) => { - const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" "); + const message = args.map(( + arg, + ) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" "); originalConsole(message); logStream.write(new TextEncoder().encode(message + "\n")); }; @@ -38,14 +44,17 @@ interface Metadata { // 获取最后一次更新的时间 function getLastUpdate(): Date { - const result = db.prepare("SELECT value FROM metadata WHERE key = 'fetchAid-lastUpdate'").get() as Metadata; + const result = db.prepare( + "SELECT value FROM metadata WHERE key = 'fetchAid-lastUpdate'", + ).get() as Metadata; return result ? new Date(result.value as string) : new Date(0); } // 更新最后更新时间 function updateLastUpdate() { const now = new Date().toISOString(); - db.prepare("UPDATE metadata SET value = ? WHERE key = 'fetchAid-lastUpdate'").run(now); + db.prepare("UPDATE metadata SET value = ? WHERE key = 'fetchAid-lastUpdate'") + .run(now); } // 辅助函数:获取数据 @@ -66,7 +75,9 @@ async function fetchData(pn: number, retries = MAX_RETRIES): Promise { // 插入 aid 到数据库 function insertAid(aid: number) { - db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')").run(aid); + db.prepare( + "INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')", + ).run(aid); } // 主函数 diff --git a/src/db/raw/insertAidsToDB.ts b/src/db/raw/insertAidsToDB.ts index b067d69..b5855c7 100644 --- a/src/db/raw/insertAidsToDB.ts +++ b/src/db/raw/insertAidsToDB.ts @@ -5,7 +5,16 @@ import { ensureDir } from "https://deno.land/std@0.113.0/fs/mod.ts"; const aidPath = "./data/2025010104_c30_aids.txt"; const db = new Database("./data/main.db", { int64: true }); -const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"]; +const regions = [ + "shanghai", + "hangzhou", + "qingdao", + "beijing", + "zhangjiakou", + "chengdu", + "shenzhen", + "hohhot", +]; const logDir = "./logs/bili-info-crawl"; const logFile = path.join(logDir, `run-${Date.now() / 1000}.log`); const shouldReadTextFile = false; @@ -26,14 +35,20 @@ const requestQueue: number[] = []; async function setupLogging() { await ensureDir(logDir); - const logStream = await Deno.open(logFile, { write: true, create: true, append: true }); + const logStream = await Deno.open(logFile, { + write: true, + create: true, + append: true, + }); const redirectConsole = // deno-lint-ignore no-explicit-any (originalConsole: (...args: any[]) => void) => // deno-lint-ignore no-explicit-any (...args: any[]) => { - const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" "); + const message = args.map(( + arg, + ) => (typeof arg === "object" ? 
JSON.stringify(arg) : arg)).join(" "); originalConsole(message); logStream.write(new TextEncoder().encode(message + "\n")); }; @@ -78,7 +93,9 @@ async function readFromText() { const newAids = aids.filter((aid) => !existingAidsSet.has(aid)); // 插入这些新条目 - const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')"); + const insertStmt = db.prepare( + "INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')", + ); newAids.forEach((aid) => insertStmt.run(aid)); } @@ -88,7 +105,9 @@ async function insertAidsToDB() { } const aidsInDB = db - .prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'") + .prepare( + "SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'", + ) .all() .map((row) => row.aid) as number[]; @@ -98,13 +117,21 @@ async function insertAidsToDB() { const processAid = async (aid: number) => { try { - const res = await getBiliBiliVideoInfo(aid, regions[processedAids % regions.length]); + const res = await getBiliBiliVideoInfo( + aid, + regions[processedAids % regions.length], + ); if (res === null) { updateAidStatus(aid, "failed"); } else { const rawData = JSON.parse(res); if (rawData.code === 0) { - updateAidStatus(aid, "success", rawData.data.View.bvid, JSON.stringify(rawData.data)); + updateAidStatus( + aid, + "success", + rawData.data.View.bvid, + JSON.stringify(rawData.data), + ); } else { updateAidStatus(aid, "error", undefined, res); } @@ -136,7 +163,12 @@ async function insertAidsToDB() { console.log("Starting to process aids..."); } -function updateAidStatus(aid: number, status: string, bvid?: string, data?: string) { +function updateAidStatus( + aid: number, + status: string, + bvid?: string, + data?: string, +) { const stmt = db.prepare(` UPDATE bili_info_crawl SET status = ?, @@ -145,11 +177,22 @@ function updateAidStatus(aid: number, status: string, bvid?: string, data?: stri timestamp = ? WHERE aid = ? `); - const params = [status, ...(bvid ? [bvid] : []), ...(data ? [data] : []), Date.now() / 1000, aid]; + const params = [ + status, + ...(bvid ? [bvid] : []), + ...(data ? [data] : []), + Date.now() / 1000, + aid, + ]; stmt.run(...params); } -function logProgress(aid: number, processedAids: number, totalAids: number, startTime: number) { +function logProgress( + aid: number, + processedAids: number, + totalAids: number, + startTime: number, +) { const elapsedTime = Date.now() - startTime; const elapsedSeconds = Math.floor(elapsedTime / 1000); const elapsedMinutes = Math.floor(elapsedSeconds / 60); diff --git a/src/db/raw/videoInfo.ts b/src/db/raw/videoInfo.ts index 10272b2..28299d3 100644 --- a/src/db/raw/videoInfo.ts +++ b/src/db/raw/videoInfo.ts @@ -1,4 +1,7 @@ -export async function getBiliBiliVideoInfo(bvidORaid?: string | number, region: string = "hangzhou") { +export async function getBiliBiliVideoInfo( + bvidORaid?: string | number, + region: string = "hangzhou", +) { const bvid = typeof bvidORaid === "string" ? bvidORaid : undefined; const aid = typeof bvidORaid === "number" ? 
bvidORaid : undefined; @@ -18,7 +21,10 @@ export async function getBiliBiliVideoInfo(bvidORaid?: string | number, region: } } -async function proxyRequestWithRegion(url: string, region: string): Promise { +async function proxyRequestWithRegion( + url: string, + region: string, +): Promise { const td = new TextDecoder(); // aliyun configure set --access-key-id $ALIYUN_AK --access-key-secret $ALIYUN_SK --region cn-shenzhen --profile CVSA-shenzhen --mode AK const p = await new Deno.Command("aliyun", { @@ -40,7 +46,9 @@ async function proxyRequestWithRegion(url: string, region: string): Promise { - switch (job.name) { - case "getLatestVideos": - await getLatestVideosWorker(job); - break; - default: - break; - } - }, - { connection: redis, concurrency: 1 }, -); - -latestVideoWorker.on("active", () => { - logger.log("Worker activated.", "mq"); -}); - -latestVideoWorker.on("error", (err) => { - const e = err as WorkerError; - logger.error(e.rawError, e.service, e.codePath); -}); - - -const videoTagsWorker = new Worker( - "videoTags", - async (job: Job) => { - switch (job.name) { - case "getVideoTags": - return await getVideoTagsWorker(job); - case "getVideosTags": - return await getVideoTagsInitializer(); - default: - break; - } - }, - { connection: redis, concurrency: 6 }, -); - -videoTagsWorker.on("active", () => { - logger.log("Worker activated.", "mq"); -}); - -videoTagsWorker.on("error", (err) => { - const e = err as WorkerError; - logger.error(e.rawError, e.service, e.codePath); -}); - diff --git a/static/favicon.ico b/static/favicon.ico deleted file mode 100644 index 1cfaaa2..0000000 Binary files a/static/favicon.ico and /dev/null differ diff --git a/static/logo.svg b/static/logo.svg deleted file mode 100644 index ef2fbe4..0000000 --- a/static/logo.svg +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/tailwind.config.ts b/tailwind.config.ts deleted file mode 100644 index 0c790d0..0000000 --- a/tailwind.config.ts +++ /dev/null @@ -1,7 +0,0 @@ -import { type Config } from "tailwindcss"; - -export default { - content: [ - "{routes,islands,components}/**/*.{ts,tsx,js,jsx}", - ], -} satisfies Config; diff --git a/test/db/videoTagIsNull.test.ts b/test/db/videoTagIsNull.test.ts deleted file mode 100644 index 7ffc8cc..0000000 --- a/test/db/videoTagIsNull.test.ts +++ /dev/null @@ -1,33 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { videoTagsIsNull } from "lib/db/allData.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { postgresConfig } from "lib/db/pgConfig.ts"; - -// A minimal aid which has an empty tags field in our database -const TEST_AID = 63569; - -Deno.test("videoTagsIsNull function", async () => { - const client = new Client(postgresConfig); - - try { - const transaction = client.createTransaction("test_transaction"); - await transaction.begin(); - - const result1 = await videoTagsIsNull(transaction, TEST_AID); - assertEquals(typeof result1, "boolean", "The result should be a boolean value."); - assertEquals(result1, false, "The result should be false if tags is not NULL for the given aid."); - - await transaction.queryArray`UPDATE all_data SET tags = NULL WHERE aid = ${TEST_AID}`; - - const result2 = await videoTagsIsNull(transaction, TEST_AID); - assertEquals(typeof result2, "boolean", "The result should be a boolean value."); - assertEquals(result2, true, "The result should be true if tags is NULL for the given aid."); - - await transaction.rollback(); - } catch (error) { - console.error("Error 
during test:", error); - throw error; - } finally { - client.end(); - } -}); diff --git a/test/mq/rateLimiter.test.ts b/test/mq/rateLimiter.test.ts deleted file mode 100644 index 054e945..0000000 --- a/test/mq/rateLimiter.test.ts +++ /dev/null @@ -1,91 +0,0 @@ -import {assertEquals} from "jsr:@std/assert"; -import {SlidingWindow} from "lib/mq/slidingWindow.ts"; -import {RateLimiter, RateLimiterConfig} from "lib/mq/rateLimiter.ts"; -import {Redis} from "npm:ioredis@5.5.0"; - -Deno.test("RateLimiter works correctly", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5; - const maxRequests = 10; - - const slidingWindow = new SlidingWindow(redis, windowSize); - const config: RateLimiterConfig = { - window: slidingWindow, - max: maxRequests, - }; - const rateLimiter = new RateLimiter("test_event", [config]); - await rateLimiter.clear(); - - // Initial availability should be true - assertEquals(await rateLimiter.getAvailability(), true); - - // Trigger events up to the limit - for (let i = 0; i < maxRequests; i++) { - await rateLimiter.trigger(); - } - - // Availability should now be false - assertEquals(await rateLimiter.getAvailability(), false); - - // Wait for the window to slide - await new Promise((resolve) => setTimeout(resolve, windowSize * 1000 + 500)); - - // Availability should be true again - assertEquals(await rateLimiter.getAvailability(), true); - - redis.quit(); -}); - -Deno.test("Multiple configs work correctly", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize1 = 1; - const maxRequests1 = 2; - const windowSize2 = 5; - const maxRequests2 = 6; - - const slidingWindow1 = new SlidingWindow(redis, windowSize1); - const config1: RateLimiterConfig = { - window: slidingWindow1, - max: maxRequests1, - }; - const slidingWindow2 = new SlidingWindow(redis, windowSize2); - const config2: RateLimiterConfig = { - window: slidingWindow2, - max: maxRequests2, - }; - const rateLimiter = new RateLimiter("test_event_multi", [config1, config2]); - await rateLimiter.clear(); - - // Initial availability should be true - assertEquals(await rateLimiter.getAvailability(), true); - - // Trigger events up to the limit of the first config - for (let i = 0; i < maxRequests1; i++) { - await rateLimiter.trigger(); - } - - // Availability should now be false (due to config1) - assertEquals(await rateLimiter.getAvailability(), false); - - // Wait for the first window to slide - await new Promise((resolve) => setTimeout(resolve, windowSize1 * 1000 + 500)); - - // Availability should now be true (due to config1) - assertEquals(await rateLimiter.getAvailability(), true); - - // Trigger events up to the limit of the second config - for (let i = maxRequests1; i < maxRequests2; i++) { - await rateLimiter.trigger(); - } - - // Availability should still be false (due to config2) - assertEquals(await rateLimiter.getAvailability(), false); - - // Wait for the second window to slide - await new Promise((resolve) => setTimeout(resolve, windowSize2 * 1000 + 500)); - - // Availability should be true again - assertEquals(await rateLimiter.getAvailability(), true); - - redis.quit(); -}); \ No newline at end of file diff --git a/test/mq/slidingWindow.test.ts b/test/mq/slidingWindow.test.ts deleted file mode 100644 index cde8d11..0000000 --- a/test/mq/slidingWindow.test.ts +++ /dev/null @@ -1,84 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { SlidingWindow } from "lib/mq/slidingWindow.ts"; -import { Redis } from "ioredis"; - 
-Deno.test("SlidingWindow - event and count", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - - await slidingWindow.event(eventName); - const count = await slidingWindow.count(eventName); - - assertEquals(count, 1); - redis.quit(); -}); - -Deno.test("SlidingWindow - multiple events", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - - await slidingWindow.event(eventName); - await slidingWindow.event(eventName); - await slidingWindow.event(eventName); - const count = await slidingWindow.count(eventName); - - assertEquals(count, 3); - redis.quit(); -}); - -Deno.test("SlidingWindow - no events", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - - const count = await slidingWindow.count(eventName); - - assertEquals(count, 0); - redis.quit(); -}); - -Deno.test("SlidingWindow - different event names", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName1 = "test_event_1"; - const eventName2 = "test_event_2"; - await slidingWindow.clear(eventName1); - await slidingWindow.clear(eventName2); - - await slidingWindow.event(eventName1); - await slidingWindow.event(eventName2); - - const count1 = await slidingWindow.count(eventName1); - const count2 = await slidingWindow.count(eventName2); - - assertEquals(count1, 1); - assertEquals(count2, 1); - redis.quit(); -}); - -Deno.test("SlidingWindow - large number of events", async () => { - const redis = new Redis({ maxRetriesPerRequest: null }); - const windowSize = 5000; // 5 seconds - const slidingWindow = new SlidingWindow(redis, windowSize); - const eventName = "test_event"; - await slidingWindow.clear(eventName); - const numEvents = 1000; - - for (let i = 0; i < numEvents; i++) { - await slidingWindow.event(eventName); - } - - const count = await slidingWindow.count(eventName); - - assertEquals(count, numEvents); - redis.quit(); -}); diff --git a/test/net/getLatestVideos.test.ts b/test/net/getLatestVideos.test.ts deleted file mode 100644 index b2daa4d..0000000 --- a/test/net/getLatestVideos.test.ts +++ /dev/null @@ -1,25 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { getLatestVideos } from "lib/net/getLatestVideos.ts"; - -Deno.test("Get latest videos", async () => { - const videos = (await getLatestVideos(1, 5))!; - assertEquals(videos.length, 5); - - videos.forEach((video) => { - assertVideoProperties(video); - }); -}); - -function assertVideoProperties(video: object) { - const aid = "aid" in video && typeof video.aid === "number"; - const bvid = "bvid" in video && typeof video.bvid === "string" && - video.bvid.length === 12 && video.bvid.startsWith("BV"); - const description = "description" in video && typeof video.description === "string"; - const uid = "uid" in video && typeof video.uid === "number"; - const tags = "tags" in video && (typeof video.tags === "string" || video.tags === null); - const 
title = "title" in video && typeof video.title === "string"; - const publishedAt = "published_at" in video && typeof video.published_at === "string"; - - const match = aid && bvid && description && uid && tags && title && publishedAt; - assertEquals(match, true); -} diff --git a/test/net/getVideoTags.test.ts b/test/net/getVideoTags.test.ts deleted file mode 100644 index 0487dfb..0000000 --- a/test/net/getVideoTags.test.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { getVideoTags } from "lib/net/getVideoTags.ts"; - -Deno.test("Get video tags - regular video", async () => { - const tags = (await getVideoTags(826597951))!.sort(); - assertEquals(tags, [ - "纯白P", - "中华墨水娘", - "中华少女", - "中华粘土娘", - "中华缘木娘", - "中华少女Project", - "提糯Tino", - "中华烛火娘", - "中华烁金娘", - "新世代音乐人计划女生季", - ].sort()); -}); - -Deno.test("Get video tags - non-existent video", async () => { - const tags = (await getVideoTags(8265979511111111)); - assertEquals(tags, []); -}); - -Deno.test("Get video tags - video with no tag", async () => { - const tags = (await getVideoTags(981001865)); - assertEquals(tags, []); -}); \ No newline at end of file