diff --git a/README.md b/README.md index 9033ec6..6a46cde 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,12 @@ 纵观整个互联网,对于「中文歌声合成」或「中文虚拟歌手」(常简称为中V或VC)相关信息进行较为系统、全面地整理收集的主要有以下几个网站: -- [萌娘百科](https://zh.moegirl.org.cn/): 收录了大量中V歌曲及歌姬的信息,呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 -- [VCPedia](https://vcpedia.cn/): 由原萌娘百科中文歌声合成编辑团队的部分成员搭建,专属于中文歌声合成相关内容的信息集成站点[^1],呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 -- [VocaDB](https://vocadb.net/): 一个围绕 Vocaloid、UTAU 和其他歌声合成器的协作数据库,其中包含艺术家、唱片、PV 等[^2],其中包含大量中文歌声合成作品。 +- [萌娘百科](https://zh.moegirl.org.cn/): + 收录了大量中V歌曲及歌姬的信息,呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VCPedia](https://vcpedia.cn/): + 由原萌娘百科中文歌声合成编辑团队的部分成员搭建,专属于中文歌声合成相关内容的信息集成站点[^1],呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VocaDB](https://vocadb.net/): 一个围绕 Vocaloid、UTAU 和其他歌声合成器的协作数据库,其中包含艺术家、唱片、PV + 等[^2],其中包含大量中文歌声合成作品。 - [天钿Daily](https://tdd.bunnyxt.com/):一个VC相关数据交流与分享的网站。致力于VC相关数据交流,定期抓取VC相关数据,选取有意义的纬度展示。[^3] 上述网站中,或多或少存在一些不足,例如: @@ -36,19 +39,22 @@ ### 数据库 -中V档案馆使用[PostgreSQL](https://postgresql.org)作为数据库,我们承诺定期导出数据库转储 (dump) 文件并公开,其内容遵从以下协议或条款: +中V档案馆使用[PostgreSQL](https://postgresql.org)作为数据库,我们承诺定期导出数据库转储 (dump) +文件并公开,其内容遵从以下协议或条款: - 数据库中的事实性数据,根据适用法律,不构成受版权保护的内容。中V档案馆放弃一切可能的权利([CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/))。 - 对于数据库中有原创性的内容(如贡献者编辑的描述性内容),如无例外,以[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)提供。 -- 对于引用、摘编或改编自萌娘百科、VCPedia的内容,以与原始协议(CC BY-NC-SA 3.0 CN)兼容的协议[CC BY-NC-SA 4.0协议](https://creativecommons.org/licenses/by-nc-sa/4.0/)提供,并注明原始协议 。 - > 根据原始协议第四条第2项内容,CC BY-NC-SA 4.0协议为与原始协议具有相同授权要素的后续版本(“可适用的协议”)。 +- 对于引用、摘编或改编自萌娘百科、VCPedia的内容,以与原始协议(CC BY-NC-SA 3.0 + CN)兼容的协议[CC BY-NC-SA 4.0协议](https://creativecommons.org/licenses/by-nc-sa/4.0/)提供,并注明原始协议 。 + > 根据原始协议第四条第2项内容,CC BY-NC-SA 4.0协议为与原始协议具有相同授权要素的后续版本(“可适用的协议”)。 - 中V档案馆文档使用[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)。 ### 软件代码 用于构建中V档案馆的软件代码在[AGPL 3.0](https://www.gnu.org/licenses/agpl-3.0.html)许可证下公开,参见[LICENSE](./LICENSE) - [^1]: 引用自[VCPedia](https://vcpedia.cn/%E9%A6%96%E9%A1%B5),于[知识共享 署名-非商业性使用-相同方式共享 3.0中国大陆 (CC BY-NC-SA 3.0 CN) 许可协议](https://creativecommons.org/licenses/by-nc-sa/3.0/cn/)下提供。 + [^2]: 翻译自[VocaDB](https://vocadb.net/),于[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)下提供。 -[^3]: 引用自[关于 - 天钿Daily](https://tdd.bunnyxt.com/about) \ No newline at end of file + +[^3]: 引用自[关于 - 天钿Daily](https://tdd.bunnyxt.com/about) diff --git a/deno.json b/deno.json index c2fca5b..7392dae 100644 --- a/deno.json +++ b/deno.json @@ -1,60 +1,60 @@ { - "lock": false, - "tasks": { - "crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts", - "crawl-bili-aids": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/fetchAids.ts", - "check": "deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx", - "cli": "echo \"import '\\$fresh/src/dev/cli.ts'\" | deno run --unstable -A -", - "manifest": "deno task cli manifest $(pwd)", - "start": "deno run -A --watch=static/,routes/ dev.ts", - "build": "deno run -A dev.ts build", - "preview": "deno run -A main.ts", - "update": "deno run -A -r https://fresh.deno.dev/update .", - "worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/worker.ts", - "worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts", - "adder": "deno run --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts", - "bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts", - "all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'", - "test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run" - }, - "lint": { - "rules": { - "tags": ["fresh", "recommended"] - } - }, - "exclude": ["**/_fresh/*"], - "imports": { - "@std/assert": "jsr:@std/assert@1", - "$fresh/": "https://deno.land/x/fresh@1.7.3/", - "preact": "https://esm.sh/preact@10.22.0", - "preact/": "https://esm.sh/preact@10.22.0/", - "@preact/signals": "https://esm.sh/*@preact/signals@1.2.2", - "@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.5.1", - "tailwindcss": "npm:tailwindcss@3.4.1", - "tailwindcss/": "npm:/tailwindcss@3.4.1/", - "tailwindcss/plugin": "npm:/tailwindcss@3.4.1/plugin.js", - "$std/": "https://deno.land/std@0.216.0/", - "@huggingface/transformers": "npm:@huggingface/transformers@3.0.0", - "bullmq": "npm:bullmq", - "lib/": "./lib/", - "ioredis": "npm:ioredis", - "@bull-board/api": "npm:@bull-board/api", - "@bull-board/express": "npm:@bull-board/express", - "express": "npm:express", - "src/": "./src/", - "onnxruntime": "npm:onnxruntime-node@1.19.2", - "chalk": "npm:chalk" - }, - "compilerOptions": { - "jsx": "react-jsx", - "jsxImportSource": "preact" - }, - "nodeModulesDir": "auto", - "fmt": { - "useTabs": true, - "lineWidth": 120, - "indentWidth": 4, - "semiColons": true, - "proseWrap": "always" - } + "lock": false, + "tasks": { + "crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts", + "crawl-bili-aids": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/fetchAids.ts", + "check": "deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx", + "cli": "echo \"import '\\$fresh/src/dev/cli.ts'\" | deno run --unstable -A -", + "manifest": "deno task cli manifest $(pwd)", + "start": "deno run -A --watch=static/,routes/ dev.ts", + "build": "deno run -A dev.ts build", + "preview": "deno run -A main.ts", + "update": "deno run -A -r https://fresh.deno.dev/update .", + "worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/worker.ts", + "worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts", + "adder": "deno run --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts", + "bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts", + "all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'", + "test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run" + }, + "lint": { + "rules": { + "tags": ["fresh", "recommended"] + } + }, + "exclude": ["**/_fresh/*"], + "imports": { + "@std/assert": "jsr:@std/assert@1", + "$fresh/": "https://deno.land/x/fresh@1.7.3/", + "preact": "https://esm.sh/preact@10.22.0", + "preact/": "https://esm.sh/preact@10.22.0/", + "@preact/signals": "https://esm.sh/*@preact/signals@1.2.2", + "@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.5.1", + "tailwindcss": "npm:tailwindcss@3.4.1", + "tailwindcss/": "npm:/tailwindcss@3.4.1/", + "tailwindcss/plugin": "npm:/tailwindcss@3.4.1/plugin.js", + "$std/": "https://deno.land/std@0.216.0/", + "@huggingface/transformers": "npm:@huggingface/transformers@3.0.0", + "bullmq": "npm:bullmq", + "lib/": "./lib/", + "ioredis": "npm:ioredis", + "@bull-board/api": "npm:@bull-board/api", + "@bull-board/express": "npm:@bull-board/express", + "express": "npm:express", + "src/": "./src/", + "onnxruntime": "npm:onnxruntime-node@1.19.2", + "chalk": "npm:chalk" + }, + "compilerOptions": { + "jsx": "react-jsx", + "jsxImportSource": "preact" + }, + "nodeModulesDir": "auto", + "fmt": { + "useTabs": true, + "lineWidth": 120, + "indentWidth": 4, + "semiColons": true, + "proseWrap": "always" + } } diff --git a/doc/en/README.md b/doc/en/README.md index f769d98..2eadf84 100644 --- a/doc/en/README.md +++ b/doc/en/README.md @@ -17,7 +17,8 @@ layout: Welcome to the CVSA Documentation! -This doc contains various information about the CVSA project, including technical architecture, tutorials for visitors, etc. +This doc contains various information about the CVSA project, including technical architecture, tutorials for visitors, +etc. ### Jump right in diff --git a/doc/en/SUMMARY.md b/doc/en/SUMMARY.md index 5137229..345f141 100644 --- a/doc/en/SUMMARY.md +++ b/doc/en/SUMMARY.md @@ -1,22 +1,22 @@ # Table of contents -* [Welcome](README.md) +- [Welcome](README.md) ## About -* [About CVSA Project](about/this-project.md) -* [Scope of Inclusion](about/scope-of-inclusion.md) +- [About CVSA Project](about/this-project.md) +- [Scope of Inclusion](about/scope-of-inclusion.md) ## Architecure -* [Overview](architecure/overview.md) -* [Database Structure](architecure/database-structure/README.md) - * [Type of Song](architecure/database-structure/type-of-song.md) -* [Message Queue](architecure/message-queue/README.md) - * [VideoTagsQueue](architecure/message-queue/videotagsqueue.md) -* [Artificial Intelligence](architecure/artificial-intelligence.md) +- [Overview](architecure/overview.md) +- [Database Structure](architecure/database-structure/README.md) + - [Type of Song](architecure/database-structure/type-of-song.md) +- [Message Queue](architecure/message-queue/README.md) + - [VideoTagsQueue](architecure/message-queue/videotagsqueue.md) +- [Artificial Intelligence](architecure/artificial-intelligence.md) ## API Doc -* [Catalog](api-doc/catalog.md) -* [Songs](api-doc/songs.md) +- [Catalog](api-doc/catalog.md) +- [Songs](api-doc/songs.md) diff --git a/doc/en/about/scope-of-inclusion.md b/doc/en/about/scope-of-inclusion.md index d893e33..f8e9765 100644 --- a/doc/en/about/scope-of-inclusion.md +++ b/doc/en/about/scope-of-inclusion.md @@ -1,19 +1,27 @@ # Scope of Inclusion -CVSA contains many aspects of Chinese Vocal Synthesis, including songs, albums, artists (publisher, manipulators, arranger, etc), singers and voice engines / voicebanks. +CVSA contains many aspects of Chinese Vocal Synthesis, including songs, albums, artists (publisher, manipulators, +arranger, etc), singers and voice engines / voicebanks. For a **song**, it must meet the following conditions to be included in CVSA: ### Category 30 -In principle, the songs featured in CVSA must be included in a video categorized under VOCALOID·UTAU (ID 30) that is posted on Bilibili. In some special cases, this rule may not be enforced. +In principle, the songs featured in CVSA must be included in a video categorized under VOCALOID·UTAU (ID 30) that is +posted on Bilibili. In some special cases, this rule may not be enforced. ### At Leats One Line of Chinese -The lyrics of the song must contain at least one line in Chinese. This means that even if a voicebank that only supports Chinese is used, if the lyrics of the song do not contain Chinese, it will not be included in the CVSA. +The lyrics of the song must contain at least one line in Chinese. This means that even if a voicebank that only supports +Chinese is used, if the lyrics of the song do not contain Chinese, it will not be included in the CVSA. ### Using Vocal Synthesizer -To be included in CVSA, at least one line of the song must be produced by a Vocal Synthesizer (including harmony vocals). +To be included in CVSA, at least one line of the song must be produced by a Vocal Synthesizer (including harmony +vocals). -We define a vocal synthesizer as a software or system that generates synthesized singing voices by algorithmically modeling vocal characteristics and producing audio from input parameters such as lyrics, pitch, and dynamics, encompassing both waveform-concatenation-based (e.g., VOCALOID, UTAU) and AI-based (e.g., Synthesizer V, ACE Studio) approaches, **but excluding voice conversion tools that solely alter the timbre of pre-existing recordings** (e.g., [so-vits svc](https://github.com/svc-develop-team/so-vits-svc)). +We define a vocal synthesizer as a software or system that generates synthesized singing voices by algorithmically +modeling vocal characteristics and producing audio from input parameters such as lyrics, pitch, and dynamics, +encompassing both waveform-concatenation-based (e.g., VOCALOID, UTAU) and AI-based (e.g., Synthesizer V, ACE Studio) +approaches, **but excluding voice conversion tools that solely alter the timbre of pre-existing recordings** (e.g., +[so-vits svc](https://github.com/svc-develop-team/so-vits-svc)). diff --git a/doc/en/about/this-project.md b/doc/en/about/this-project.md index 1d2d610..4e386f9 100644 --- a/doc/en/about/this-project.md +++ b/doc/en/about/this-project.md @@ -1,11 +1,13 @@ # About CVSA Project -CVSA (Chinese Vocal Synthesis Archive) aims to collect as much content as possible about the Chinese Vocal Synthesis community in a highly automation-assisted way. +CVSA (Chinese Vocal Synthesis Archive) aims to collect as much content as possible about the Chinese Vocal Synthesis +community in a highly automation-assisted way. -Unlike existing projects such as [VocaDB](https://vocadb.net), CVSA collects and displays the following content in an automated and manually edited way: - -* Metadata of songs (name, duration, publisher, singer, etc.) -* Descriptive information of songs (content introduction, creation background, lyrics, etc.) -* Engagement data snapshots of songs, i.e. historical snapshots of their engagement data (including views, favorites, likes, etc.) on the [Bilibili](https://en.wikipedia.org/wiki/Bilibili) website. -* Information about artists, albums, vocal synthesizers, and voicebanks. +Unlike existing projects such as [VocaDB](https://vocadb.net), CVSA collects and displays the following content in an +automated and manually edited way: +- Metadata of songs (name, duration, publisher, singer, etc.) +- Descriptive information of songs (content introduction, creation background, lyrics, etc.) +- Engagement data snapshots of songs, i.e. historical snapshots of their engagement data (including views, favorites, + likes, etc.) on the [Bilibili](https://en.wikipedia.org/wiki/Bilibili) website. +- Information about artists, albums, vocal synthesizers, and voicebanks. diff --git a/doc/en/api-doc/catalog.md b/doc/en/api-doc/catalog.md index 2a57a78..eea6596 100644 --- a/doc/en/api-doc/catalog.md +++ b/doc/en/api-doc/catalog.md @@ -1,4 +1,3 @@ # Catalog -* [**Songs**](songs.md) - +- [**Songs**](songs.md) diff --git a/doc/en/architecure/artificial-intelligence.md b/doc/en/architecure/artificial-intelligence.md index 849cb27..6d52e54 100644 --- a/doc/en/architecure/artificial-intelligence.md +++ b/doc/en/architecure/artificial-intelligence.md @@ -6,8 +6,9 @@ The AI ​​systems we currently use are: ### The Filter -Located at `/filter/` under project root dir, it classifies a video in the [category 30](../about/scope-of-inclusion.md#category-30) into the following categories: +Located at `/filter/` under project root dir, it classifies a video in the +[category 30](../about/scope-of-inclusion.md#category-30) into the following categories: -* 0: Not related to Chinese vocal synthesis -* 1: A original song with Chinese vocal synthesis -* 2: A cover/remix song with Chinese vocal synthesis +- 0: Not related to Chinese vocal synthesis +- 1: A original song with Chinese vocal synthesis +- 2: A cover/remix song with Chinese vocal synthesis diff --git a/doc/en/architecure/database-structure/README.md b/doc/en/architecure/database-structure/README.md index 96704b7..93e164c 100644 --- a/doc/en/architecure/database-structure/README.md +++ b/doc/en/architecure/database-structure/README.md @@ -2,10 +2,11 @@ CVSA uses [PostgreSQL](https://www.postgresql.org/) as our database. -All public data of CVSA (excluding users' personal data) is stored in a database named `cvsa_main`, which contains the following tables: - -* songs: stores the main information of songs -* bili\_user: stores snapshots of Bilibili user information -* all\_data: metadata of all videos in [category 30](../../about/scope-of-inclusion.md#category-30). -* labelling\_result: Contains label of videos in `all_data`tagged by our [AI system](../artificial-intelligence.md#the-filter). +All public data of CVSA (excluding users' personal data) is stored in a database named `cvsa_main`, which contains the +following tables: +- songs: stores the main information of songs +- bili\_user: stores snapshots of Bilibili user information +- all\_data: metadata of all videos in [category 30](../../about/scope-of-inclusion.md#category-30). +- labelling\_result: Contains label of videos in `all_data`tagged by our + [AI system](../artificial-intelligence.md#the-filter). diff --git a/doc/en/architecure/database-structure/type-of-song.md b/doc/en/architecure/database-structure/type-of-song.md index c4af1aa..1855f4a 100644 --- a/doc/en/architecure/database-structure/type-of-song.md +++ b/doc/en/architecure/database-structure/type-of-song.md @@ -1,6 +1,7 @@ # Type of Song -The **Unrelated type** refers specifically to videos that are not in our [Scope of Inclusion](../../about/scope-of-inclusion.md). +The **Unrelated type** refers specifically to videos that are not in our +[Scope of Inclusion](../../about/scope-of-inclusion.md). ### Table: `songs` diff --git a/doc/en/architecure/message-queue/README.md b/doc/en/architecure/message-queue/README.md index d0a8349..4fedf39 100644 --- a/doc/en/architecure/message-queue/README.md +++ b/doc/en/architecure/message-queue/README.md @@ -1,2 +1 @@ # Message Queue - diff --git a/doc/en/architecure/message-queue/videotagsqueue.md b/doc/en/architecure/message-queue/videotagsqueue.md index bdddddb..fed620d 100644 --- a/doc/en/architecure/message-queue/videotagsqueue.md +++ b/doc/en/architecure/message-queue/videotagsqueue.md @@ -2,7 +2,8 @@ ### Jobs -The VideoTagsQueue contains two jobs: `getVideoTags`and `getVideosTags`. The former is used to fetch the tags of a video, and the latter is responsible for scheduling the former. +The VideoTagsQueue contains two jobs: `getVideoTags`and `getVideosTags`. The former is used to fetch the tags of a +video, and the latter is responsible for scheduling the former. ### Return value diff --git a/doc/en/architecure/overview.md b/doc/en/architecure/overview.md index d80036e..468180f 100644 --- a/doc/en/architecure/overview.md +++ b/doc/en/architecure/overview.md @@ -15,4 +15,5 @@ layout: # Overview -Automation is the biggest highlight of CVSA's technical design. To achieve this, we use a message queue powered by [BullMQ](https://bullmq.io/) to concurrently process various tasks in the data collection life cycle. +Automation is the biggest highlight of CVSA's technical design. To achieve this, we use a message queue powered by +[BullMQ](https://bullmq.io/) to concurrently process various tasks in the data collection life cycle. diff --git a/doc/zh/SUMMARY.md b/doc/zh/SUMMARY.md index 981ee4b..b8cd029 100644 --- a/doc/zh/SUMMARY.md +++ b/doc/zh/SUMMARY.md @@ -1,22 +1,22 @@ # Table of contents -* [欢迎](README.md) +- [欢迎](README.md) ## 关于 -* [关于本项目](about/this-project.md) -* [收录范围](about/scope-of-inclusion.md) +- [关于本项目](about/this-project.md) +- [收录范围](about/scope-of-inclusion.md) ## 技术架构 -* [概览](architecture/overview.md) -* [数据库结构](architecture/database-structure/README.md) - * [歌曲类型](architecture/database-structure/type-of-song.md) -* [人工智能](architecture/artificial-intelligence.md) -* [消息队列](architecture/message-queue/README.md) - * [VideoTagsQueue队列](architecture/message-queue/video-tags-queue.md) +- [概览](architecture/overview.md) +- [数据库结构](architecture/database-structure/README.md) + - [歌曲类型](architecture/database-structure/type-of-song.md) +- [人工智能](architecture/artificial-intelligence.md) +- [消息队列](architecture/message-queue/README.md) + - [VideoTagsQueue队列](architecture/message-queue/video-tags-queue.md) ## API 文档 -* [目录](api-doc/catalog.md) -* [歌曲](api-doc/songs.md) +- [目录](api-doc/catalog.md) +- [歌曲](api-doc/songs.md) diff --git a/doc/zh/about/scope-of-inclusion.md b/doc/zh/about/scope-of-inclusion.md index c985544..92ff3be 100644 --- a/doc/zh/about/scope-of-inclusion.md +++ b/doc/zh/about/scope-of-inclusion.md @@ -6,7 +6,8 @@ #### VOCALOID·UATU 分区 -原则上,中V档案馆中收录的歌曲必须包含在哔哩哔哩 VOCALOID·UTAU 分区(分区ID为30)下的视频中。在某些特殊情况下,此规则可能不是强制的。 +原则上,中V档案馆中收录的歌曲必须包含在哔哩哔哩 VOCALOID·UTAU +分区(分区ID为30)下的视频中。在某些特殊情况下,此规则可能不是强制的。 #### 至少一行中文 @@ -16,4 +17,6 @@ 歌曲的至少一行必须由歌声合成器生成(包括和声部分),才能被收录到中V档案馆中。 -我们将歌声合成器定义为通过算法建模声音特征并根据输入的歌词、音高等参数生成音频的软件或系统,包括基于波形拼接的(如 VOCALOID、UTAU)和基于 AI 的(如 Synthesizer V、ACE Studio)方法,**但不包括仅改变现有歌声音色的AI声音转换器**(例如 [so-vits svc](https://github.com/svc-develop-team/so-vits-svc))。 +我们将歌声合成器定义为通过算法建模声音特征并根据输入的歌词、音高等参数生成音频的软件或系统,包括基于波形拼接的(如 +VOCALOID、UTAU)和基于 AI 的(如 Synthesizer V、ACE Studio)方法,**但不包括仅改变现有歌声音色的AI声音转换器**(例如 +[so-vits svc](https://github.com/svc-develop-team/so-vits-svc))。 diff --git a/doc/zh/about/this-project.md b/doc/zh/about/this-project.md index 9459d8e..c78d6d2 100644 --- a/doc/zh/about/this-project.md +++ b/doc/zh/about/this-project.md @@ -6,34 +6,33 @@ 纵观整个互联网,对于「中文歌声合成」或「中文虚拟歌手」(常简称为中V或VC)相关信息进行较为系统、全面地整理收集的主要有以下几个网站: -* [萌娘百科](https://zh.moegirl.org.cn/): 收录了大量中V歌曲及歌姬的信息,呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 -* [VCPedia](https://vcpedia.cn/): 由原萌娘百科中文歌声合成编辑团队的部分成员搭建,专属于中文歌声合成相关内容的信息集成站点[^1],呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 -* [VocaDB](https://vocadb.net/): [一个围绕 Vocaloid、UTAU 和其他歌声合成器的协作数据库,其中包含艺术家、唱片、PV 等](#user-content-fn-2)[^2],其中包含大量中文歌声合成作品。 -* [天钿Daily](https://tdd.bunnyxt.com/):一个VC相关数据交流与分享的网站。致力于VC相关数据交流,定期抓取VC相关数据,选取有意义的纬度展示。 +- [萌娘百科](https://zh.moegirl.org.cn/): + 收录了大量中V歌曲及歌姬的信息,呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VCPedia](https://vcpedia.cn/): + 由原萌娘百科中文歌声合成编辑团队的部分成员搭建,专属于中文歌声合成相关内容的信息集成站点[^1],呈现形式为传统维基(基于[MediaWiki](https://www.mediawiki.org/))。 +- [VocaDB](https://vocadb.net/): + [一个围绕 Vocaloid、UTAU 和其他歌声合成器的协作数据库,其中包含艺术家、唱片、PV 等](#user-content-fn-2)[^2],其中包含大量中文歌声合成作品。 +- [天钿Daily](https://tdd.bunnyxt.com/):一个VC相关数据交流与分享的网站。致力于VC相关数据交流,定期抓取VC相关数据,选取有意义的纬度展示。 上述网站中,或多或少存在一些不足,例如: -* 萌娘百科、VCPedia受限于传统维基,绝大多数内容依赖人工编辑。 -* VocaDB基于结构化数据库构建,由此可以依赖程序生成一些信息,但**条目收录**仍然完全依赖人工完成。 -* VocaDB主要专注于元数据展示,少有关于歌曲、作者等的描述性的文字,也缺乏描述性的背景信息。 -* 天钿Daily只展示歌曲的统计数据及历史趋势,没有关于歌曲其它信息的收集。 +- 萌娘百科、VCPedia受限于传统维基,绝大多数内容依赖人工编辑。 +- VocaDB基于结构化数据库构建,由此可以依赖程序生成一些信息,但**条目收录**仍然完全依赖人工完成。 +- VocaDB主要专注于元数据展示,少有关于歌曲、作者等的描述性的文字,也缺乏描述性的背景信息。 +- 天钿Daily只展示歌曲的统计数据及历史趋势,没有关于歌曲其它信息的收集。 因此,**中V档案馆**吸取前人经验,克服上述网站的不足,希望做到: -* 歌曲收录(指发现歌曲并创建条目)的完全自动化 -* 歌曲元信息提取的高度自动化 -* 歌曲统计数据收集的完全自动化 -* 在程序辅助的同时欢迎并鼓励贡献者参与编辑(主要为描述性内容)或纠错 -* 在适当的许可声明下,引用来自上述源的数据,使内容更加全面、丰富。 +- 歌曲收录(指发现歌曲并创建条目)的完全自动化 +- 歌曲元信息提取的高度自动化 +- 歌曲统计数据收集的完全自动化 +- 在程序辅助的同时欢迎并鼓励贡献者参与编辑(主要为描述性内容)或纠错 +- 在适当的许可声明下,引用来自上述源的数据,使内容更加全面、丰富。 - - -*** +--- 本文在[CC BY-NC-SA 4.0协议](https://creativecommons.org/licenses/by-nc-sa/4.0/)提供。 - - [^1]: 引用自[VCPedia](https://vcpedia.cn/%E9%A6%96%E9%A1%B5),于[知识共享 署名-非商业性使用-相同方式共享 3.0中国大陆 (CC BY-NC-SA 3.0 CN) 许可协议](https://creativecommons.org/licenses/by-nc-sa/3.0/cn/)下提供。 [^2]: 翻译自[VocaDB](https://vocadb.net/),于[CC BY 4.0协议](https://creativecommons.org/licenses/by/4.0/)下提供。 diff --git a/doc/zh/api-doc/catalog.md b/doc/zh/api-doc/catalog.md index a2b70ae..b76ea7a 100644 --- a/doc/zh/api-doc/catalog.md +++ b/doc/zh/api-doc/catalog.md @@ -1,3 +1,3 @@ # 目录 -* [歌曲](songs.md) +- [歌曲](songs.md) diff --git a/doc/zh/architecture/artificial-intelligence.md b/doc/zh/architecture/artificial-intelligence.md index 8d08f07..53caba1 100644 --- a/doc/zh/architecture/artificial-intelligence.md +++ b/doc/zh/architecture/artificial-intelligence.md @@ -6,8 +6,8 @@ CVSA 的自动化工作流高度依赖人工智能进行信息提取和分类。 #### Filter -位于项目根目录下的 `/filter/`,它将 [30 分区](../about/scope-of-inclusion.md#vocaloiduatu-fen-qu) 中的视频分为以下类别: +位于项目根目录下的 `/filter/`,它将 [30 分区](../about/scope-of-inclusion.md#vocaloiduatu-fen-qu) 中的视频分为以下类别: -* 0:与中文人声合成无关 -* 1:中文人声合成原创曲 -* 2:中文人声合成的翻唱/混音歌曲 +- 0:与中文人声合成无关 +- 1:中文人声合成原创曲 +- 2:中文人声合成的翻唱/混音歌曲 diff --git a/doc/zh/architecture/database-structure/README.md b/doc/zh/architecture/database-structure/README.md index 15d0a59..fbca8b1 100644 --- a/doc/zh/architecture/database-structure/README.md +++ b/doc/zh/architecture/database-structure/README.md @@ -4,7 +4,7 @@ CVSA 使用 [PostgreSQL](https://www.postgresql.org/) 作为数据库。 CVSA 的所有公开数据(不包括用户的个人数据)都存储在名为 `cvsa_main` 的数据库中,该数据库包含以下表: -* songs:存储歌曲的主要信息 -* bili\_user:存储 Bilibili 用户信息快照 -* all\_data:[分区 30](../../about/scope-of-inclusion.md#vocaloiduatu-fen-qu) 中所有视频的元数据。 -* labelling\_result:包含由我们的 AI 系统 标记的 `all_data` 中视频的标签。 +- songs:存储歌曲的主要信息 +- bili\_user:存储 Bilibili 用户信息快照 +- all\_data:[分区 30](../../about/scope-of-inclusion.md#vocaloiduatu-fen-qu) 中所有视频的元数据。 +- labelling\_result:包含由我们的 AI 系统 标记的 `all_data` 中视频的标签。 diff --git a/doc/zh/architecture/database-structure/type-of-song.md b/doc/zh/architecture/database-structure/type-of-song.md index 22aef46..94630e7 100644 --- a/doc/zh/architecture/database-structure/type-of-song.md +++ b/doc/zh/architecture/database-structure/type-of-song.md @@ -7,18 +7,18 @@ `songs` 表格中使用的 `type` 列。 | 类型 | 说明 | -| -- | ---------- | -| 0 | 不相关 | -| 1 | 原创 | -| 2 | 翻唱 (Cover) | -| 3 | 混音 (Remix) | -| 4 | 纯音乐 | -| 10 | 其他 | +| ---- | ------------ | +| 0 | 不相关 | +| 1 | 原创 | +| 2 | 翻唱 (Cover) | +| 3 | 混音 (Remix) | +| 4 | 纯音乐 | +| 10 | 其他 | #### 表格:`labelling_result` -| 标签 | 说明 | -| -- | ----------- | -| 0 | AI 标记:不相关 | -| 1 | AI 标记:原创 | -| 2 | AI 标记:翻唱/混音 | +| 标签 | 说明 | +| ---- | ------------------ | +| 0 | AI 标记:不相关 | +| 1 | AI 标记:原创 | +| 2 | AI 标记:翻唱/混音 | diff --git a/doc/zh/architecture/message-queue/README.md b/doc/zh/architecture/message-queue/README.md index 6493393..b2312f5 100644 --- a/doc/zh/architecture/message-queue/README.md +++ b/doc/zh/architecture/message-queue/README.md @@ -1,2 +1 @@ # 消息队列 - diff --git a/lib/db/allData.ts b/lib/db/allData.ts index 0c6db08..701c112 100644 --- a/lib/db/allData.ts +++ b/lib/db/allData.ts @@ -1,9 +1,5 @@ -import { Client, Transaction } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; import { AllDataType } from "lib/db/schema.d.ts"; -import logger from "lib/log/logger.ts"; -import { formatTimestampToPsql, parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts"; -import { VideoListVideo } from "lib/net/bilibili.d.ts"; -import { HOUR, SECOND } from "$std/datetime/constants.ts"; import { modelVersion } from "lib/ml/filter_inference.ts"; export async function videoExistsInAllData(client: Client, aid: number) { @@ -11,70 +7,8 @@ export async function videoExistsInAllData(client: Client, aid: number) { .then((result) => result.rows[0].exists); } -export async function biliUserExists(client: Client, uid: number) { +export async function userExistsInBiliUsers(client: Client, uid: number) { return await client.queryObject<{ exists: boolean }>(`SELECT EXISTS(SELECT 1 FROM bili_user WHERE uid = $1)`, [uid]) - .then((result) => result.rows[0].exists); -} - -export async function insertIntoAllData(client: Client, data: VideoListVideo) { - logger.log(`inserted ${data.aid}`, "db-all_data"); - await client.queryObject( - `INSERT INTO all_data (aid, bvid, description, uid, tags, title, published_at, duration) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8) - ON CONFLICT (aid) DO NOTHING`, - [ - data.aid, - data.bvid, - data.desc, - data.owner.mid, - null, - data.title, - formatTimestampToPsql(data.pubdate * SECOND + 8 * HOUR), - data.duration, - ], - ); -} - -export async function getLatestVideoTimestampFromAllData(client: Client) { - return await client.queryObject<{ published_at: string }>( - `SELECT published_at FROM all_data ORDER BY published_at DESC LIMIT 1`, - ) - .then((result) => { - const date = new Date(result.rows[0].published_at); - if (isNaN(date.getTime())) { - return null; - } - return date.getTime(); - }); -} - -export async function videoTagsIsNull(client: Client | Transaction, aid: number) { - return await client.queryObject<{ exists: boolean }>( - `SELECT EXISTS(SELECT 1 FROM all_data WHERE aid = $1 AND tags IS NULL)`, - [aid], - ).then((result) => result.rows[0].exists); -} - -export async function updateVideoTags(client: Client | Transaction, aid: number, tags: string[]) { - return await client.queryObject( - `UPDATE all_data SET tags = $1 WHERE aid = $2`, - [tags.join(","), aid], - ); -} - -export async function getNullVideoTagsList(client: Client) { - const queryResult = await client.queryObject<{ aid: number; published_at: string }>( - `SELECT aid, published_at FROM all_data WHERE tags IS NULL`, - ); - const rows = queryResult.rows; - return rows.map( - (row) => { - return { - aid: Number(row.aid), - published_at: parseTimestampFromPsql(row.published_at), - }; - }, - ); } export async function getUnlabelledVideos(client: Client) { diff --git a/lib/db/init.ts b/lib/db/init.ts index 2c021e5..d206872 100644 --- a/lib/db/init.ts +++ b/lib/db/init.ts @@ -1,5 +1,5 @@ import { Pool } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import {postgresConfig} from "lib/db/pgConfig.ts"; +import { postgresConfig } from "lib/db/pgConfig.ts"; const pool = new Pool(postgresConfig, 12); diff --git a/lib/db/pgConfig.ts b/lib/db/pgConfig.ts index 4c34ef4..5410760 100644 --- a/lib/db/pgConfig.ts +++ b/lib/db/pgConfig.ts @@ -3,7 +3,7 @@ const requiredEnvVars = ["DB_HOST", "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_POR const unsetVars = requiredEnvVars.filter((key) => Deno.env.get(key) === undefined); if (unsetVars.length > 0) { - throw new Error(`Missing required environment variables: ${unsetVars.join(", ")}`); + throw new Error(`Missing required environment variables: ${unsetVars.join(", ")}`); } const databaseHost = Deno.env.get("DB_HOST")!; @@ -18,4 +18,4 @@ export const postgresConfig = { database: databaseName, user: databaseUser, password: databasePassword, -}; \ No newline at end of file +}; diff --git a/lib/db/redis.ts b/lib/db/redis.ts index 7e8152f..51ac02c 100644 --- a/lib/db/redis.ts +++ b/lib/db/redis.ts @@ -1,3 +1,3 @@ import { Redis } from "ioredis"; -export const redis = new Redis({ maxRetriesPerRequest: null }); \ No newline at end of file +export const redis = new Redis({ maxRetriesPerRequest: null }); diff --git a/lib/db/schema.d.ts b/lib/db/schema.d.ts index db8c9a4..068f084 100644 --- a/lib/db/schema.d.ts +++ b/lib/db/schema.d.ts @@ -1,9 +1,9 @@ export interface AllDataType { - aid: number; - bvid: string | null; - description: string | null; - uid: number | null; - tags: string | null; - title: string | null; - published_at: string | null; -} \ No newline at end of file + aid: number; + bvid: string | null; + description: string | null; + uid: number | null; + tags: string | null; + title: string | null; + published_at: string | null; +} diff --git a/lib/log/test.ts b/lib/log/test.ts index 49deb8c..71c719c 100644 --- a/lib/log/test.ts +++ b/lib/log/test.ts @@ -9,4 +9,4 @@ logger.log("foo", "service"); logger.log("foo", "db", "insert.ts"); logger.warn("warn"); logger.error("error"); -logger.verbose("error"); \ No newline at end of file +logger.verbose("error"); diff --git a/lib/ml/filter_inference.ts b/lib/ml/filter_inference.ts index da9ed4a..8758b4d 100644 --- a/lib/ml/filter_inference.ts +++ b/lib/ml/filter_inference.ts @@ -1,7 +1,7 @@ -import {AutoTokenizer, PreTrainedTokenizer} from "@huggingface/transformers"; +import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; import * as ort from "onnxruntime"; import logger from "lib/log/logger.ts"; -import {WorkerError} from "lib/mq/schema.ts"; +import { WorkerError } from "lib/mq/schema.ts"; const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024"; const onnxClassifierPath = "./model/video_classifier_v3_11.onnx"; @@ -66,7 +66,6 @@ async function getONNXEmbeddings(texts: string[], session: ort.InferenceSession) return Array.from(embeddings.data as Float32Array); } - async function runClassification(embeddings: number[]): Promise { if (!sessionClassifier) { throw new Error("Classifier session is not initialized. Call initializeModels() first."); @@ -85,7 +84,7 @@ export async function classifyVideo( description: string, tags: string, author_info: string, - aid: number + aid: number, ): Promise { if (!sessionEmbedding) { throw new Error("Embedding session is not initialized. Call initializeModels() first."); @@ -97,6 +96,6 @@ export async function classifyVideo( author_info, ], sessionEmbedding); const probabilities = await runClassification(embeddings); - logger.log(`Prediction result for aid: ${aid}: [${probabilities.map((p) => p.toFixed(5))}]`, "ml") + logger.log(`Prediction result for aid: ${aid}: [${probabilities.map((p) => p.toFixed(5))}]`, "ml"); return probabilities.indexOf(Math.max(...probabilities)); } diff --git a/lib/ml/quant_benchmark.ts b/lib/ml/quant_benchmark.ts index f75bf9b..ced9f99 100644 --- a/lib/ml/quant_benchmark.ts +++ b/lib/ml/quant_benchmark.ts @@ -1,6 +1,6 @@ -import {AutoTokenizer, PreTrainedTokenizer} from "@huggingface/transformers"; +import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers"; import * as ort from "onnxruntime"; -import {softmax} from "lib/ml/filter_inference.ts"; +import { softmax } from "lib/ml/filter_inference.ts"; // 配置参数 const sentenceTransformerModelName = "alikia2x/jina-embedding-v3-m2v-1024"; diff --git a/lib/mq/exec/classifyVideo.ts b/lib/mq/exec/classifyVideo.ts index df45def..bc7f362 100644 --- a/lib/mq/exec/classifyVideo.ts +++ b/lib/mq/exec/classifyVideo.ts @@ -1,6 +1,6 @@ import { Job } from "bullmq"; import { db } from "lib/db/init.ts"; -import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel} from "lib/db/allData.ts"; +import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel } from "lib/db/allData.ts"; import { classifyVideo } from "lib/ml/filter_inference.ts"; import { ClassifyVideoQueue } from "lib/mq/index.ts"; import logger from "lib/log/logger.ts"; @@ -27,7 +27,8 @@ export const classifyVideoWorker = async (job: Job) => { client.release(); await job.updateData({ - ...job.data, label: label, + ...job.data, + label: label, }); return 0; @@ -38,12 +39,12 @@ export const classifyVideosWorker = async () => { logger.log("job:classifyVideos is locked, skipping.", "mq"); return; } - + await lockManager.acquireLock("classifyVideos"); const client = await db.connect(); const videos = await getUnlabelledVideos(client); - logger.log(`Found ${videos.length} unlabelled videos`) + logger.log(`Found ${videos.length} unlabelled videos`); client.release(); let i = 0; diff --git a/lib/mq/exec/getLatestVideos.ts b/lib/mq/exec/getLatestVideos.ts index 17d7677..4f795e0 100644 --- a/lib/mq/exec/getLatestVideos.ts +++ b/lib/mq/exec/getLatestVideos.ts @@ -1,52 +1,12 @@ import { Job } from "bullmq"; -import { insertLatestVideos } from "lib/task/insertLatestVideo.ts"; -import { LatestVideosQueue } from "lib/mq/index.ts"; -import { MINUTE } from "$std/datetime/constants.ts"; +import { queueLatestVideos } from "lib/mq/task/queueLatestVideo.ts"; import { db } from "lib/db/init.ts"; -import { truncate } from "lib/utils/truncate.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import logger from "lib/log/logger.ts"; -import { lockManager } from "lib/mq/lockManager.ts"; -const delayMap = [5, 10, 15, 30, 60, 60]; - -const updateQueueInterval = async (failedCount: number, delay: number) => { - logger.log(`job:getLatestVideos added to queue, delay: ${(delay / MINUTE).toFixed(2)} minutes.`, "mq"); - await LatestVideosQueue.upsertJobScheduler("getLatestVideos", { - every: delay, - }, { - data: { - failedCount: failedCount, - }, - }); - return; -}; - -const executeTask = async (client: Client, failedCount: number) => { - const result = await insertLatestVideos(client); - failedCount = result !== 0 ? truncate(failedCount + 1, 0, 5) : 0; - if (failedCount !== 0) { - await updateQueueInterval(failedCount, delayMap[failedCount] * MINUTE); - } - return; -}; - -export const getLatestVideosWorker = async (job: Job) => { - if (await lockManager.isLocked("getLatestVideos")) { - logger.log("job:getLatestVideos is locked, skipping.", "mq"); - return; - } - - await lockManager.acquireLock("getLatestVideos"); - - const failedCount = (job.data.failedCount ?? 0) as number; +export const getLatestVideosWorker = async (_job: Job): Promise => { const client = await db.connect(); - try { - await executeTask(client, failedCount); + await queueLatestVideos(client); } finally { client.release(); - await lockManager.releaseLock("getLatestVideos"); } - return; }; diff --git a/lib/mq/exec/getVideoInfo.ts b/lib/mq/exec/getVideoInfo.ts new file mode 100644 index 0000000..dfc5e89 --- /dev/null +++ b/lib/mq/exec/getVideoInfo.ts @@ -0,0 +1,17 @@ +import { Job } from "bullmq"; +import { db } from "lib/db/init.ts"; +import { insertVideoInfo } from "lib/mq/task/getVideoInfo.ts"; + +export const getVideoInfoWorker = async (job: Job): Promise => { + const client = await db.connect(); + try { + const aid = job.data.aid; + if (!aid) { + return 3; + } + await insertVideoInfo(client, aid); + return 0; + } finally { + client.release(); + } +}; diff --git a/lib/mq/exec/getVideoTags.ts b/lib/mq/exec/getVideoTags.ts deleted file mode 100644 index 83fe26f..0000000 --- a/lib/mq/exec/getVideoTags.ts +++ /dev/null @@ -1,100 +0,0 @@ -import { Job } from "bullmq"; -import { VideoTagsQueue } from "lib/mq/index.ts"; -import { DAY, HOUR, MINUTE, SECOND } from "$std/datetime/constants.ts"; -import { db } from "lib/db/init.ts"; -import { truncate } from "lib/utils/truncate.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import logger from "lib/log/logger.ts"; -import { getNullVideoTagsList, updateVideoTags } from "lib/db/allData.ts"; -import { getVideoTags } from "lib/net/getVideoTags.ts"; -import { NetSchedulerError } from "lib/mq/scheduler.ts"; -import { WorkerError } from "lib/mq/schema.ts"; - -const delayMap = [0.5, 3, 5, 15, 30, 60]; -const getJobPriority = (diff: number) => { - let priority; - if (diff > 14 * DAY) { - priority = 10; - } else if (diff > 7 * DAY) { - priority = 7; - } else if (diff > DAY) { - priority = 5; - } else if (diff > 6 * HOUR) { - priority = 3; - } else if (diff > HOUR) { - priority = 2; - } else { - priority = 1; - } - return priority; -}; - -const executeTask = async (client: Client, aid: number, failedCount: number, job: Job) => { - try { - const result = await getVideoTags(aid); - if (!result) { - failedCount = truncate(failedCount + 1, 0, 5); - const delay = delayMap[failedCount] * MINUTE; - logger.log( - `job:getVideoTags added to queue, delay: ${delayMap[failedCount]} minutes.`, - "mq", - ); - await VideoTagsQueue.add("getVideoTags", { aid, failedCount }, { delay, priority: 6 - failedCount }); - return 1; - } - await updateVideoTags(client, aid, result); - logger.log(`Fetched tags for aid: ${aid}`, "task"); - return 0; - } catch (e) { - if (!(e instanceof NetSchedulerError)) { - throw new WorkerError( e, "task", "getVideoTags/fn:executeTask"); - } - const err = e as NetSchedulerError; - if (err.code === "NO_AVAILABLE_PROXY" || err.code === "PROXY_RATE_LIMITED") { - logger.warn(`No available proxy for fetching tags, delayed. aid: ${aid}`, "task"); - await VideoTagsQueue.add("getVideoTags", { aid, failedCount }, { - delay: 25 * SECOND * Math.random() + 5 * SECOND, - priority: job.priority, - }); - return 2; - } - throw new WorkerError(err, "task", "getVideoTags/fn:executeTask"); - } -}; - -export const getVideoTagsWorker = async (job: Job) => { - const failedCount = (job.data.failedCount ?? 0) as number; - const client = await db.connect(); - const aid = job.data.aid; - if (!aid) { - return 3; - } - - const v = await executeTask(client, aid, failedCount, job); - client.release(); - return v; -}; - -export const getVideoTagsInitializer = async () => { - const client = await db.connect(); - const videos = await getNullVideoTagsList(client); - client.release(); - if (videos.length == 0) { - return 4; - } - const count = await VideoTagsQueue.getJobCounts("wait", "delayed", "active"); - const total = count.delayed + count.active + count.wait; - const max = 15; - const rest = truncate(max - total, 0, max); - - let i = 0; - for (const video of videos) { - if (i > rest) return 100 + i; - const aid = video.aid; - const timestamp = video.published_at; - const diff = Date.now() - timestamp; - await VideoTagsQueue.add("getVideoTags", { aid }, { priority: getJobPriority(diff) }); - i++; - } - return 0; -}; diff --git a/lib/mq/executors.ts b/lib/mq/executors.ts index 6af60b2..85c2cc1 100644 --- a/lib/mq/executors.ts +++ b/lib/mq/executors.ts @@ -1 +1 @@ -export * from "lib/mq/exec/getLatestVideos.ts"; \ No newline at end of file +export * from "lib/mq/exec/getLatestVideos.ts"; diff --git a/lib/mq/index.ts b/lib/mq/index.ts index 4189ae5..9a22495 100644 --- a/lib/mq/index.ts +++ b/lib/mq/index.ts @@ -2,6 +2,4 @@ import { Queue } from "bullmq"; export const LatestVideosQueue = new Queue("latestVideos"); -export const VideoTagsQueue = new Queue("videoTags"); - export const ClassifyVideoQueue = new Queue("classifyVideo"); diff --git a/lib/mq/init.ts b/lib/mq/init.ts index fbfaa54..3eb2d81 100644 --- a/lib/mq/init.ts +++ b/lib/mq/init.ts @@ -1,19 +1,16 @@ import { MINUTE } from "$std/datetime/constants.ts"; -import { ClassifyVideoQueue, LatestVideosQueue, VideoTagsQueue } from "lib/mq/index.ts"; +import { ClassifyVideoQueue, LatestVideosQueue } from "lib/mq/index.ts"; import logger from "lib/log/logger.ts"; export async function initMQ() { await LatestVideosQueue.upsertJobScheduler("getLatestVideos", { - every: 1 * MINUTE - }); - await VideoTagsQueue.upsertJobScheduler("getVideosTags", { - every: 5 * MINUTE, + every: 1 * MINUTE, immediately: true, }); await ClassifyVideoQueue.upsertJobScheduler("classifyVideos", { every: 5 * MINUTE, immediately: true, - }) + }); logger.log("Message queue initialized."); } diff --git a/lib/mq/lockManager.ts b/lib/mq/lockManager.ts index 0aa989e..f83b148 100644 --- a/lib/mq/lockManager.ts +++ b/lib/mq/lockManager.ts @@ -23,12 +23,12 @@ class LockManager { const result = await this.redis.set(key, "locked", "NX"); if (result !== "OK") { - return false; + return false; } - if (timeout) { - await this.redis.expire(key, timeout); - } - return true; + if (timeout) { + await this.redis.expire(key, timeout); + } + return true; } /* diff --git a/lib/mq/rateLimiter.ts b/lib/mq/rateLimiter.ts index 41a2f4f..7f62547 100644 --- a/lib/mq/rateLimiter.ts +++ b/lib/mq/rateLimiter.ts @@ -7,7 +7,7 @@ export interface RateLimiterConfig { export class RateLimiter { private readonly configs: RateLimiterConfig[]; - private readonly configEventNames: string[]; + private readonly configEventNames: string[]; /* * @param name The name of the rate limiter @@ -17,7 +17,7 @@ export class RateLimiter { */ constructor(name: string, configs: RateLimiterConfig[]) { this.configs = configs; - this.configEventNames = configs.map((_, index) => `${name}_config_${index}`); + this.configEventNames = configs.map((_, index) => `${name}_config_${index}`); } /* @@ -53,4 +53,4 @@ export class RateLimiter { await config.window.clear(eventName); } } -} \ No newline at end of file +} diff --git a/lib/mq/scheduler.ts b/lib/mq/scheduler.ts index ba9fbb2..7d6ed80 100644 --- a/lib/mq/scheduler.ts +++ b/lib/mq/scheduler.ts @@ -1,7 +1,7 @@ import logger from "lib/log/logger.ts"; -import {RateLimiter, RateLimiterConfig} from "lib/mq/rateLimiter.ts"; -import {SlidingWindow} from "lib/mq/slidingWindow.ts"; -import {redis} from "lib/db/redis.ts"; +import { RateLimiter, RateLimiterConfig } from "lib/mq/rateLimiter.ts"; +import { SlidingWindow } from "lib/mq/slidingWindow.ts"; +import { redis } from "lib/db/redis.ts"; import Redis from "ioredis"; import { SECOND } from "$std/datetime/constants.ts"; @@ -152,7 +152,7 @@ class NetScheduler { const proxiesNames = this.getTaskProxies(task); for (const proxyName of shuffleArray(proxiesNames)) { if (await this.getProxyAvailability(proxyName, task)) { - return await this.proxyRequest(url, proxyName, method); + return await this.proxyRequest(url, proxyName, task, method); } } throw new NetSchedulerError("No available proxy currently.", "NO_AVAILABLE_PROXY"); @@ -186,8 +186,9 @@ class NetScheduler { if (!force) { const isAvailable = await this.getProxyAvailability(proxyName, task); + const limiter = "proxy-" + proxyName + "-" + task if (!isAvailable) { - throw new NetSchedulerError(`Proxy "${proxyName}" is rate limited`, "PROXY_RATE_LIMITED"); + throw new NetSchedulerError(`Proxy "${limiter}" is rate limited`, "PROXY_RATE_LIMITED"); } } @@ -225,7 +226,7 @@ class NetScheduler { logger.error(error, "redis"); return false; } - logger.warn(`Unhandled error: ${error.message}`, "mq", "getProxyAvailability"); + logger.error(error, "mq", "getProxyAvailability"); return false; } } @@ -237,7 +238,7 @@ class NetScheduler { const response = await fetch(url, { method, - signal: controller.signal + signal: controller.signal, }); clearTimeout(timeout); @@ -281,7 +282,7 @@ const biliLimiterConfig: RateLimiterConfig[] = [ netScheduler.addProxy("native", "native", ""); netScheduler.addTask("getVideoInfo", "bilibili", "all"); netScheduler.addTask("getLatestVideos", "bilibili", "all"); -netScheduler.setTaskLimiter("getVideoInfo", videoInfoRateLimiterConfig) +netScheduler.setTaskLimiter("getVideoInfo", videoInfoRateLimiterConfig); netScheduler.setTaskLimiter("getLatestVideos", null); netScheduler.setProviderLimiter("bilibili", biliLimiterConfig); diff --git a/lib/mq/schema.ts b/lib/mq/schema.ts index 9b48e99..07e4033 100644 --- a/lib/mq/schema.ts +++ b/lib/mq/schema.ts @@ -9,4 +9,4 @@ export class WorkerError extends Error { this.service = service; this.rawError = rawError; } -} \ No newline at end of file +} diff --git a/lib/mq/slidingWindow.ts b/lib/mq/slidingWindow.ts index 049a9f0..499528f 100644 --- a/lib/mq/slidingWindow.ts +++ b/lib/mq/slidingWindow.ts @@ -21,7 +21,7 @@ export class SlidingWindow { async event(eventName: string): Promise { const now = Date.now(); const key = `cvsa:sliding_window:${eventName}`; - + const uniqueMember = `${now}-${Math.random()}`; // Add current timestamp to an ordered set await this.redis.zadd(key, now, uniqueMember); diff --git a/lib/mq/task/getVideoInfo.ts b/lib/mq/task/getVideoInfo.ts new file mode 100644 index 0000000..4d1c615 --- /dev/null +++ b/lib/mq/task/getVideoInfo.ts @@ -0,0 +1,40 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { getVideoInfo } from "lib/net/getVideoInfo.ts"; +import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts"; +import logger from "lib/log/logger.ts"; +import { ClassifyVideoQueue } from "lib/mq/index.ts"; +import { userExistsInBiliUsers, videoExistsInAllData } from "lib/db/allData.ts"; + +export async function insertVideoInfo(client: Client, aid: number) { + const videoExists = await videoExistsInAllData(client, aid); + if (videoExists) { + return; + } + const data = await getVideoInfo(aid); + if (data === null) { + return null; + } + const bvid = data.View.bvid; + const desc = data.View.desc; + const uid = data.View.owner.mid; + const tags = data.Tags + .filter((tag) => tag.tag_type in ["old_channel", "topic"]) + .map((tag) => tag.tag_name).join(","); + const title = data.View.title; + const published_at = formatTimestampToPsql(data.View.pubdate); + const duration = data.View.duration; + await client.queryObject( + `INSERT INTO all_data (aid, bvid, description, uid, tags, title, published_at, duration) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8)`, + [aid, bvid, desc, uid, tags, title, published_at, duration], + ); + const userExists = await userExistsInBiliUsers(client, aid); + if (!userExists) { + await client.queryObject( + `INSERT INTO bili_user (uid, username, "desc", fans) VALUES ($1, $2, $3, $4)`, + [uid, data.View.owner.name, data.Card.card.sign, data.Card.follower], + ); + } + logger.log(`Inserted video metadata for aid: ${aid}`, "mq"); + await ClassifyVideoQueue.add("classifyVideo", { aid }); +} diff --git a/lib/mq/task/queueLatestVideo.ts b/lib/mq/task/queueLatestVideo.ts new file mode 100644 index 0000000..f688c2f --- /dev/null +++ b/lib/mq/task/queueLatestVideo.ts @@ -0,0 +1,55 @@ +import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; +import { getLatestVideoAids } from "lib/net/getLatestVideoAids.ts"; +import { videoExistsInAllData } from "lib/db/allData.ts"; +import { sleep } from "lib/utils/sleep.ts"; +import { SECOND } from "$std/datetime/constants.ts"; +import logger from "lib/log/logger.ts"; +import { LatestVideosQueue } from "lib/mq/index.ts"; + +export async function queueLatestVideos( + client: Client, +): Promise { + let page = 1; + let i = 0; + const videosFound = new Set(); + while (true) { + const pageSize = page == 1 ? 10 : 30; + const aids = await getLatestVideoAids(page, pageSize); + if (aids.length == 0) { + logger.verbose("No more videos found", "net", "fn:insertLatestVideos()"); + break; + } + let allExists = true; + let delay = 0; + for (const aid of aids) { + const videoExists = await videoExistsInAllData(client, aid); + if (videoExists) { + continue; + } + await LatestVideosQueue.add("getVideoInfo", { aid }, { delay, + attempts: 100, + backoff: { + type: "fixed", + delay: SECOND * 5 + } + }); + videosFound.add(aid); + allExists = false; + delay += Math.random() * SECOND * 0.5; + } + i += aids.length; + logger.log( + `Page ${page} crawled, total: ${videosFound.size}/${i} videos added/observed.`, + "net", + "fn:queueLatestVideos()", + ); + if (allExists) { + return 0; + } + page++; + const randomTime = Math.random() * 4000; + const delta = SECOND; + await sleep(randomTime + delta); + } + return 0; +} diff --git a/lib/net/bilibili.d.ts b/lib/net/bilibili.d.ts index a0f682d..16c70a0 100644 --- a/lib/net/bilibili.d.ts +++ b/lib/net/bilibili.d.ts @@ -1,117 +1,224 @@ interface BaseResponse { - code: number; - message: string; - ttl: number; - data: T; + code: number; + message: string; + ttl: number; + data: T; } export type VideoListResponse = BaseResponse; +export type VideoDetailsResponse = BaseResponse; export type VideoTagsResponse = BaseResponse; +interface VideoDetailsData { + View: { + bvid: string; + aid: number; + videos: number; + tid: number; + tid_v2: number; + tname: string; + tname_v2: string; + copyright: number; + pic: string; + title: string; + pubdate: number; + ctime: number; + desc: string; + desc_v2: string; + state: number; + duration: number; + mission_id: number; + rights: VideoRights; + owner: { + mid: number; + name: string; + face: string; + }; + stat: VideoStats; + argue_info: { + argue_msg: string; + argue_type: number; + argue_link: string; + }; + dynamic: ""; + cid: number; + dimension: VideoDimension; + pages: VideoPage[]; + subtitle: { + allow_submit: number; + list: VideoSubTitle[]; + }; + staff: VideoStaff[]; + }; + Card: { + card: { + mid: number; + name: string; + sex: string; + face: string; + fans: number; + attention: number; + friend: number; + sign: string; + level_info: { + current_level: number; + }; + }; + archive_count: number; + article_count: number; + follower: number; + like_num: number; + }; + Tags: VideoTagsLite[]; +} + +interface VideoTagsLite { + tag_id: number; + tag_name: string; + music_id: string; + tag_type: string; + jump_url: string; +} + type VideoTagsData = VideoTags[]; +type VideoStaff = { + mid: number; + title: string; + name: string; + face: string; + follower: number; +}; + +type VideoSubTitle = { + id: number; + lan: string; + lan_doc: string; + is_lock: number; + subtitle_url: string; + type: number; + id_str: string; + ai_type: number; + ai_status: number; +}; + +type VideoDimension = { + width: number; + height: number; + rotate: number; +}; + +interface VideoPage { + cid: number; + page: number; + from: string; + part: string; + duration: number; + vid: string; + weblink: string; + dimension: VideoDimension; + first_frame: string; +} + interface VideoTags { - tag_id: number; - tag_name: string; - cover: string; - head_cover: string; - content: string; - short_content: string; - type: number; - state: number; - ctime: number; - count: { - view: number; - use: number; - atten: number; - } - is_atten: number; - likes: number; - hates: number; - attribute: number; - liked: number; - hated: number; - extra_attr: number; + tag_id: number; + tag_name: string; + cover: string; + head_cover: string; + content: string; + short_content: string; + type: number; + state: number; + ctime: number; + count: { + view: number; + use: number; + atten: number; + }; + is_atten: number; + likes: number; + hates: number; + attribute: number; + liked: number; + hated: number; + extra_attr: number; } interface VideoListData { - archives: VideoListVideo[]; - page: { - num: number; - size: number; - count: number; - }; + archives: VideoListVideo[]; + page: { + num: number; + size: number; + count: number; + }; } +type VideoRights = { + bp: number; + elec: number; + download: number; + movie: number; + pay: number; + hd5: number; + no_reprint: number; + autoplay: number; + ugc_pay: number; + is_cooperation: number; + ugc_pay_preview: number; + no_background: number; + arc_pay: number; + pay_free_watch: number; +}; + +type VideoStats = { + aid: number; + view: number; + danmaku: number; + reply: number; + favorite: number; + coin: number; + share: number; + now_rank: number; + his_rank: number; + like: number; +}; + interface VideoListVideo { - aid: number; - videos: number; - tid: number; - tname: string; - copyright: number; - pic: string; - title: string; - pubdate: number; - ctime: number; - desc: string; - state: number; - duration: number; - mission_id?: number; - rights: { - bp: number; - elec: number; - download: number; - movie: number; - pay: number; - hd5: number; - no_reprint: number; - autoplay: number; - ugc_pay: number; - is_cooperation: number; - ugc_pay_preview: number; - no_background: number; - arc_pay: number; - pay_free_watch: number; - }, - owner: { - mid: number; - name: string; - face: string; - }, - stat: { - aid: number; - view: number; - danmaku: number; - reply: number; - favorite: number; - coin: number; - share: number; - now_rank: number; - his_rank: number; - like: number; - dislike: number; - vt: number; - vv: number; - }, - dynamic: string; - cid: number; - dimension: { - width: number; - height: number; - rotate: number; - }, - season_id?: number; - short_link_v2: string; - first_frame: string; - pub_location: string; - cover43: string; - tidv2: number; - tname_v2: string; - bvid: string; - season_type: number; - is_ogv: number; - ovg_info: string | null; - rcmd_season: string; - enable_vt: number; - ai_rcmd: null | string; + aid: number; + videos: number; + tid: number; + tname: string; + copyright: number; + pic: string; + title: string; + pubdate: number; + ctime: number; + desc: string; + state: number; + duration: number; + mission_id?: number; + rights: VideoRights; + owner: { + mid: number; + name: string; + face: string; + }; + stat: VideoStats; + dynamic: string; + cid: number; + dimension: VideoDimension; + season_id?: number; + short_link_v2: string; + first_frame: string; + pub_location: string; + cover43: string; + tidv2: number; + tname_v2: string; + bvid: string; + season_type: number; + is_ogv: number; + ovg_info: string | null; + rcmd_season: string; + enable_vt: number; + ai_rcmd: null | string; } diff --git a/lib/net/bisectVideoStartFrom.ts b/lib/net/bisectVideoStartFrom.ts deleted file mode 100644 index 66d9c27..0000000 --- a/lib/net/bisectVideoStartFrom.ts +++ /dev/null @@ -1,88 +0,0 @@ -import { getLatestVideos } from "lib/net/getLatestVideos.ts"; -import { SECOND } from "$std/datetime/constants.ts"; -import { VideoListVideo } from "lib/net/bilibili.d.ts"; - -export async function getVideoPositionInNewList(timestamp: number): Promise { - const virtualPageSize = 50; - - let lowPage = 1; - let highPage = 1; - let foundUpper = false; - while (true) { - const ps = highPage < 2 ? 50 : 1 - const pn = highPage < 2 ? 1 : highPage * virtualPageSize; - const videos = await getLatestVideos(pn, ps); - if (!videos || videos.length === 0) { - break; - } - const lastVideo = videos[videos.length - 1]; - if (!lastVideo || !lastVideo.pubdate) { - break; - } - const lastTime = lastVideo.pubdate * SECOND - if (lastTime <= timestamp && highPage == 1) { - return videos; - } - else if (lastTime <= timestamp) { - foundUpper = true; - break; - } else { - lowPage = highPage; - highPage *= 2; - } - } - - if (!foundUpper) { - return null; - } - - let boundaryPage = highPage; - let lo = lowPage; - let hi = highPage; - while (lo <= hi) { - const mid = Math.floor((lo + hi) / 2); - const videos = await getLatestVideos(mid * virtualPageSize, 1); - if (!videos) { - return null; - } - if (videos.length === 0) { - hi = mid - 1; - continue; - } - const lastVideo = videos[videos.length - 1]; - if (!lastVideo || !lastVideo.pubdate) { - hi = mid - 1; - continue; - } - const lastTime = lastVideo.pubdate * SECOND - if (lastTime > timestamp) { - lo = mid + 1; - } else { - boundaryPage = mid; - hi = mid - 1; - } - } - - const boundaryVideos = await getLatestVideos(boundaryPage, virtualPageSize); - let indexInPage = 0; - if (boundaryVideos && boundaryVideos.length > 0) { - for (let i = 0; i < boundaryVideos.length; i++) { - const video = boundaryVideos[i]; - if (!video.pubdate) { - continue; - } - const videoTime = video.pubdate * SECOND - if (videoTime > timestamp) { - indexInPage++; - } else { - break; - } - } - } - - const count = (boundaryPage - 1) * virtualPageSize + indexInPage; - - const safetyMargin = 5; - - return count + safetyMargin; -} diff --git a/lib/net/getLatestVideoAids.ts b/lib/net/getLatestVideoAids.ts new file mode 100644 index 0000000..2fb44be --- /dev/null +++ b/lib/net/getLatestVideoAids.ts @@ -0,0 +1,21 @@ +import { VideoListResponse } from "lib/net/bilibili.d.ts"; +import logger from "lib/log/logger.ts"; +import netScheduler from "lib/mq/scheduler.ts"; + +export async function getLatestVideoAids(page: number = 1, pageSize: number = 10): Promise { + const startFrom = 1 + pageSize * (page - 1); + const endTo = pageSize * page; + const range = `${startFrom}-${endTo}`; + const errMessage = `Error fetching latest aid for ${range}:`; + const url = `https://api.bilibili.com/x/web-interface/newlist?rid=30&ps=${pageSize}&pn=${page}`; + const data = await netScheduler.request(url, "getLatestVideos"); + if (data.code != 0) { + logger.error(errMessage + data.message, "net", "getLastestVideos"); + return []; + } + if (data.data.archives.length === 0) { + logger.verbose("No more videos found", "net", "getLatestVideos"); + return []; + } + return data.data.archives.map((video) => video.aid); +} diff --git a/lib/net/getLatestVideos.ts b/lib/net/getLatestVideos.ts deleted file mode 100644 index b41eae5..0000000 --- a/lib/net/getLatestVideos.ts +++ /dev/null @@ -1,36 +0,0 @@ -import {VideoListResponse } from "lib/net/bilibili.d.ts"; -import logger from "lib/log/logger.ts"; -import netScheduler, {NetSchedulerError} from "lib/mq/scheduler.ts"; - -export async function getLatestVideoAids(page: number = 1, pageSize: number = 10): Promise { - const startFrom = 1 + pageSize * (page - 1); - const endTo = pageSize * page; - const range = `${startFrom}-${endTo}` - const errMessage = `Error fetching latest aid for ${range}:` - try { - const url = `https://api.bilibili.com/x/web-interface/newlist?rid=30&ps=${pageSize}&pn=${page}`; - const data = await netScheduler.request(url, 'getLatestVideos'); - if (data.code != 0) { - logger.error(errMessage + data.message, 'net', 'getLastestVideos'); - return []; - } - if (data.data.archives.length === 0) { - logger.verbose("No more videos found", "net", "getLatestVideos"); - return []; - } - return data.data.archives.map(video => video.aid); - } - catch (e) { - const error = e as NetSchedulerError; - if (error.code == "FETCH_ERROR") { - const rawError = error.rawError! as Error; - rawError.message = errMessage + rawError.message; - logger.error(rawError, 'net', 'getVideoTags'); - return null; - } - else { - // Re-throw the error - throw e; - } - } -} diff --git a/lib/net/getVideoInfo.ts b/lib/net/getVideoInfo.ts new file mode 100644 index 0000000..e64a91b --- /dev/null +++ b/lib/net/getVideoInfo.ts @@ -0,0 +1,15 @@ +import netScheduler from "lib/mq/scheduler.ts"; +import { VideoDetailsData, VideoDetailsResponse } from "lib/net/bilibili.d.ts"; +import logger from "lib/log/logger.ts"; + +export async function getVideoInfo(aid: number): Promise { + const url = `https://api.bilibili.com/x/web-interface/view/detail?aid=${aid}`; + const data = await netScheduler.request(url, "getVideoInfo"); + const errMessage = `Error fetching metadata for ${aid}:`; + logger.log("Fetching metadata for " + aid, "net", "fn:getVideoInfo"); + if (data.code !== 0) { + logger.error(errMessage + data.message, "net", "fn:getVideoInfo"); + return null; + } + return data.data; +} diff --git a/lib/net/getVideoTags.ts b/lib/net/getVideoTags.ts deleted file mode 100644 index 4ec0af6..0000000 --- a/lib/net/getVideoTags.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { VideoTagsResponse } from "lib/net/bilibili.d.ts"; -import netScheduler, {NetSchedulerError} from "lib/mq/scheduler.ts"; -import logger from "lib/log/logger.ts"; - -/* - * Fetch the tags for a video - * @param {number} aid The video's aid - * @return {Promise} A promise, which resolves to an array of tags, - * or null if an `fetch` error occurred - * @throws {NetSchedulerError} If the request failed. - */ -export async function getVideoTags(aid: number): Promise { - try { - const url = `https://api.bilibili.com/x/tag/archive/tags?aid=${aid}`; - const data = await netScheduler.request(url, 'getVideoTags'); - if (data.code != 0) { - logger.error(`Error fetching tags for video ${aid}: ${data.message}`, 'net', 'getVideoTags'); - return []; - } - return data.data.map((tag) => tag.tag_name); - } - catch (e) { - const error = e as NetSchedulerError; - if (error.code == "FETCH_ERROR") { - const rawError = error.rawError! as Error; - rawError.message = `Error fetching tags for video ${aid}: ` + rawError.message; - logger.error(rawError, 'net', 'getVideoTags'); - return null; - } - else { - // Re-throw the error - throw e; - } - } -} diff --git a/lib/task/insertLatestVideo.ts b/lib/task/insertLatestVideo.ts deleted file mode 100644 index e6b750b..0000000 --- a/lib/task/insertLatestVideo.ts +++ /dev/null @@ -1,76 +0,0 @@ -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { getLatestVideos } from "lib/net/getLatestVideos.ts"; -import { getLatestVideoTimestampFromAllData, insertIntoAllData, videoExistsInAllData } from "lib/db/allData.ts"; -import { sleep } from "lib/utils/sleep.ts"; -import { getVideoPositionInNewList } from "lib/net/bisectVideoStartFrom.ts"; -import { SECOND } from "$std/datetime/constants.ts"; -import logger from "lib/log/logger.ts"; - -export async function insertLatestVideos( - client: Client, - pageSize: number = 10, - intervalRate: number = 4000, -): Promise { - const latestVideoTimestamp = await getLatestVideoTimestampFromAllData(client); - if (latestVideoTimestamp == null) { - logger.error("Cannot get latest video timestamp from current database.", "net", "fn:insertLatestVideos()"); - return null - } - logger.log(`Latest video in the database: ${new Date(latestVideoTimestamp).toISOString()}`, "net", "fn:insertLatestVideos()") - const videoIndex = await getVideoPositionInNewList(latestVideoTimestamp); - if (videoIndex == null) { - logger.error("Cannot locate the video through bisect.", "net", "fn:insertLatestVideos()"); - return null - } - if (typeof videoIndex == "object") { - for (const video of videoIndex) { - const videoExists = await videoExistsInAllData(client, video.aid); - if (!videoExists) { - await insertIntoAllData(client, video); - } - } - return 0; - } - let page = Math.floor(videoIndex / pageSize) + 1; - let failCount = 0; - const insertedVideos = new Set(); - while (true) { - try { - const videos = await getLatestVideos(page, pageSize); - if (videos == null) { - failCount++; - if (failCount > 5) { - return null; - } - continue; - } - failCount = 0; - if (videos.length == 0) { - logger.verbose("No more videos found", "net", "fn:insertLatestVideos()"); - break; - } - for (const video of videos) { - const videoExists = await videoExistsInAllData(client, video.aid); - if (!videoExists) { - await insertIntoAllData(client, video); - insertedVideos.add(video.aid); - } - } - logger.log(`Page ${page} crawled, total: ${insertedVideos.size} videos.`, "net", "fn:insertLatestVideos()"); - page--; - if (page < 1) { - return 0; - } - } catch (error) { - logger.error(error as Error, "net", "fn:insertLatestVideos()"); - failCount++; - if (failCount > 5) { - return null; - } - - } finally { - await sleep(Math.random() * intervalRate + failCount * 3 * SECOND + SECOND); - } - } - return 0; -} diff --git a/lib/utils/sleep.ts b/lib/utils/sleep.ts index 3a5dcb9..63e382d 100644 --- a/lib/utils/sleep.ts +++ b/lib/utils/sleep.ts @@ -1,3 +1,3 @@ export async function sleep(ms: number) { await new Promise((resolve) => setTimeout(resolve, ms)); -} \ No newline at end of file +} diff --git a/lib/utils/truncate.ts b/lib/utils/truncate.ts index 677978d..3d5800d 100644 --- a/lib/utils/truncate.ts +++ b/lib/utils/truncate.ts @@ -1,3 +1,3 @@ export function truncate(num: number, min: number, max: number) { - return Math.max(min, Math.min(num, max)) -} \ No newline at end of file + return Math.max(min, Math.min(num, max)); +} diff --git a/src/bullui.ts b/src/bullui.ts index 1850bac..407d1c5 100644 --- a/src/bullui.ts +++ b/src/bullui.ts @@ -2,13 +2,16 @@ import express from "express"; import { createBullBoard } from "@bull-board/api"; import { BullMQAdapter } from "@bull-board/api/bullMQAdapter.js"; import { ExpressAdapter } from "@bull-board/express"; -import { ClassifyVideoQueue, LatestVideosQueue, VideoTagsQueue } from "lib/mq/index.ts"; +import { ClassifyVideoQueue, LatestVideosQueue } from "lib/mq/index.ts"; const serverAdapter = new ExpressAdapter(); serverAdapter.setBasePath("/"); createBullBoard({ - queues: [new BullMQAdapter(LatestVideosQueue), new BullMQAdapter(VideoTagsQueue), new BullMQAdapter(ClassifyVideoQueue)], + queues: [ + new BullMQAdapter(LatestVideosQueue), + new BullMQAdapter(ClassifyVideoQueue), + ], serverAdapter: serverAdapter, }); @@ -16,8 +19,6 @@ const app = express(); app.use("/", serverAdapter.getRouter()); -// other configurations of your server - app.listen(3000, () => { console.log("Running on 3000..."); console.log("For the UI, open http://localhost:3000/"); diff --git a/src/filterWorker.ts b/src/filterWorker.ts index 9746477..8eb43d4 100644 --- a/src/filterWorker.ts +++ b/src/filterWorker.ts @@ -18,7 +18,6 @@ Deno.addSignalListener("SIGTERM", async () => { Deno.exit(); }); - await initializeModels(); const filterWorker = new Worker( @@ -45,6 +44,6 @@ filterWorker.on("error", (err) => { logger.error(e.rawError, e.service, e.codePath); }); -filterWorker.on("closed", async() => { +filterWorker.on("closed", async () => { await lockManager.releaseLock("classifyVideos"); -}) +}); diff --git a/src/worker.ts b/src/worker.ts index b24abed..fbe791c 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -2,22 +2,19 @@ import { Job, Worker } from "bullmq"; import { getLatestVideosWorker } from "lib/mq/executors.ts"; import { redis } from "lib/db/redis.ts"; import logger from "lib/log/logger.ts"; -import { getVideoTagsWorker } from "lib/mq/exec/getVideoTags.ts"; -import { getVideoTagsInitializer } from "lib/mq/exec/getVideoTags.ts"; import { lockManager } from "lib/mq/lockManager.ts"; import { WorkerError } from "lib/mq/schema.ts"; +import { getVideoInfoWorker } from "lib/mq/exec/getVideoInfo.ts"; Deno.addSignalListener("SIGINT", async () => { logger.log("SIGINT Received: Shutting down workers...", "mq"); await latestVideoWorker.close(true); - await videoTagsWorker.close(true); Deno.exit(); }); Deno.addSignalListener("SIGTERM", async () => { logger.log("SIGTERM Received: Shutting down workers...", "mq"); await latestVideoWorker.close(true); - await videoTagsWorker.close(true); Deno.exit(); }); @@ -28,11 +25,14 @@ const latestVideoWorker = new Worker( case "getLatestVideos": await getLatestVideosWorker(job); break; + case "getVideoInfo": + await getVideoInfoWorker(job); + break; default: break; } }, - { connection: redis, concurrency: 1, removeOnComplete: { count: 1440 } }, + { connection: redis, concurrency: 6, removeOnComplete: { count: 1440 } }, ); latestVideoWorker.on("active", () => { @@ -47,33 +47,3 @@ latestVideoWorker.on("error", (err) => { latestVideoWorker.on("closed", async () => { await lockManager.releaseLock("getLatestVideos"); }); - -const videoTagsWorker = new Worker( - "videoTags", - async (job: Job) => { - switch (job.name) { - case "getVideoTags": - return await getVideoTagsWorker(job); - case "getVideosTags": - return await getVideoTagsInitializer(); - default: - break; - } - }, - { - connection: redis, - concurrency: 6, - removeOnComplete: { - count: 1000, - }, - }, -); - -videoTagsWorker.on("active", () => { - logger.log("Worker (videoTags) activated.", "mq"); -}); - -videoTagsWorker.on("error", (err) => { - const e = err as WorkerError; - logger.error(e.rawError, e.service, e.codePath); -}); diff --git a/test/db/videoTagIsNull.test.ts b/test/db/videoTagIsNull.test.ts deleted file mode 100644 index 7ffc8cc..0000000 --- a/test/db/videoTagIsNull.test.ts +++ /dev/null @@ -1,33 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { videoTagsIsNull } from "lib/db/allData.ts"; -import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts"; -import { postgresConfig } from "lib/db/pgConfig.ts"; - -// A minimal aid which has an empty tags field in our database -const TEST_AID = 63569; - -Deno.test("videoTagsIsNull function", async () => { - const client = new Client(postgresConfig); - - try { - const transaction = client.createTransaction("test_transaction"); - await transaction.begin(); - - const result1 = await videoTagsIsNull(transaction, TEST_AID); - assertEquals(typeof result1, "boolean", "The result should be a boolean value."); - assertEquals(result1, false, "The result should be false if tags is not NULL for the given aid."); - - await transaction.queryArray`UPDATE all_data SET tags = NULL WHERE aid = ${TEST_AID}`; - - const result2 = await videoTagsIsNull(transaction, TEST_AID); - assertEquals(typeof result2, "boolean", "The result should be a boolean value."); - assertEquals(result2, true, "The result should be true if tags is NULL for the given aid."); - - await transaction.rollback(); - } catch (error) { - console.error("Error during test:", error); - throw error; - } finally { - client.end(); - } -}); diff --git a/test/mq/rateLimiter.test.ts b/test/mq/rateLimiter.test.ts index 054e945..2f19723 100644 --- a/test/mq/rateLimiter.test.ts +++ b/test/mq/rateLimiter.test.ts @@ -1,7 +1,7 @@ -import {assertEquals} from "jsr:@std/assert"; -import {SlidingWindow} from "lib/mq/slidingWindow.ts"; -import {RateLimiter, RateLimiterConfig} from "lib/mq/rateLimiter.ts"; -import {Redis} from "npm:ioredis@5.5.0"; +import { assertEquals } from "jsr:@std/assert"; +import { SlidingWindow } from "lib/mq/slidingWindow.ts"; +import { RateLimiter, RateLimiterConfig } from "lib/mq/rateLimiter.ts"; +import { Redis } from "npm:ioredis@5.5.0"; Deno.test("RateLimiter works correctly", async () => { const redis = new Redis({ maxRetriesPerRequest: null }); @@ -71,7 +71,7 @@ Deno.test("Multiple configs work correctly", async () => { await new Promise((resolve) => setTimeout(resolve, windowSize1 * 1000 + 500)); // Availability should now be true (due to config1) - assertEquals(await rateLimiter.getAvailability(), true); + assertEquals(await rateLimiter.getAvailability(), true); // Trigger events up to the limit of the second config for (let i = maxRequests1; i < maxRequests2; i++) { @@ -88,4 +88,4 @@ Deno.test("Multiple configs work correctly", async () => { assertEquals(await rateLimiter.getAvailability(), true); redis.quit(); -}); \ No newline at end of file +}); diff --git a/test/mq/slidingWindow.test.ts b/test/mq/slidingWindow.test.ts index cde8d11..a749edc 100644 --- a/test/mq/slidingWindow.test.ts +++ b/test/mq/slidingWindow.test.ts @@ -7,13 +7,13 @@ Deno.test("SlidingWindow - event and count", async () => { const windowSize = 5000; // 5 seconds const slidingWindow = new SlidingWindow(redis, windowSize); const eventName = "test_event"; - await slidingWindow.clear(eventName); + await slidingWindow.clear(eventName); await slidingWindow.event(eventName); const count = await slidingWindow.count(eventName); assertEquals(count, 1); - redis.quit(); + redis.quit(); }); Deno.test("SlidingWindow - multiple events", async () => { @@ -21,7 +21,7 @@ Deno.test("SlidingWindow - multiple events", async () => { const windowSize = 5000; // 5 seconds const slidingWindow = new SlidingWindow(redis, windowSize); const eventName = "test_event"; - await slidingWindow.clear(eventName); + await slidingWindow.clear(eventName); await slidingWindow.event(eventName); await slidingWindow.event(eventName); @@ -29,7 +29,7 @@ Deno.test("SlidingWindow - multiple events", async () => { const count = await slidingWindow.count(eventName); assertEquals(count, 3); - redis.quit(); + redis.quit(); }); Deno.test("SlidingWindow - no events", async () => { @@ -37,12 +37,12 @@ Deno.test("SlidingWindow - no events", async () => { const windowSize = 5000; // 5 seconds const slidingWindow = new SlidingWindow(redis, windowSize); const eventName = "test_event"; - await slidingWindow.clear(eventName); + await slidingWindow.clear(eventName); const count = await slidingWindow.count(eventName); assertEquals(count, 0); - redis.quit(); + redis.quit(); }); Deno.test("SlidingWindow - different event names", async () => { @@ -51,8 +51,8 @@ Deno.test("SlidingWindow - different event names", async () => { const slidingWindow = new SlidingWindow(redis, windowSize); const eventName1 = "test_event_1"; const eventName2 = "test_event_2"; - await slidingWindow.clear(eventName1); - await slidingWindow.clear(eventName2); + await slidingWindow.clear(eventName1); + await slidingWindow.clear(eventName2); await slidingWindow.event(eventName1); await slidingWindow.event(eventName2); @@ -62,7 +62,7 @@ Deno.test("SlidingWindow - different event names", async () => { assertEquals(count1, 1); assertEquals(count2, 1); - redis.quit(); + redis.quit(); }); Deno.test("SlidingWindow - large number of events", async () => { @@ -70,7 +70,7 @@ Deno.test("SlidingWindow - large number of events", async () => { const windowSize = 5000; // 5 seconds const slidingWindow = new SlidingWindow(redis, windowSize); const eventName = "test_event"; - await slidingWindow.clear(eventName); + await slidingWindow.clear(eventName); const numEvents = 1000; for (let i = 0; i < numEvents; i++) { @@ -80,5 +80,5 @@ Deno.test("SlidingWindow - large number of events", async () => { const count = await slidingWindow.count(eventName); assertEquals(count, numEvents); - redis.quit(); + redis.quit(); }); diff --git a/test/net/getLatestVideos.test.ts b/test/net/getLatestVideos.test.ts deleted file mode 100644 index b2daa4d..0000000 --- a/test/net/getLatestVideos.test.ts +++ /dev/null @@ -1,25 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { getLatestVideos } from "lib/net/getLatestVideos.ts"; - -Deno.test("Get latest videos", async () => { - const videos = (await getLatestVideos(1, 5))!; - assertEquals(videos.length, 5); - - videos.forEach((video) => { - assertVideoProperties(video); - }); -}); - -function assertVideoProperties(video: object) { - const aid = "aid" in video && typeof video.aid === "number"; - const bvid = "bvid" in video && typeof video.bvid === "string" && - video.bvid.length === 12 && video.bvid.startsWith("BV"); - const description = "description" in video && typeof video.description === "string"; - const uid = "uid" in video && typeof video.uid === "number"; - const tags = "tags" in video && (typeof video.tags === "string" || video.tags === null); - const title = "title" in video && typeof video.title === "string"; - const publishedAt = "published_at" in video && typeof video.published_at === "string"; - - const match = aid && bvid && description && uid && tags && title && publishedAt; - assertEquals(match, true); -} diff --git a/test/net/getVideoTags.test.ts b/test/net/getVideoTags.test.ts deleted file mode 100644 index 0487dfb..0000000 --- a/test/net/getVideoTags.test.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { assertEquals } from "jsr:@std/assert"; -import { getVideoTags } from "lib/net/getVideoTags.ts"; - -Deno.test("Get video tags - regular video", async () => { - const tags = (await getVideoTags(826597951))!.sort(); - assertEquals(tags, [ - "纯白P", - "中华墨水娘", - "中华少女", - "中华粘土娘", - "中华缘木娘", - "中华少女Project", - "提糯Tino", - "中华烛火娘", - "中华烁金娘", - "新世代音乐人计划女生季", - ].sort()); -}); - -Deno.test("Get video tags - non-existent video", async () => { - const tags = (await getVideoTags(8265979511111111)); - assertEquals(tags, []); -}); - -Deno.test("Get video tags - video with no tag", async () => { - const tags = (await getVideoTags(981001865)); - assertEquals(tags, []); -}); \ No newline at end of file