Compare commits
3 Commits
7337538f0b
...
879a6604e5
Author | SHA1 | Date | |
---|---|---|---|
879a6604e5 | |||
d88ad099c4 | |||
636c5e25cb |
12
.gitignore
vendored
12
.gitignore
vendored
@ -51,7 +51,6 @@ internal/
|
||||
!tests/cases/projects/projectOption/**/node_modules
|
||||
!tests/cases/projects/NodeModulesSearch/**/*
|
||||
!tests/baselines/reference/project/nodeModules*/**/*
|
||||
.idea
|
||||
yarn.lock
|
||||
yarn-error.log
|
||||
.parallelperf.*
|
||||
@ -78,10 +77,11 @@ node_modules/
|
||||
# project specific
|
||||
logs/
|
||||
__pycache__
|
||||
filter/runs
|
||||
pred/runs
|
||||
pred/checkpoints
|
||||
data/
|
||||
filter/checkpoints
|
||||
ml/filter/runs
|
||||
ml/pred/runs
|
||||
ml/pred/checkpoints
|
||||
ml/pred/observed
|
||||
ml/data/
|
||||
ml/filter/checkpoints
|
||||
scripts
|
||||
model/
|
||||
|
9
.idea/.gitignore
vendored
Normal file
9
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,9 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
dataSources.xml
|
21
.idea/cvsa.iml
Normal file
21
.idea/cvsa.iml
Normal file
@ -0,0 +1,21 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="WEB_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.tmp" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/temp" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/tmp" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/ml/data" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/doc" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/ml/filter/checkpoints" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/ml/filter/runs" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/ml/lab/data" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/ml/lab/temp" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/logs" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/model" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/src/db" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
12
.idea/inspectionProfiles/Project_Default.xml
Normal file
12
.idea/inspectionProfiles/Project_Default.xml
Normal file
@ -0,0 +1,12 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="GrazieInspection" enabled="false" level="GRAMMAR_ERROR" enabled_by_default="false" />
|
||||
<inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
|
||||
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
|
||||
<option name="processCode" value="true" />
|
||||
<option name="processLiterals" value="true" />
|
||||
<option name="processComments" value="true" />
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/cvsa.iml" filepath="$PROJECT_DIR$/.idea/cvsa.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
6
.idea/sqldialects.xml
Normal file
6
.idea/sqldialects.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="SqlDialectMappings">
|
||||
<file url="PROJECT" dialect="PostgreSQL" />
|
||||
</component>
|
||||
</project>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
65
README-refactor.md
Normal file
65
README-refactor.md
Normal file
@ -0,0 +1,65 @@
|
||||
# 项目重构方案
|
||||
|
||||
## 目标架构
|
||||
采用monorepo结构管理三个独立部分:
|
||||
1. `packages/crawler` - 现有爬虫功能
|
||||
2. `packages/frontend` - 基于Astro的前端
|
||||
3. `packages/backend` - 基于Hono的API后端
|
||||
|
||||
## 目录结构调整方案
|
||||
|
||||
### 新结构
|
||||
```
|
||||
.
|
||||
├── packages/
|
||||
│ ├── crawler/ # 爬虫组件
|
||||
│ ├── frontend/ # Astro前端
|
||||
│ ├── backend/ # Hono后端API
|
||||
│ └── core/ # 共享代码(未来提取)
|
||||
├── docs/ # 文档
|
||||
├── scripts/ # 项目脚本
|
||||
└── package.json # 根项目配置
|
||||
```
|
||||
|
||||
### 具体迁移方案
|
||||
|
||||
#### 1. 爬虫部分(crawler)
|
||||
保留以下目录/文件:
|
||||
- `lib/` (除前端相关)
|
||||
- `src/db/raw/`
|
||||
- `src/filterWorker.ts`
|
||||
- `src/worker.ts`
|
||||
- `test/`
|
||||
- `deno.json`
|
||||
- `.gitignore`
|
||||
|
||||
需要移除:
|
||||
- Fresh框架相关文件
|
||||
- 前端组件(`components/`)
|
||||
- 静态资源(`static/`)
|
||||
|
||||
#### 2. 前端部分(frontend)
|
||||
全新创建Astro项目,不保留任何现有前端代码
|
||||
|
||||
#### 3. 后端部分(backend)
|
||||
全新创建Hono项目
|
||||
|
||||
#### 4. 共享代码(core)
|
||||
未来可从爬虫中提取以下内容到core package:
|
||||
- 数据库相关:`lib/db/`
|
||||
- 消息队列:`lib/mq/`
|
||||
- 网络请求:`lib/net/`
|
||||
- 工具函数:`lib/utils/`
|
||||
|
||||
## 重构步骤建议
|
||||
|
||||
1. 初始化monorepo结构
|
||||
2. 迁移爬虫代码到`packages/crawler`
|
||||
3. 创建新的Astro项目在`packages/frontend`
|
||||
4. 创建新的Hono项目在`packages/backend`
|
||||
5. 逐步提取共享代码到`packages/core`
|
||||
|
||||
## 注意事项
|
||||
- 机器学习相关代码(`pred/`, `filter/`, `lab/`)保持现状
|
||||
- 文档(`doc/`)可以迁移到`docs/`目录
|
||||
- 需要更新CI/CD流程支持monorepo
|
@ -1,12 +0,0 @@
|
||||
import { JSX } from "preact";
|
||||
import { IS_BROWSER } from "$fresh/runtime.ts";
|
||||
|
||||
export function Button(props: JSX.HTMLAttributes<HTMLButtonElement>) {
|
||||
return (
|
||||
<button
|
||||
{...props}
|
||||
disabled={!IS_BROWSER || props.disabled}
|
||||
class="px-2 py-1 border-gray-500 border-2 rounded bg-white hover:bg-gray-200 transition-colors"
|
||||
/>
|
||||
);
|
||||
}
|
@ -1,55 +0,0 @@
|
||||
import json
|
||||
import random
|
||||
|
||||
def process_data(input_file, output_file):
|
||||
"""
|
||||
从输入文件中读取数据,找出model和human不一致的行,
|
||||
删除"model"键,将"human"键重命名为"label",
|
||||
然后将处理后的数据添加到输出文件中。
|
||||
在写入之前,它会加载output_file中的所有样本,
|
||||
并使用aid键进行去重过滤。
|
||||
|
||||
Args:
|
||||
input_file (str): 输入文件的路径。
|
||||
output_file (str): 输出文件的路径。
|
||||
"""
|
||||
|
||||
# 加载output_file中已有的数据,用于去重
|
||||
existing_data = set()
|
||||
try:
|
||||
with open(output_file, 'r', encoding='utf-8') as f_out:
|
||||
for line in f_out:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
existing_data.add(data['aid'])
|
||||
except json.JSONDecodeError:
|
||||
pass # 忽略JSON解码错误,继续读取下一行
|
||||
except FileNotFoundError:
|
||||
pass # 如果文件不存在,则忽略
|
||||
|
||||
with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'a', encoding='utf-8') as f_out:
|
||||
for line in f_in:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
|
||||
if data['model'] != data['human'] or random.random() < 0.2:
|
||||
if data['aid'] not in existing_data: # 检查aid是否已存在
|
||||
del data['model']
|
||||
data['label'] = data['human']
|
||||
del data['human']
|
||||
f_out.write(json.dumps(data, ensure_ascii=False) + '\n')
|
||||
existing_data.add(data['aid']) # 将新的aid添加到集合中
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"JSON解码错误: {e}")
|
||||
print(f"错误行内容: {line.strip()}")
|
||||
except KeyError as e:
|
||||
print(f"KeyError: 键 '{e}' 不存在")
|
||||
print(f"错误行内容: {line.strip()}")
|
||||
|
||||
# 调用函数处理数据
|
||||
input_file = 'real_test.jsonl'
|
||||
output_file = 'labeled_data.jsonl'
|
||||
process_data(input_file, output_file)
|
||||
print(f"处理完成,结果已写入 {output_file}")
|
||||
|
60
deno.json
60
deno.json
@ -1,60 +1,8 @@
|
||||
{
|
||||
"lock": false,
|
||||
"lock": false,
|
||||
"workspace": ["./packages/crawler", "./packages/frontend", "./packages/backend", "./packages/core"],
|
||||
"nodeModulesDir": "auto",
|
||||
"tasks": {
|
||||
"crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts",
|
||||
"crawl-bili-aids": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/fetchAids.ts",
|
||||
"check": "deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx",
|
||||
"cli": "echo \"import '\\$fresh/src/dev/cli.ts'\" | deno run --unstable -A -",
|
||||
"manifest": "deno task cli manifest $(pwd)",
|
||||
"start": "deno run -A --watch=static/,routes/ dev.ts",
|
||||
"build": "deno run -A dev.ts build",
|
||||
"preview": "deno run -A main.ts",
|
||||
"update": "deno run -A -r https://fresh.deno.dev/update .",
|
||||
"worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write --allow-run ./src/worker.ts",
|
||||
"worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts",
|
||||
"adder": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts",
|
||||
"bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts",
|
||||
"all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'",
|
||||
"test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run"
|
||||
},
|
||||
"lint": {
|
||||
"rules": {
|
||||
"tags": ["fresh", "recommended"]
|
||||
}
|
||||
},
|
||||
"exclude": ["**/_fresh/*"],
|
||||
"imports": {
|
||||
"@std/assert": "jsr:@std/assert@1",
|
||||
"$fresh/": "https://deno.land/x/fresh@1.7.3/",
|
||||
"preact": "https://esm.sh/preact@10.22.0",
|
||||
"preact/": "https://esm.sh/preact@10.22.0/",
|
||||
"@preact/signals": "https://esm.sh/*@preact/signals@1.2.2",
|
||||
"@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.5.1",
|
||||
"tailwindcss": "npm:tailwindcss@3.4.1",
|
||||
"tailwindcss/": "npm:/tailwindcss@3.4.1/",
|
||||
"tailwindcss/plugin": "npm:/tailwindcss@3.4.1/plugin.js",
|
||||
"$std/": "https://deno.land/std@0.216.0/",
|
||||
"@huggingface/transformers": "npm:@huggingface/transformers@3.0.0",
|
||||
"bullmq": "npm:bullmq",
|
||||
"lib/": "./lib/",
|
||||
"ioredis": "npm:ioredis",
|
||||
"@bull-board/api": "npm:@bull-board/api",
|
||||
"@bull-board/express": "npm:@bull-board/express",
|
||||
"express": "npm:express",
|
||||
"src/": "./src/",
|
||||
"onnxruntime": "npm:onnxruntime-node@1.19.2",
|
||||
"chalk": "npm:chalk"
|
||||
},
|
||||
"compilerOptions": {
|
||||
"jsx": "react-jsx",
|
||||
"jsxImportSource": "preact"
|
||||
},
|
||||
"nodeModulesDir": "auto",
|
||||
"fmt": {
|
||||
"useTabs": true,
|
||||
"lineWidth": 120,
|
||||
"indentWidth": 4,
|
||||
"semiColons": true,
|
||||
"proseWrap": "always"
|
||||
"crawler": "deno task --filter 'crawler' all"
|
||||
}
|
||||
}
|
||||
|
7
dev.ts
7
dev.ts
@ -1,7 +0,0 @@
|
||||
#!/usr/bin/env -S deno run -A --watch=static/,routes/
|
||||
|
||||
import dev from "$fresh/dev.ts";
|
||||
import config from "./fresh.config.ts";
|
||||
|
||||
import "$std/dotenv/load.ts";
|
||||
await dev(import.meta.url, "./main.ts", config);
|
@ -1,6 +0,0 @@
|
||||
import { defineConfig } from "$fresh/server.ts";
|
||||
import tailwind from "$fresh/plugins/tailwind.ts";
|
||||
|
||||
export default defineConfig({
|
||||
plugins: [tailwind()],
|
||||
});
|
27
fresh.gen.ts
27
fresh.gen.ts
@ -1,27 +0,0 @@
|
||||
// DO NOT EDIT. This file is generated by Fresh.
|
||||
// This file SHOULD be checked into source version control.
|
||||
// This file is automatically updated during development when running `dev.ts`.
|
||||
|
||||
import * as $_404 from "./routes/_404.tsx";
|
||||
import * as $_app from "./routes/_app.tsx";
|
||||
import * as $api_joke from "./routes/api/joke.ts";
|
||||
import * as $greet_name_ from "./routes/greet/[name].tsx";
|
||||
import * as $index from "./routes/index.tsx";
|
||||
import * as $Counter from "./islands/Counter.tsx";
|
||||
import type { Manifest } from "$fresh/server.ts";
|
||||
|
||||
const manifest = {
|
||||
routes: {
|
||||
"./routes/_404.tsx": $_404,
|
||||
"./routes/_app.tsx": $_app,
|
||||
"./routes/api/joke.ts": $api_joke,
|
||||
"./routes/greet/[name].tsx": $greet_name_,
|
||||
"./routes/index.tsx": $index,
|
||||
},
|
||||
islands: {
|
||||
"./islands/Counter.tsx": $Counter,
|
||||
},
|
||||
baseUrl: import.meta.url,
|
||||
} satisfies Manifest;
|
||||
|
||||
export default manifest;
|
@ -1,16 +0,0 @@
|
||||
import type { Signal } from "@preact/signals";
|
||||
import { Button } from "../components/Button.tsx";
|
||||
|
||||
interface CounterProps {
|
||||
count: Signal<number>;
|
||||
}
|
||||
|
||||
export default function Counter(props: CounterProps) {
|
||||
return (
|
||||
<div class="flex gap-8 py-6">
|
||||
<Button onClick={() => props.count.value -= 1}>-1</Button>
|
||||
<p class="text-3xl tabular-nums">{props.count}</p>
|
||||
<Button onClick={() => props.count.value += 1}>+1</Button>
|
||||
</div>
|
||||
);
|
||||
}
|
@ -1,22 +0,0 @@
|
||||
import { AIManager } from "lib/ml/manager.ts";
|
||||
import * as ort from "onnxruntime";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { WorkerError } from "lib/mq/schema.ts";
|
||||
|
||||
const modelPath = "./model/model.onnx";
|
||||
|
||||
class MantisProto extends AIManager {
|
||||
constructor() {
|
||||
super();
|
||||
this.models = {
|
||||
"predictor": modelPath,
|
||||
};
|
||||
}
|
||||
|
||||
public override async init(): Promise<void> {
|
||||
await super.init();
|
||||
}
|
||||
}
|
||||
|
||||
const Mantis = new MantisProto();
|
||||
export default Mantis;
|
@ -1 +0,0 @@
|
||||
export * from "lib/mq/exec/getLatestVideos.ts";
|
13
main.ts
13
main.ts
@ -1,13 +0,0 @@
|
||||
/// <reference no-default-lib="true" />
|
||||
/// <reference lib="dom" />
|
||||
/// <reference lib="dom.iterable" />
|
||||
/// <reference lib="dom.asynciterable" />
|
||||
/// <reference lib="deno.ns" />
|
||||
|
||||
import "$std/dotenv/load.ts";
|
||||
|
||||
import { start } from "$fresh/server.ts";
|
||||
import manifest from "./fresh.gen.ts";
|
||||
import config from "./fresh.config.ts";
|
||||
|
||||
await start(manifest, config);
|
0
lab/.gitignore → ml/lab/.gitignore
vendored
0
lab/.gitignore → ml/lab/.gitignore
vendored
0
packages/backend/deno.json
Normal file
0
packages/backend/deno.json
Normal file
0
packages/core/deno.json
Normal file
0
packages/core/deno.json
Normal file
@ -1,6 +1,6 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { AllDataType, BiliUserType } from "lib/db/schema.d.ts";
|
||||
import Akari from "lib/ml/akari.ts";
|
||||
import { AllDataType, BiliUserType } from "db/schema.d.ts";
|
||||
import Akari from "ml/akari.ts";
|
||||
|
||||
export async function videoExistsInAllData(client: Client, aid: number) {
|
||||
return await client.queryObject<{ exists: boolean }>(
|
@ -1,5 +1,5 @@
|
||||
import { Pool } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { postgresConfig } from "lib/db/pgConfig.ts";
|
||||
import { postgresConfig } from "db/pgConfig.ts";
|
||||
|
||||
const pool = new Pool(postgresConfig, 12);
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { LatestSnapshotType } from "lib/db/schema.d.ts";
|
||||
import { LatestSnapshotType } from "db/schema.d.ts";
|
||||
|
||||
export async function getVideosNearMilestone(client: Client) {
|
||||
const queryResult = await client.queryObject<LatestSnapshotType>(`
|
@ -1,9 +1,9 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts";
|
||||
import { formatTimestampToPsql } from "utils/formatTimestampToPostgre.ts";
|
||||
import { SnapshotScheduleType } from "./schema.d.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { MINUTE } from "$std/datetime/constants.ts";
|
||||
import { redis } from "lib/db/redis.ts";
|
||||
import { redis } from "db/redis.ts";
|
||||
import { Redis } from "ioredis";
|
||||
|
||||
const REDIS_KEY = "cvsa:snapshot_window_counts";
|
@ -1,5 +1,5 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { parseTimestampFromPsql } from "lib/utils/formatTimestampToPostgre.ts";
|
||||
import { parseTimestampFromPsql } from "utils/formatTimestampToPostgre.ts";
|
||||
|
||||
export async function getNotCollectedSongs(client: Client) {
|
||||
const queryResult = await client.queryObject<{ aid: number }>(`
|
50
packages/crawler/deno.json
Normal file
50
packages/crawler/deno.json
Normal file
@ -0,0 +1,50 @@
|
||||
{
|
||||
"name": "@cvsa/crawler",
|
||||
"tasks": {
|
||||
"crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts",
|
||||
"crawl-bili-aids": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/fetchAids.ts",
|
||||
"check": "deno fmt --check && deno lint && deno check **/*.ts && deno check **/*.tsx",
|
||||
"manifest": "deno task cli manifest $(pwd)",
|
||||
"start": "deno run -A --watch=static/,routes/ dev.ts",
|
||||
"build": "deno run -A dev.ts build",
|
||||
"preview": "deno run -A main.ts",
|
||||
"worker:main": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write --allow-run ./src/worker.ts",
|
||||
"worker:filter": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net --allow-write ./src/filterWorker.ts",
|
||||
"adder": "deno run --env-file=.env --allow-env --allow-read --allow-ffi --allow-net ./src/jobAdder.ts",
|
||||
"bullui": "deno run --allow-read --allow-env --allow-ffi --allow-net ./src/bullui.ts",
|
||||
"all": "concurrently 'deno task worker:main' 'deno task adder' 'deno task bullui' 'deno task worker:filter'",
|
||||
"test": "deno test ./test/ --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run"
|
||||
},
|
||||
"lint": {
|
||||
"rules": {
|
||||
"tags": ["recommended"]
|
||||
}
|
||||
},
|
||||
"imports": {
|
||||
"@std/assert": "jsr:@std/assert@1",
|
||||
"$std/": "https://deno.land/std@0.216.0/",
|
||||
"@huggingface/transformers": "npm:@huggingface/transformers@3.0.0",
|
||||
"bullmq": "npm:bullmq",
|
||||
"mq/": "./mq/",
|
||||
"db/": "./db/",
|
||||
"log/": "./log/",
|
||||
"net/": "./net/",
|
||||
"ml/": "./ml/",
|
||||
"utils/": "./utils/",
|
||||
"ioredis": "npm:ioredis",
|
||||
"@bull-board/api": "npm:@bull-board/api",
|
||||
"@bull-board/express": "npm:@bull-board/express",
|
||||
"express": "npm:express",
|
||||
"src/": "./src/",
|
||||
"onnxruntime": "npm:onnxruntime-node@1.19.2",
|
||||
"chalk": "npm:chalk"
|
||||
},
|
||||
"fmt": {
|
||||
"useTabs": true,
|
||||
"lineWidth": 120,
|
||||
"indentWidth": 4,
|
||||
"semiColons": true,
|
||||
"proseWrap": "always"
|
||||
},
|
||||
"exports": "./main.ts"
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
import logger from "lib/log/logger.ts";
|
||||
import logger from "log/logger.ts";
|
||||
|
||||
logger.error(Error("test error"), "test service");
|
||||
logger.debug(`some string`);
|
7
packages/crawler/main.ts
Normal file
7
packages/crawler/main.ts
Normal file
@ -0,0 +1,7 @@
|
||||
// DENO ASK ME TO EXPORT SOMETHING WHEN 'name' IS SPECIFIED
|
||||
// AND IF I DON'T SPECIFY 'name', THE --filter FLAG IN `deno task` WON'T WORK.
|
||||
// I DONT'T KNOW WHY
|
||||
// SO HERE'S A PLACHOLDER EXPORT FOR DENO:
|
||||
export const DENO = "FUCK YOU DENO";
|
||||
// Oh, maybe export the version is a good idea
|
||||
export const VERSION = "1.0.12";
|
@ -1,7 +1,7 @@
|
||||
import { AIManager } from "lib/ml/manager.ts";
|
||||
import { AIManager } from "ml/manager.ts";
|
||||
import * as ort from "onnxruntime";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { WorkerError } from "lib/mq/schema.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { WorkerError } from "mq/schema.ts";
|
||||
import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers";
|
||||
|
||||
const tokenizerModel = "alikia2x/jina-embedding-v3-m2v-1024";
|
@ -1,6 +1,6 @@
|
||||
import * as ort from "onnxruntime";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { WorkerError } from "lib/mq/schema.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { WorkerError } from "mq/schema.ts";
|
||||
|
||||
export class AIManager {
|
||||
public sessions: { [key: string]: ort.InferenceSession } = {};
|
@ -1,13 +1,13 @@
|
||||
import { Job } from "bullmq";
|
||||
import { db } from "lib/db/init.ts";
|
||||
import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel } from "lib/db/allData.ts";
|
||||
import Akari from "lib/ml/akari.ts";
|
||||
import { ClassifyVideoQueue } from "lib/mq/index.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { lockManager } from "lib/mq/lockManager.ts";
|
||||
import { aidExistsInSongs } from "lib/db/songs.ts";
|
||||
import { insertIntoSongs } from "lib/mq/task/collectSongs.ts";
|
||||
import { scheduleSnapshot } from "lib/db/snapshotSchedule.ts";
|
||||
import { db } from "db/init.ts";
|
||||
import { getUnlabelledVideos, getVideoInfoFromAllData, insertVideoLabel } from "db/allData.ts";
|
||||
import Akari from "ml/akari.ts";
|
||||
import { ClassifyVideoQueue } from "mq/index.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { lockManager } from "mq/lockManager.ts";
|
||||
import { aidExistsInSongs } from "db/songs.ts";
|
||||
import { insertIntoSongs } from "mq/task/collectSongs.ts";
|
||||
import { scheduleSnapshot } from "db/snapshotSchedule.ts";
|
||||
import { MINUTE } from "$std/datetime/constants.ts";
|
||||
|
||||
export const classifyVideoWorker = async (job: Job) => {
|
@ -1,8 +1,8 @@
|
||||
import { Job } from "bullmq";
|
||||
import { queueLatestVideos } from "lib/mq/task/queueLatestVideo.ts";
|
||||
import { db } from "lib/db/init.ts";
|
||||
import { insertVideoInfo } from "lib/mq/task/getVideoDetails.ts";
|
||||
import { collectSongs } from "lib/mq/task/collectSongs.ts";
|
||||
import { queueLatestVideos } from "mq/task/queueLatestVideo.ts";
|
||||
import { db } from "db/init.ts";
|
||||
import { insertVideoInfo } from "mq/task/getVideoDetails.ts";
|
||||
import { collectSongs } from "mq/task/collectSongs.ts";
|
||||
|
||||
export const getLatestVideosWorker = async (_job: Job): Promise<void> => {
|
||||
const client = await db.connect();
|
@ -1,6 +1,6 @@
|
||||
import { Job } from "bullmq";
|
||||
import { db } from "lib/db/init.ts";
|
||||
import { getLatestVideoSnapshot, getVideosNearMilestone } from "lib/db/snapshot.ts";
|
||||
import { db } from "db/init.ts";
|
||||
import { getLatestVideoSnapshot, getVideosNearMilestone } from "db/snapshot.ts";
|
||||
import {
|
||||
bulkGetVideosWithoutProcessingSchedules,
|
||||
bulkScheduleSnapshot,
|
||||
@ -16,18 +16,18 @@ import {
|
||||
snapshotScheduleExists,
|
||||
videoHasProcessingSchedule,
|
||||
getBulkSnapshotsInNextSecond
|
||||
} from "lib/db/snapshotSchedule.ts";
|
||||
} from "db/snapshotSchedule.ts";
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { HOUR, MINUTE, SECOND, WEEK } from "$std/datetime/constants.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { SnapshotQueue } from "lib/mq/index.ts";
|
||||
import { insertVideoSnapshot } from "lib/mq/task/getVideoStats.ts";
|
||||
import { NetSchedulerError } from "lib/mq/scheduler.ts";
|
||||
import { getBiliVideoStatus, setBiliVideoStatus } from "lib/db/allData.ts";
|
||||
import { truncate } from "lib/utils/truncate.ts";
|
||||
import { lockManager } from "lib/mq/lockManager.ts";
|
||||
import { getSongsPublihsedAt } from "lib/db/songs.ts";
|
||||
import { bulkGetVideoStats } from "lib/net/bulkGetVideoStats.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { SnapshotQueue } from "mq/index.ts";
|
||||
import { insertVideoSnapshot } from "mq/task/getVideoStats.ts";
|
||||
import { NetSchedulerError } from "mq/scheduler.ts";
|
||||
import { getBiliVideoStatus, setBiliVideoStatus } from "db/allData.ts";
|
||||
import { truncate } from "utils/truncate.ts";
|
||||
import { lockManager } from "mq/lockManager.ts";
|
||||
import { getSongsPublihsedAt } from "db/songs.ts";
|
||||
import { bulkGetVideoStats } from "net/bulkGetVideoStats.ts";
|
||||
|
||||
const priorityMap: { [key: string]: number } = {
|
||||
"milestone": 1,
|
1
packages/crawler/mq/executors.ts
Normal file
1
packages/crawler/mq/executors.ts
Normal file
@ -0,0 +1 @@
|
||||
export * from "mq/exec/getLatestVideos.ts";
|
@ -1,9 +1,9 @@
|
||||
import { MINUTE, SECOND } from "$std/datetime/constants.ts";
|
||||
import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "lib/mq/index.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { initSnapshotWindowCounts } from "lib/db/snapshotSchedule.ts";
|
||||
import { db } from "lib/db/init.ts";
|
||||
import { redis } from "lib/db/redis.ts";
|
||||
import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "mq/index.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { initSnapshotWindowCounts } from "db/snapshotSchedule.ts";
|
||||
import { db } from "db/init.ts";
|
||||
import { redis } from "db/redis.ts";
|
||||
|
||||
export async function initMQ() {
|
||||
const client = await db.connect();
|
@ -1,5 +1,5 @@
|
||||
import { Redis } from "ioredis";
|
||||
import { redis } from "lib/db/redis.ts";
|
||||
import { redis } from "db/redis.ts";
|
||||
|
||||
class LockManager {
|
||||
private redis: Redis;
|
@ -1,4 +1,4 @@
|
||||
import { SlidingWindow } from "lib/mq/slidingWindow.ts";
|
||||
import { SlidingWindow } from "mq/slidingWindow.ts";
|
||||
|
||||
export interface RateLimiterConfig {
|
||||
window: SlidingWindow;
|
@ -1,7 +1,7 @@
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { RateLimiter, RateLimiterConfig } from "lib/mq/rateLimiter.ts";
|
||||
import { SlidingWindow } from "lib/mq/slidingWindow.ts";
|
||||
import { redis } from "lib/db/redis.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { RateLimiter, RateLimiterConfig } from "mq/rateLimiter.ts";
|
||||
import { SlidingWindow } from "mq/slidingWindow.ts";
|
||||
import { redis } from "db/redis.ts";
|
||||
import Redis from "ioredis";
|
||||
import { SECOND } from "$std/datetime/constants.ts";
|
||||
|
@ -1,7 +1,7 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { aidExistsInSongs, getNotCollectedSongs } from "lib/db/songs.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { scheduleSnapshot } from "lib/db/snapshotSchedule.ts";
|
||||
import { aidExistsInSongs, getNotCollectedSongs } from "db/songs.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { scheduleSnapshot } from "db/snapshotSchedule.ts";
|
||||
import { MINUTE } from "$std/datetime/constants.ts";
|
||||
|
||||
export async function collectSongs(client: Client) {
|
@ -1,9 +1,9 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { getVideoDetails } from "lib/net/getVideoDetails.ts";
|
||||
import { formatTimestampToPsql } from "lib/utils/formatTimestampToPostgre.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { ClassifyVideoQueue } from "lib/mq/index.ts";
|
||||
import { userExistsInBiliUsers, videoExistsInAllData } from "lib/db/allData.ts";
|
||||
import { getVideoDetails } from "net/getVideoDetails.ts";
|
||||
import { formatTimestampToPsql } from "utils/formatTimestampToPostgre.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { ClassifyVideoQueue } from "mq/index.ts";
|
||||
import { userExistsInBiliUsers, videoExistsInAllData } from "db/allData.ts";
|
||||
import { HOUR, SECOND } from "$std/datetime/constants.ts";
|
||||
|
||||
export async function insertVideoInfo(client: Client, aid: number) {
|
@ -1,7 +1,7 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { getVideoInfo } from "lib/net/getVideoInfo.ts";
|
||||
import { LatestSnapshotType } from "lib/db/schema.d.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { getVideoInfo } from "net/getVideoInfo.ts";
|
||||
import { LatestSnapshotType } from "db/schema.d.ts";
|
||||
import logger from "log/logger.ts";
|
||||
|
||||
/*
|
||||
* Fetch video stats from bilibili API and insert into database
|
@ -1,10 +1,10 @@
|
||||
import { Client } from "https://deno.land/x/postgres@v0.19.3/mod.ts";
|
||||
import { getLatestVideoAids } from "lib/net/getLatestVideoAids.ts";
|
||||
import { videoExistsInAllData } from "lib/db/allData.ts";
|
||||
import { sleep } from "lib/utils/sleep.ts";
|
||||
import { getLatestVideoAids } from "net/getLatestVideoAids.ts";
|
||||
import { videoExistsInAllData } from "db/allData.ts";
|
||||
import { sleep } from "utils/sleep.ts";
|
||||
import { SECOND } from "$std/datetime/constants.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { LatestVideosQueue } from "lib/mq/index.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { LatestVideosQueue } from "mq/index.ts";
|
||||
|
||||
export async function queueLatestVideos(
|
||||
client: Client,
|
@ -1,6 +1,6 @@
|
||||
import netScheduler from "lib/mq/scheduler.ts";
|
||||
import { MediaListInfoData, MediaListInfoResponse } from "lib/net/bilibili.d.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import netScheduler from "mq/scheduler.ts";
|
||||
import { MediaListInfoData, MediaListInfoResponse } from "net/bilibili.d.ts";
|
||||
import logger from "log/logger.ts";
|
||||
|
||||
/*
|
||||
* Bulk fetch video metadata from bilibili API
|
@ -1,6 +1,6 @@
|
||||
import { VideoListResponse } from "lib/net/bilibili.d.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import netScheduler from "lib/mq/scheduler.ts";
|
||||
import { VideoListResponse } from "net/bilibili.d.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import netScheduler from "mq/scheduler.ts";
|
||||
|
||||
export async function getLatestVideoAids(page: number = 1, pageSize: number = 10): Promise<number[]> {
|
||||
const startFrom = 1 + pageSize * (page - 1);
|
@ -1,6 +1,6 @@
|
||||
import netScheduler from "lib/mq/scheduler.ts";
|
||||
import { VideoDetailsData, VideoDetailsResponse } from "lib/net/bilibili.d.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import netScheduler from "mq/scheduler.ts";
|
||||
import { VideoDetailsData, VideoDetailsResponse } from "net/bilibili.d.ts";
|
||||
import logger from "log/logger.ts";
|
||||
|
||||
export async function getVideoDetails(aid: number): Promise<VideoDetailsData | null> {
|
||||
const url = `https://api.bilibili.com/x/web-interface/view/detail?aid=${aid}`;
|
@ -1,6 +1,6 @@
|
||||
import netScheduler from "lib/mq/scheduler.ts";
|
||||
import { VideoInfoData, VideoInfoResponse } from "lib/net/bilibili.d.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import netScheduler from "mq/scheduler.ts";
|
||||
import { VideoInfoData, VideoInfoResponse } from "net/bilibili.d.ts";
|
||||
import logger from "log/logger.ts";
|
||||
|
||||
/*
|
||||
* Fetch video metadata from bilibili API
|
@ -2,7 +2,7 @@ import express from "express";
|
||||
import { createBullBoard } from "@bull-board/api";
|
||||
import { BullMQAdapter } from "@bull-board/api/bullMQAdapter.js";
|
||||
import { ExpressAdapter } from "@bull-board/express";
|
||||
import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "lib/mq/index.ts";
|
||||
import { ClassifyVideoQueue, LatestVideosQueue, SnapshotQueue } from "mq/index.ts";
|
||||
|
||||
const serverAdapter = new ExpressAdapter();
|
||||
serverAdapter.setBasePath("/");
|
@ -1,10 +1,10 @@
|
||||
import { ConnectionOptions, Job, Worker } from "bullmq";
|
||||
import { redis } from "lib/db/redis.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { classifyVideosWorker, classifyVideoWorker } from "lib/mq/exec/classifyVideo.ts";
|
||||
import { WorkerError } from "lib/mq/schema.ts";
|
||||
import { lockManager } from "lib/mq/lockManager.ts";
|
||||
import Akari from "lib/ml/akari.ts";
|
||||
import { redis } from "db/redis.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { classifyVideosWorker, classifyVideoWorker } from "mq/exec/classifyVideo.ts";
|
||||
import { WorkerError } from "mq/schema.ts";
|
||||
import { lockManager } from "mq/lockManager.ts";
|
||||
import Akari from "ml/akari.ts";
|
||||
|
||||
Deno.addSignalListener("SIGINT", async () => {
|
||||
logger.log("SIGINT Received: Shutting down workers...", "mq");
|
3
packages/crawler/src/jobAdder.ts
Normal file
3
packages/crawler/src/jobAdder.ts
Normal file
@ -0,0 +1,3 @@
|
||||
import { initMQ } from "mq/init.ts";
|
||||
|
||||
await initMQ();
|
@ -1,10 +1,10 @@
|
||||
import { ConnectionOptions, Job, Worker } from "bullmq";
|
||||
import { collectSongsWorker, getLatestVideosWorker } from "lib/mq/executors.ts";
|
||||
import { redis } from "lib/db/redis.ts";
|
||||
import logger from "lib/log/logger.ts";
|
||||
import { lockManager } from "lib/mq/lockManager.ts";
|
||||
import { WorkerError } from "lib/mq/schema.ts";
|
||||
import { getVideoInfoWorker } from "lib/mq/exec/getLatestVideos.ts";
|
||||
import { collectSongsWorker, getLatestVideosWorker } from "mq/executors.ts";
|
||||
import { redis } from "db/redis.ts";
|
||||
import logger from "log/logger.ts";
|
||||
import { lockManager } from "mq/lockManager.ts";
|
||||
import { WorkerError } from "mq/schema.ts";
|
||||
import { getVideoInfoWorker } from "mq/exec/getLatestVideos.ts";
|
||||
import {
|
||||
collectMilestoneSnapshotsWorker,
|
||||
regularSnapshotsWorker,
|
||||
@ -13,7 +13,7 @@ import {
|
||||
scheduleCleanupWorker,
|
||||
takeBulkSnapshotForVideosWorker,
|
||||
bulkSnapshotTickWorker
|
||||
} from "lib/mq/exec/snapshotTick.ts";
|
||||
} from "mq/exec/snapshotTick.ts";
|
||||
|
||||
Deno.addSignalListener("SIGINT", async () => {
|
||||
logger.log("SIGINT Received: Shutting down workers...", "mq");
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user