diff --git a/.gitignore b/.gitignore
index 31d6ddf..58df6d2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,7 +51,6 @@ internal/
!tests/cases/projects/projectOption/**/node_modules
!tests/cases/projects/NodeModulesSearch/**/*
!tests/baselines/reference/project/nodeModules*/**/*
-.idea
yarn.lock
yarn-error.log
.parallelperf.*
@@ -78,10 +77,10 @@ node_modules/
# project specific
logs/
__pycache__
-filter/runs
-pred/runs
-pred/checkpoints
-data/
-filter/checkpoints
+ml/filter/runs
+ml/pred/runs
+ml/pred/checkpoints
+ml/data/
+ml/filter/checkpoints
scripts
model/
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..518076d
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,9 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+dataSources.xml
\ No newline at end of file
diff --git a/.idea/cvsa.iml b/.idea/cvsa.iml
new file mode 100644
index 0000000..c155925
--- /dev/null
+++ b/.idea/cvsa.iml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..5535e8f
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..4552e71
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml
new file mode 100644
index 0000000..6df4889
--- /dev/null
+++ b/.idea/sqldialects.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/README-refactor.md b/README-refactor.md
new file mode 100644
index 0000000..75ffdb9
--- /dev/null
+++ b/README-refactor.md
@@ -0,0 +1,65 @@
+# 项目重构方案
+
+## 目标架构
+采用monorepo结构管理三个独立部分:
+1. `packages/crawler` - 现有爬虫功能
+2. `packages/frontend` - 基于Astro的前端
+3. `packages/backend` - 基于Hono的API后端
+
+## 目录结构调整方案
+
+### 新结构
+```
+.
+├── packages/
+│   ├── crawler/    # 爬虫组件
+│   ├── frontend/   # Astro前端
+│   ├── backend/    # Hono后端API
+│   └── core/       # 共享代码(未来提取)
+├── docs/ # 文档
+├── scripts/ # 项目脚本
+└── package.json # 根项目配置
+```
+
+### 具体迁移方案
+
+#### 1. 爬虫部分(crawler)
+保留以下目录/文件:
+- `lib/`(前端相关代码除外)
+- `src/db/raw/`
+- `src/filterWorker.ts`
+- `src/worker.ts`
+- `test/`
+- `deno.json`
+- `.gitignore`
+
+需要移除:
+- Fresh框架相关文件
+- 前端组件(`components/`)
+- 静态资源(`static/`)
+
+#### 2. 前端部分(frontend)
+全新创建Astro项目,不保留任何现有前端代码
+
+#### 3. 后端部分(backend)
+全新创建Hono项目
+
+#### 4. 共享代码(core)
+未来可从爬虫中提取以下内容到core package:
+- 数据库相关:`lib/db/`
+- 消息队列:`lib/mq/`
+- 网络请求:`lib/net/`
+- 工具函数:`lib/utils/`
+
+## 重构步骤建议
+
+1. 初始化monorepo结构
+2. 迁移爬虫代码到`packages/crawler`
+3. 在`packages/frontend`中创建新的Astro项目
+4. 在`packages/backend`中创建新的Hono项目
+5. 逐步提取共享代码到`packages/core`
+
+## 注意事项
+- 机器学习相关代码(`pred/`, `filter/`, `lab/`)统一移入`ml/`目录,代码内容保持不变
+- 文档(`doc/`)可以迁移到`docs/`目录
+- 需要更新CI/CD流程支持monorepo
\ No newline at end of file
diff --git a/components/Button.tsx b/components/Button.tsx
deleted file mode 100644
index 6e868c5..0000000
--- a/components/Button.tsx
+++ /dev/null
@@ -1,12 +0,0 @@
-import { JSX } from "preact";
-import { IS_BROWSER } from "$fresh/runtime.ts";
-
-export function Button(props: JSX.HTMLAttributes) {
- return (
-
- );
-}
diff --git a/data/filter/1.py b/data/filter/1.py
deleted file mode 100644
index a5dc97d..0000000
--- a/data/filter/1.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import json
-import random
-
-def process_data(input_file, output_file):
- """
- 从输入文件中读取数据,找出model和human不一致的行,
- 删除"model"键,将"human"键重命名为"label",
- 然后将处理后的数据添加到输出文件中。
- 在写入之前,它会加载output_file中的所有样本,
- 并使用aid键进行去重过滤。
-
- Args:
- input_file (str): 输入文件的路径。
- output_file (str): 输出文件的路径。
- """
-
- # 加载output_file中已有的数据,用于去重
- existing_data = set()
- try:
- with open(output_file, 'r', encoding='utf-8') as f_out:
- for line in f_out:
- try:
- data = json.loads(line)
- existing_data.add(data['aid'])
- except json.JSONDecodeError:
- pass # 忽略JSON解码错误,继续读取下一行
- except FileNotFoundError:
- pass # 如果文件不存在,则忽略
-
- with open(input_file, 'r', encoding='utf-8') as f_in, open(output_file, 'a', encoding='utf-8') as f_out:
- for line in f_in:
- try:
- data = json.loads(line)
-
- if data['model'] != data['human'] or random.random() < 0.2:
- if data['aid'] not in existing_data: # 检查aid是否已存在
- del data['model']
- data['label'] = data['human']
- del data['human']
- f_out.write(json.dumps(data, ensure_ascii=False) + '\n')
- existing_data.add(data['aid']) # 将新的aid添加到集合中
-
- except json.JSONDecodeError as e:
- print(f"JSON解码错误: {e}")
- print(f"错误行内容: {line.strip()}")
- except KeyError as e:
- print(f"KeyError: 键 '{e}' 不存在")
- print(f"错误行内容: {line.strip()}")
-
-# 调用函数处理数据
-input_file = 'real_test.jsonl'
-output_file = 'labeled_data.jsonl'
-process_data(input_file, output_file)
-print(f"处理完成,结果已写入 {output_file}")
-
diff --git a/dev.ts b/dev.ts
deleted file mode 100755
index fd088b1..0000000
--- a/dev.ts
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/usr/bin/env -S deno run -A --watch=static/,routes/
-
-import dev from "$fresh/dev.ts";
-import config from "./fresh.config.ts";
-
-import "$std/dotenv/load.ts";
-await dev(import.meta.url, "./main.ts", config);
diff --git a/fresh.config.ts b/fresh.config.ts
deleted file mode 100644
index 40e4820..0000000
--- a/fresh.config.ts
+++ /dev/null
@@ -1,6 +0,0 @@
-import { defineConfig } from "$fresh/server.ts";
-import tailwind from "$fresh/plugins/tailwind.ts";
-
-export default defineConfig({
- plugins: [tailwind()],
-});
diff --git a/fresh.gen.ts b/fresh.gen.ts
deleted file mode 100644
index 2e0cba4..0000000
--- a/fresh.gen.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-// DO NOT EDIT. This file is generated by Fresh.
-// This file SHOULD be checked into source version control.
-// This file is automatically updated during development when running `dev.ts`.
-
-import * as $_404 from "./routes/_404.tsx";
-import * as $_app from "./routes/_app.tsx";
-import * as $api_joke from "./routes/api/joke.ts";
-import * as $greet_name_ from "./routes/greet/[name].tsx";
-import * as $index from "./routes/index.tsx";
-import * as $Counter from "./islands/Counter.tsx";
-import type { Manifest } from "$fresh/server.ts";
-
-const manifest = {
- routes: {
- "./routes/_404.tsx": $_404,
- "./routes/_app.tsx": $_app,
- "./routes/api/joke.ts": $api_joke,
- "./routes/greet/[name].tsx": $greet_name_,
- "./routes/index.tsx": $index,
- },
- islands: {
- "./islands/Counter.tsx": $Counter,
- },
- baseUrl: import.meta.url,
-} satisfies Manifest;
-
-export default manifest;
diff --git a/islands/Counter.tsx b/islands/Counter.tsx
deleted file mode 100644
index 51b85fe..0000000
--- a/islands/Counter.tsx
+++ /dev/null
@@ -1,16 +0,0 @@
-import type { Signal } from "@preact/signals";
-import { Button } from "../components/Button.tsx";
-
-interface CounterProps {
- count: Signal;
-}
-
-export default function Counter(props: CounterProps) {
- return (
-
-
-
{props.count}
-
-
- );
-}
diff --git a/main.ts b/main.ts
deleted file mode 100644
index 675f529..0000000
--- a/main.ts
+++ /dev/null
@@ -1,13 +0,0 @@
-///
-///
-///
-///
-///
-
-import "$std/dotenv/load.ts";
-
-import { start } from "$fresh/server.ts";
-import manifest from "./fresh.gen.ts";
-import config from "./fresh.config.ts";
-
-await start(manifest, config);
diff --git a/filter/RunningLogs.txt b/ml/filter/RunningLogs.txt
similarity index 100%
rename from filter/RunningLogs.txt
rename to ml/filter/RunningLogs.txt
diff --git a/filter/checkpoint_conversion.py b/ml/filter/checkpoint_conversion.py
similarity index 100%
rename from filter/checkpoint_conversion.py
rename to ml/filter/checkpoint_conversion.py
diff --git a/filter/clean_dataset.py b/ml/filter/clean_dataset.py
similarity index 100%
rename from filter/clean_dataset.py
rename to ml/filter/clean_dataset.py
diff --git a/filter/dataset.py b/ml/filter/dataset.py
similarity index 100%
rename from filter/dataset.py
rename to ml/filter/dataset.py
diff --git a/filter/db_utils.py b/ml/filter/db_utils.py
similarity index 100%
rename from filter/db_utils.py
rename to ml/filter/db_utils.py
diff --git a/filter/embedding.py b/ml/filter/embedding.py
similarity index 100%
rename from filter/embedding.py
rename to ml/filter/embedding.py
diff --git a/filter/embedding_range.py b/ml/filter/embedding_range.py
similarity index 100%
rename from filter/embedding_range.py
rename to ml/filter/embedding_range.py
diff --git a/filter/embedding_visualization.py b/ml/filter/embedding_visualization.py
similarity index 100%
rename from filter/embedding_visualization.py
rename to ml/filter/embedding_visualization.py
diff --git a/filter/labeling_system.py b/ml/filter/labeling_system.py
similarity index 100%
rename from filter/labeling_system.py
rename to ml/filter/labeling_system.py
diff --git a/filter/model.py b/ml/filter/model.py
similarity index 100%
rename from filter/model.py
rename to ml/filter/model.py
diff --git a/filter/modelV3_10.py b/ml/filter/modelV3_10.py
similarity index 100%
rename from filter/modelV3_10.py
rename to ml/filter/modelV3_10.py
diff --git a/filter/modelV3_12.py b/ml/filter/modelV3_12.py
similarity index 100%
rename from filter/modelV3_12.py
rename to ml/filter/modelV3_12.py
diff --git a/filter/modelV3_15.py b/ml/filter/modelV3_15.py
similarity index 100%
rename from filter/modelV3_15.py
rename to ml/filter/modelV3_15.py
diff --git a/filter/modelV6_0.py b/ml/filter/modelV6_0.py
similarity index 100%
rename from filter/modelV6_0.py
rename to ml/filter/modelV6_0.py
diff --git a/filter/onnx_export.py b/ml/filter/onnx_export.py
similarity index 100%
rename from filter/onnx_export.py
rename to ml/filter/onnx_export.py
diff --git a/filter/predict.py b/ml/filter/predict.py
similarity index 100%
rename from filter/predict.py
rename to ml/filter/predict.py
diff --git a/filter/quantize.py b/ml/filter/quantize.py
similarity index 100%
rename from filter/quantize.py
rename to ml/filter/quantize.py
diff --git a/filter/tag.py b/ml/filter/tag.py
similarity index 100%
rename from filter/tag.py
rename to ml/filter/tag.py
diff --git a/filter/test.py b/ml/filter/test.py
similarity index 100%
rename from filter/test.py
rename to ml/filter/test.py
diff --git a/filter/train.py b/ml/filter/train.py
similarity index 100%
rename from filter/train.py
rename to ml/filter/train.py
diff --git a/lab/.gitignore b/ml/lab/.gitignore
similarity index 100%
rename from lab/.gitignore
rename to ml/lab/.gitignore
diff --git a/lab/align-pipeline.md b/ml/lab/align-pipeline.md
similarity index 100%
rename from lab/align-pipeline.md
rename to ml/lab/align-pipeline.md
diff --git a/lab/mmsAlignment/align2LRC.py b/ml/lab/mmsAlignment/align2LRC.py
similarity index 100%
rename from lab/mmsAlignment/align2LRC.py
rename to ml/lab/mmsAlignment/align2LRC.py
diff --git a/lab/mmsAlignment/alignWithMMS.py b/ml/lab/mmsAlignment/alignWithMMS.py
similarity index 100%
rename from lab/mmsAlignment/alignWithMMS.py
rename to ml/lab/mmsAlignment/alignWithMMS.py
diff --git a/lab/mmsAlignment/splitSong.py b/ml/lab/mmsAlignment/splitSong.py
similarity index 100%
rename from lab/mmsAlignment/splitSong.py
rename to ml/lab/mmsAlignment/splitSong.py
diff --git a/lab/utils/audio.py b/ml/lab/utils/audio.py
similarity index 100%
rename from lab/utils/audio.py
rename to ml/lab/utils/audio.py
diff --git a/lab/utils/cleanTempDir.py b/ml/lab/utils/cleanTempDir.py
similarity index 100%
rename from lab/utils/cleanTempDir.py
rename to ml/lab/utils/cleanTempDir.py
diff --git a/lab/utils/ttml.py b/ml/lab/utils/ttml.py
similarity index 100%
rename from lab/utils/ttml.py
rename to ml/lab/utils/ttml.py
diff --git a/lab/whisperAlignment/align2srt.py b/ml/lab/whisperAlignment/align2srt.py
similarity index 100%
rename from lab/whisperAlignment/align2srt.py
rename to ml/lab/whisperAlignment/align2srt.py
diff --git a/lab/whisperAlignment/alignWithGroup.py b/ml/lab/whisperAlignment/alignWithGroup.py
similarity index 100%
rename from lab/whisperAlignment/alignWithGroup.py
rename to ml/lab/whisperAlignment/alignWithGroup.py
diff --git a/lab/whisperAlignment/splitGroups.py b/ml/lab/whisperAlignment/splitGroups.py
similarity index 100%
rename from lab/whisperAlignment/splitGroups.py
rename to ml/lab/whisperAlignment/splitGroups.py
diff --git a/lab/whisperAlignment/srt2lrc.py b/ml/lab/whisperAlignment/srt2lrc.py
similarity index 100%
rename from lab/whisperAlignment/srt2lrc.py
rename to ml/lab/whisperAlignment/srt2lrc.py
diff --git a/pred/count.py b/ml/pred/count.py
similarity index 100%
rename from pred/count.py
rename to ml/pred/count.py
diff --git a/pred/crawler.py b/ml/pred/crawler.py
similarity index 100%
rename from pred/crawler.py
rename to ml/pred/crawler.py
diff --git a/pred/dataset.py b/ml/pred/dataset.py
similarity index 100%
rename from pred/dataset.py
rename to ml/pred/dataset.py
diff --git a/pred/export_onnx.py b/ml/pred/export_onnx.py
similarity index 100%
rename from pred/export_onnx.py
rename to ml/pred/export_onnx.py
diff --git a/pred/inference.py b/ml/pred/inference.py
similarity index 100%
rename from pred/inference.py
rename to ml/pred/inference.py
diff --git a/pred/model.py b/ml/pred/model.py
similarity index 100%
rename from pred/model.py
rename to ml/pred/model.py
diff --git a/pred/train.py b/ml/pred/train.py
similarity index 100%
rename from pred/train.py
rename to ml/pred/train.py
diff --git a/routes/_404.tsx b/routes/_404.tsx
deleted file mode 100644
index 4628eeb..0000000
--- a/routes/_404.tsx
+++ /dev/null
@@ -1,27 +0,0 @@
-import { Head } from "$fresh/runtime.ts";
-
-export default function Error404() {
- return (
- <>
-
- 404 - Page not found
-
-
- >
- );
-}
diff --git a/routes/_app.tsx b/routes/_app.tsx
deleted file mode 100644
index a44414e..0000000
--- a/routes/_app.tsx
+++ /dev/null
@@ -1,16 +0,0 @@
-import { type PageProps } from "$fresh/server.ts";
-export default function App({ Component }: PageProps) {
- return (
-
-
-
-
- cvsa
-
-
-
-
-
-
- );
-}
diff --git a/routes/api/joke.ts b/routes/api/joke.ts
deleted file mode 100644
index 68b0ebe..0000000
--- a/routes/api/joke.ts
+++ /dev/null
@@ -1,21 +0,0 @@
-import { FreshContext } from "$fresh/server.ts";
-
-// Jokes courtesy of https://punsandoneliners.com/randomness/programmer-jokes/
-const JOKES = [
- "Why do Java developers often wear glasses? They can't C#.",
- "A SQL query walks into a bar, goes up to two tables and says “can I join you?”",
- "Wasn't hard to crack Forrest Gump's password. 1forrest1.",
- "I love pressing the F5 key. It's refreshing.",
- "Called IT support and a chap from Australia came to fix my network connection. I asked “Do you come from a LAN down under?”",
- "There are 10 types of people in the world. Those who understand binary and those who don't.",
- "Why are assembly programmers often wet? They work below C level.",
- "My favourite computer based band is the Black IPs.",
- "What programme do you use to predict the music tastes of former US presidential candidates? An Al Gore Rhythm.",
- "An SEO expert walked into a bar, pub, inn, tavern, hostelry, public house.",
-];
-
-export const handler = (_req: Request, _ctx: FreshContext): Response => {
- const randomIndex = Math.floor(Math.random() * JOKES.length);
- const body = JOKES[randomIndex];
- return new Response(body);
-};
diff --git a/routes/greet/[name].tsx b/routes/greet/[name].tsx
deleted file mode 100644
index a7a5fe1..0000000
--- a/routes/greet/[name].tsx
+++ /dev/null
@@ -1,5 +0,0 @@
-import { PageProps } from "$fresh/server.ts";
-
-export default function Greet(props: PageProps) {
- return