ref: move ML stuff

add: .idea to VCS, the refactor guide
This commit is contained in:
alikia2x (寒寒) 2025-03-29 14:13:15 +08:00
parent 7337538f0b
commit 636c5e25cb
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
63 changed files with 132 additions and 252 deletions

11
.gitignore vendored
View File

@ -51,7 +51,6 @@ internal/
!tests/cases/projects/projectOption/**/node_modules
!tests/cases/projects/NodeModulesSearch/**/*
!tests/baselines/reference/project/nodeModules*/**/*
.idea
yarn.lock
yarn-error.log
.parallelperf.*
@ -78,10 +77,10 @@ node_modules/
# project specific
logs/
__pycache__
filter/runs
pred/runs
pred/checkpoints
data/
filter/checkpoints
ml/filter/runs
ml/pred/runs
ml/pred/checkpoints
ml/data/
ml/filter/checkpoints
scripts
model/

9
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,9 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
dataSources.xml

21
.idea/cvsa.iml Normal file
View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.tmp" />
<excludeFolder url="file://$MODULE_DIR$/temp" />
<excludeFolder url="file://$MODULE_DIR$/tmp" />
<excludeFolder url="file://$MODULE_DIR$/ml/data" />
<excludeFolder url="file://$MODULE_DIR$/doc" />
<excludeFolder url="file://$MODULE_DIR$/ml/filter/checkpoints" />
<excludeFolder url="file://$MODULE_DIR$/ml/filter/runs" />
<excludeFolder url="file://$MODULE_DIR$/ml/lab/data" />
<excludeFolder url="file://$MODULE_DIR$/ml/lab/temp" />
<excludeFolder url="file://$MODULE_DIR$/logs" />
<excludeFolder url="file://$MODULE_DIR$/model" />
<excludeFolder url="file://$MODULE_DIR$/src/db" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,12 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="GrazieInspection" enabled="false" level="GRAMMAR_ERROR" enabled_by_default="false" />
<inspection_tool class="LanguageDetectionInspection" enabled="false" level="WARNING" enabled_by_default="false" />
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
<option name="processCode" value="true" />
<option name="processLiterals" value="true" />
<option name="processComments" value="true" />
</inspection_tool>
</profile>
</component>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/cvsa.iml" filepath="$PROJECT_DIR$/.idea/cvsa.iml" />
</modules>
</component>
</project>

6
.idea/sqldialects.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="SqlDialectMappings">
<file url="PROJECT" dialect="PostgreSQL" />
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

65
README-refactor.md Normal file
View File

@ -0,0 +1,65 @@
# 项目重构方案
## 目标架构
采用 monorepo 结构管理三个独立部分:
1. `packages/crawler` - 现有爬虫功能
2. `packages/frontend` - 基于Astro的前端
3. `packages/backend` - 基于Hono的API后端
## 目录结构调整方案
### 新结构
```
.
├── packages/
│ ├── crawler/ # 爬虫组件
│ ├── frontend/ # Astro前端
│ ├── backend/ # Hono后端API
│ └── core/ # 共享代码(未来提取)
├── docs/ # 文档
├── scripts/ # 项目脚本
└── package.json # 根项目配置
```
### 具体迁移方案
#### 1. 爬虫部分(crawler)
保留以下目录/文件:
- `lib/` (除前端相关)
- `src/db/raw/`
- `src/filterWorker.ts`
- `src/worker.ts`
- `test/`
- `deno.json`
- `.gitignore`
需要移除:
- Fresh框架相关文件
- 前端组件(`components/`)
- 静态资源(`static/`)
#### 2. 前端部分(frontend)
全新创建 Astro 项目,不保留任何现有前端代码
#### 3. 后端部分(backend)
全新创建Hono项目
#### 4. 共享代码(core)
未来可从爬虫中提取以下内容到 core package:
- 数据库相关:`lib/db/`
- 消息队列:`lib/mq/`
- 网络请求:`lib/net/`
- 工具函数:`lib/utils/`
## 重构步骤建议
1. 初始化monorepo结构
2. 迁移爬虫代码到`packages/crawler`
3. 创建新的Astro项目在`packages/frontend`
4. 创建新的Hono项目在`packages/backend`
5. 逐步提取共享代码到`packages/core`
## 注意事项
- 机器学习相关代码已统一移至`ml/`目录(`ml/pred/`, `ml/filter/`, `ml/lab/`),其内部结构保持现状
- 文档(`doc/`)可以迁移到`docs/`目录
- 需要更新CI/CD流程支持monorepo

View File

@ -1,12 +0,0 @@
import { JSX } from "preact";
import { IS_BROWSER } from "$fresh/runtime.ts";
/**
 * Styled `<button>` wrapper.
 *
 * Every received attribute is spread onto the element first; `disabled` and
 * `class` are set afterwards, so they take precedence over values of the same
 * name supplied by the caller.
 */
export function Button(props: JSX.HTMLAttributes<HTMLButtonElement>) {
  // Disabled whenever IS_BROWSER is false, or when the caller requested it.
  const isDisabled = !IS_BROWSER || props.disabled;
  return (
    <button
      {...props}
      disabled={isDisabled}
      class="px-2 py-1 border-gray-500 border-2 rounded bg-white hover:bg-gray-200 transition-colors"
    />
  );
}

View File

@ -1,55 +0,0 @@
import json
import random
def process_data(input_file, output_file):
    """
    Append relabelled samples from ``input_file`` to ``output_file``.

    Each line of ``input_file`` is parsed as a JSON object. A record is
    selected when its ``model`` and ``human`` values differ, or otherwise with
    probability 0.2. A selected record has its ``model`` key removed and its
    ``human`` key renamed to ``label`` before being appended to
    ``output_file`` as one JSON line.

    Before anything is written, the ``aid`` of every record already present in
    ``output_file`` is collected; a selected record whose ``aid`` is already
    known (from the file or from earlier in this run) is skipped.

    Args:
        input_file (str): Path of the JSONL file to read.
        output_file (str): Path of the JSONL file to append to. It may not
            exist yet.
    """
    # Collect the aids already stored in output_file for de-duplication.
    existing_data = set()
    try:
        with open(output_file, 'r', encoding='utf-8') as f_out:
            for line in f_out:
                try:
                    existing_data.add(json.loads(line)['aid'])
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Best-effort load: skip lines that are not valid JSON,
                    # are not objects, or carry no usable 'aid', instead of
                    # aborting the whole run.
                    continue
    except FileNotFoundError:
        pass  # No output file yet, so nothing to de-duplicate against.

    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'a', encoding='utf-8') as f_out:
        for line in f_in:
            try:
                data = json.loads(line)
                # Keep every disagreement, plus a random 20% of agreements.
                if data['model'] != data['human'] or random.random() < 0.2:
                    if data['aid'] not in existing_data:
                        del data['model']
                        data['label'] = data.pop('human')
                        f_out.write(json.dumps(data, ensure_ascii=False) + '\n')
                        # Remember the aid so later duplicates are skipped.
                        existing_data.add(data['aid'])
            except json.JSONDecodeError as e:
                print(f"JSON解码错误: {e}")
                print(f"错误行内容: {line.strip()}")
            except KeyError as e:
                # str(KeyError) is already quoted; use the raw key so the
                # message does not show doubled quotes.
                print(f"KeyError: 键 '{e.args[0]}' 不存在")
                print(f"错误行内容: {line.strip()}")
# Script driver: relabel the samples in the fixed input file and append them
# to the fixed output file, then report completion on stdout.
input_file = 'real_test.jsonl'
output_file = 'labeled_data.jsonl'
process_data(input_file, output_file)
print(f"处理完成,结果已写入 {output_file}")

7
dev.ts
View File

@ -1,7 +0,0 @@
#!/usr/bin/env -S deno run -A --watch=static/,routes/
import dev from "$fresh/dev.ts";
import config from "./fresh.config.ts";
import "$std/dotenv/load.ts";
// Call Fresh's `dev` entry point with this module's URL, the application
// entry file and the project configuration.
await dev(import.meta.url, "./main.ts", config);

View File

@ -1,6 +0,0 @@
import { defineConfig } from "$fresh/server.ts";
import tailwind from "$fresh/plugins/tailwind.ts";
// Fresh project configuration: the Tailwind plugin is the only plugin registered.
const plugins = [tailwind()];

export default defineConfig({ plugins });

View File

@ -1,27 +0,0 @@
// DO NOT EDIT. This file is generated by Fresh.
// This file SHOULD be checked into source version control.
// This file is automatically updated during development when running `dev.ts`.
import * as $_404 from "./routes/_404.tsx";
import * as $_app from "./routes/_app.tsx";
import * as $api_joke from "./routes/api/joke.ts";
import * as $greet_name_ from "./routes/greet/[name].tsx";
import * as $index from "./routes/index.tsx";
import * as $Counter from "./islands/Counter.tsx";
import type { Manifest } from "$fresh/server.ts";
// Generated manifest: maps each route and island module path to the module
// namespace imported under that same path, and records this file's URL as
// `baseUrl`. The object is checked against Fresh's `Manifest` type.
const manifest = {
routes: {
"./routes/_404.tsx": $_404,
"./routes/_app.tsx": $_app,
"./routes/api/joke.ts": $api_joke,
"./routes/greet/[name].tsx": $greet_name_,
"./routes/index.tsx": $index,
},
islands: {
"./islands/Counter.tsx": $Counter,
},
baseUrl: import.meta.url,
} satisfies Manifest;
export default manifest;

View File

@ -1,16 +0,0 @@
import type { Signal } from "@preact/signals";
import { Button } from "../components/Button.tsx";
/** Props accepted by `Counter`. */
interface CounterProps {
  /** Signal holding the current count; the buttons write to its `value`. */
  count: Signal<number>;
}

/**
 * Shows the current count between a "-1" and a "+1" button. Each button
 * adjusts `props.count.value` by one when clicked.
 */
export default function Counter(props: CounterProps) {
  const decrement = () => props.count.value -= 1;
  const increment = () => props.count.value += 1;

  return (
    <div class="flex gap-8 py-6">
      <Button onClick={decrement}>-1</Button>
      <p class="text-3xl tabular-nums">{props.count}</p>
      <Button onClick={increment}>+1</Button>
    </div>
  );
}
}

13
main.ts
View File

@ -1,13 +0,0 @@
/// <reference no-default-lib="true" />
/// <reference lib="dom" />
/// <reference lib="dom.iterable" />
/// <reference lib="dom.asynciterable" />
/// <reference lib="deno.ns" />
import "$std/dotenv/load.ts";
import { start } from "$fresh/server.ts";
import manifest from "./fresh.gen.ts";
import config from "./fresh.config.ts";
// Start the Fresh server with the generated manifest and the project config.
await start(manifest, config);

View File

View File

@ -1,27 +0,0 @@
import { Head } from "$fresh/runtime.ts";
/**
 * "Page not found" view: places a `<title>` inside Fresh's `Head` and renders
 * the logo, a heading, a short explanation and a link back to "/".
 */
export default function Error404() {
  const pageHead = (
    <Head>
      <title>404 - Page not found</title>
    </Head>
  );

  const logo = (
    <img
      class="my-6"
      src="/logo.svg"
      width="128"
      height="128"
      alt="the Fresh logo: a sliced lemon dripping with juice"
    />
  );

  return (
    <>
      {pageHead}
      <div class="px-4 py-8 mx-auto bg-[#86efac]">
        <div class="max-w-screen-md mx-auto flex flex-col items-center justify-center">
          {logo}
          <h1 class="text-4xl font-bold">404 - Page not found</h1>
          <p class="my-4">The page you were looking for doesn't exist.</p>
          <a href="/" class="underline">Go back home</a>
        </div>
      </div>
    </>
  );
}

View File

@ -1,16 +0,0 @@
import { type PageProps } from "$fresh/server.ts";
/**
 * HTML document shell: a static `<head>` (charset, viewport, title and
 * stylesheet link) and a `<body>` that renders the `Component` received
 * through the page props.
 */
export default function App(props: PageProps) {
  const Page = props.Component;

  return (
    <html>
      <head>
        <meta charset="utf-8" />
        <meta name="viewport" content="width=device-width, initial-scale=1.0" />
        <title>cvsa</title>
        <link rel="stylesheet" href="/styles.css" />
      </head>
      <body>
        <Page />
      </body>
    </html>
  );
}

View File

@ -1,21 +0,0 @@
import { FreshContext } from "$fresh/server.ts";
// Jokes courtesy of https://punsandoneliners.com/randomness/programmer-jokes/
/** Pool of one-liners served by `handler`. */
const JOKES = [
  "Why do Java developers often wear glasses? They can't C#.",
  "A SQL query walks into a bar, goes up to two tables and says “can I join you?”",
  "Wasn't hard to crack Forrest Gump's password. 1forrest1.",
  "I love pressing the F5 key. It's refreshing.",
  "Called IT support and a chap from Australia came to fix my network connection. I asked “Do you come from a LAN down under?”",
  "There are 10 types of people in the world. Those who understand binary and those who don't.",
  "Why are assembly programmers often wet? They work below C level.",
  "My favourite computer based band is the Black IPs.",
  "What programme do you use to predict the music tastes of former US presidential candidates? An Al Gore Rhythm.",
  "An SEO expert walked into a bar, pub, inn, tavern, hostelry, public house.",
];

/**
 * Responds with one entry of `JOKES`, chosen via `Math.random()`, as the
 * response body. Neither the request nor the context is inspected.
 */
export const handler = (_req: Request, _ctx: FreshContext): Response => {
  const chosen = JOKES[Math.floor(Math.random() * JOKES.length)];
  return new Response(chosen);
};

View File

@ -1,5 +0,0 @@
import { PageProps } from "$fresh/server.ts";
/** Renders "Hello " followed by `props.params.name` inside a `<div>`. */
export default function Greet(props: PageProps) {
  const { name } = props.params;
  return <div>Hello {name}</div>;
}

View File

@ -1,25 +0,0 @@
import { useSignal } from "@preact/signals";
import Counter from "../islands/Counter.tsx";
/**
 * Landing page: logo, welcome heading, a hint about editing the index route,
 * and a `Counter` bound to a signal that starts at 3.
 */
export default function Home() {
  const count = useSignal(3);

  const logo = (
    <img
      class="my-6"
      src="/logo.svg"
      width="128"
      height="128"
      alt="the Fresh logo: a sliced lemon dripping with juice"
    />
  );

  return (
    <div class="px-4 py-8 mx-auto bg-[#86efac]">
      <div class="max-w-screen-md mx-auto flex flex-col items-center justify-center">
        {logo}
        <h1 class="text-4xl font-bold">Welcome to Fresh</h1>
        <p class="my-4">
          Try updating this message in the
          <code class="mx-2">./routes/index.tsx</code> file, and refresh.
        </p>
        <Counter count={count} />
      </div>
    </div>
  );
}
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 22 KiB

View File

@ -1,6 +0,0 @@
<svg width="40" height="40" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M34.092 8.845C38.929 20.652 34.092 27 30 30.5c1 3.5-2.986 4.222-4.5 2.5-4.457 1.537-13.512 1.487-20-5C2 24.5 4.73 16.714 14 11.5c8-4.5 16-7 20.092-2.655Z" fill="#FFDB1E"/>
<path d="M14 11.5c6.848-4.497 15.025-6.38 18.368-3.47C37.5 12.5 21.5 22.612 15.5 25c-6.5 2.587-3 8.5-6.5 8.5-3 0-2.5-4-5.183-7.75C2.232 23.535 6.16 16.648 14 11.5Z" fill="#fff" stroke="#FFDB1E"/>
<path d="M28.535 8.772c4.645 1.25-.365 5.695-4.303 8.536-3.732 2.692-6.606 4.21-7.923 4.83-.366.173-1.617-2.252-1.617-1 0 .417-.7 2.238-.934 2.326-1.365.512-4.223 1.29-5.835 1.29-3.491 0-1.923-4.754 3.014-9.122.892-.789 1.478-.645 2.283-.645-.537-.773-.534-.917.403-1.546C17.79 10.64 23 8.77 25.212 8.42c.366.014.82.35.82.629.41-.14 2.095-.388 2.503-.278Z" fill="#FFE600"/>
<path d="M14.297 16.49c.985-.747 1.644-1.01 2.099-2.526.566.121.841-.08 1.29-.701.324.466 1.657.608 2.453.701-.715.451-1.057.852-1.452 2.106-1.464-.611-3.167-.302-4.39.42Z" fill="#fff"/>
</svg>

Before

Width:  |  Height:  |  Size: 1.0 KiB

View File

@ -1,3 +0,0 @@
@tailwind base;
@tailwind components;
@tailwind utilities;

View File

@ -1,7 +0,0 @@
import { type Config } from "tailwindcss";
// Tailwind configuration: the single glob below lists the source files
// Tailwind is pointed at (routes, islands and components).
const content = [
  "{routes,islands,components}/**/*.{ts,tsx,js,jsx}",
];

export default { content } satisfies Config;