From a1a4abff463a895fda703f9f7bb824c71895252c Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Sun, 1 Jun 2025 21:31:37 +0800
Subject: [PATCH] fix: missing import in @cvsa/core

---
 packages/backend/lib/auth/getJWTsecret.ts     |   5 +-
 packages/backend/middleware/rateLimiters.ts   |   3 +-
 .../backend/routes/captcha/difficulty/GET.ts  |   5 +-
 packages/core/package.json                    |   2 +-
 packages/core/types.d.ts                      |   1 +
 packages/crawler/ml/benchmark.ts              | 179 ------------------
 packages/crawler/ml/quant_benchmark.ts        | 171 -----------------
 packages/crawler/mq/scheduling.ts             |  13 +-
 .../mq/task/regularSnapshotInterval.ts        |   4 +-
 packages/next/bun.lock                        |   4 +-
 packages/next/package.json                    |   2 +-
 11 files changed, 21 insertions(+), 368 deletions(-)
 delete mode 100644 packages/crawler/ml/benchmark.ts
 delete mode 100644 packages/crawler/ml/quant_benchmark.ts

diff --git a/packages/backend/lib/auth/getJWTsecret.ts b/packages/backend/lib/auth/getJWTsecret.ts
index 9388892..7c30faa 100644
--- a/packages/backend/lib/auth/getJWTsecret.ts
+++ b/packages/backend/lib/auth/getJWTsecret.ts
@@ -5,9 +5,10 @@ export const getJWTsecret = () => {
 	if (!secret) {
 		const response: ErrorResponse = {
 			message: "JWT_SECRET is not set",
-			code: "SERVER_ERROR"
+			code: "SERVER_ERROR",
+			errors: []
 		};
 		return [response, true];
 	}
 	return [secret, null];
-}
\ No newline at end of file
+};
diff --git a/packages/backend/middleware/rateLimiters.ts b/packages/backend/middleware/rateLimiters.ts
index dc18387..00531f7 100644
--- a/packages/backend/middleware/rateLimiters.ts
+++ b/packages/backend/middleware/rateLimiters.ts
@@ -43,7 +43,8 @@ export const registerRateLimiter = async (c: Context, nex
 	if (!allowed) {
 		const response: ErrorResponse = {
 			message: `Too many requests, please retry after ${Math.round(retryAfter)} seconds.`,
-			code: "RATE_LIMIT_EXCEEDED"
+			code: "RATE_LIMIT_EXCEEDED",
+			errors: []
 		};
 		return c.json(response, 429);
 	}
diff --git a/packages/backend/routes/captcha/difficulty/GET.ts b/packages/backend/routes/captcha/difficulty/GET.ts
index 0bed7cc..ce1d339 100644
--- a/packages/backend/routes/captcha/difficulty/GET.ts
+++ b/packages/backend/routes/captcha/difficulty/GET.ts
@@ -16,12 +16,13 @@ export const getCaptchaDifficultyHandler = createHandlers(async (c) => {
 		if (!difficulty) {
 			const response: ErrorResponse = {
 				code: "ENTITY_NOT_FOUND",
-				message: "No difficulty configs found for this route."
+				message: "No difficulty configs found for this route.",
+				errors: []
 			};
 			return c.json<ErrorResponse>(response, 404);
 		}
 		return c.json({
-			"difficulty": difficulty
+			difficulty: difficulty
 		});
 	} catch (e: unknown) {
 		if (e instanceof ValidationError) {
diff --git a/packages/core/package.json b/packages/core/package.json
index 89fa7df..9121312 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -1,7 +1,7 @@
 {
 	"name": "@cvsa/core",
 	"private": false,
-	"version": "0.0.1",
+	"version": "0.0.2",
 	"scripts": {
 		"test": "bun --env-file=.env.test run vitest",
 		"build": "bun build ./index.ts --target node --outdir ./dist"
diff --git a/packages/core/types.d.ts b/packages/core/types.d.ts
index cda935c..24322bb 100644
--- a/packages/core/types.d.ts
+++ b/packages/core/types.d.ts
@@ -1 +1,2 @@
 export * from "./db/schema";
+export * from "./index";
diff --git a/packages/crawler/ml/benchmark.ts b/packages/crawler/ml/benchmark.ts
deleted file mode 100644
index 3fc76ac..0000000
--- a/packages/crawler/ml/benchmark.ts
+++ /dev/null
@@ -1,179 +0,0 @@
-import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers";
-import * as ort from "onnxruntime";
-
-function softmax(logits: Float32Array): number[] {
-	const maxLogit = Math.max(...logits);
-	const exponents = logits.map((logit) => Math.exp(logit - maxLogit));
-	const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0);
-	return Array.from(exponents.map((exp) => exp / sumOfExponents));
-}
-
-// Configuration parameters
-const sentenceTransformerModelName = "alikia2x/jina-embedding-v3-m2v-1024";
-const onnxClassifierPath = "./model/video_classifier_v3_17.onnx";
-const onnxEmbeddingPath = "./model/embedding_original.onnx";
-const testDataPath = "./data/filter/test1.jsonl";
-
-// Initialize the sessions
-const [sessionClassifier, sessionEmbedding] = await Promise.all([
-	ort.InferenceSession.create(onnxClassifierPath),
-	ort.InferenceSession.create(onnxEmbeddingPath),
-]);
-
-let tokenizer: PreTrainedTokenizer;
-
-// Initialize the tokenizer
-async function loadTokenizer() {
-	const tokenizerConfig = { local_files_only: true };
-	tokenizer = await AutoTokenizer.from_pretrained(sentenceTransformerModelName, tokenizerConfig);
-}
-
-// New embedding generation function (using ONNX)
-async function getONNXEmbeddings(texts: string[], session: ort.InferenceSession): Promise<number[]> {
-	const { input_ids } = await tokenizer(texts, {
-		add_special_tokens: false,
-		return_tensor: false,
-	});
-
-	// Build the input parameters
-	const cumsum = (arr: number[]): number[] =>
-		arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []);
-
-	const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string) => x.length))];
-	const flattened_input_ids = input_ids.flat();
-
-	// Prepare the ONNX inputs
-	const inputs = {
-		input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [
-			flattened_input_ids.length,
-		]),
-		offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]),
-	};
-
-	// Run inference
-	const { embeddings } = await session.run(inputs);
-	return Array.from(embeddings.data as Float32Array);
-}
-
-// Classification inference function
-async function runClassification(embeddings: number[]): Promise<number[]> {
-	const inputTensor = new ort.Tensor(
-		Float32Array.from(embeddings),
-		[1, 3, 1024],
-	);
-
-	const { logits } = await sessionClassifier.run({ channel_features: inputTensor });
-	return softmax(logits.data as Float32Array);
-}
-
-// Metric calculation function
-function calculateMetrics(labels: number[], predictions: number[], elapsedTime: number): {
-	accuracy: number;
-	precision: number;
-	recall: number;
-	f1: number;
-	"Class 0 Prec": number;
-	speed: string;
-} {
-	// Log the indices where the label and the prediction disagree
-	const arr = [];
-	for (let i = 0; i < labels.length; i++) {
-		if (labels[i] !== predictions[i] && predictions[i] == 0) {
-			arr.push([i + 1, labels[i], predictions[i]]);
-		}
-	}
-	console.log(arr);
-	// Initialize the confusion matrix
-	const classCount = Math.max(...labels, ...predictions) + 1;
-	const matrix = Array.from({ length: classCount }, () => Array.from({ length: classCount }, () => 0));
-
-	// Fill the matrix
-	labels.forEach((trueLabel, i) => {
-		matrix[trueLabel][predictions[i]]++;
-	});
-
-	// Compute the metrics
-	let totalTP = 0, totalFP = 0, totalFN = 0;
-
-	for (let c = 0; c < classCount; c++) {
-		const TP = matrix[c][c];
-		const FP = matrix.flatMap((row, i) => i === c ? [] : [row[c]]).reduce((a, b) => a + b, 0);
-		const FN = matrix[c].filter((_, i) => i !== c).reduce((a, b) => a + b, 0);
-
-		totalTP += TP;
-		totalFP += FP;
-		totalFN += FN;
-	}
-
-	const precision = totalTP / (totalTP + totalFP);
-	const recall = totalTP / (totalTP + totalFN);
-	const f1 = 2 * (precision * recall) / (precision + recall) || 0;
-
-	// Compute Class 0 precision
-	const class0TP = matrix[0][0];
-	const class0FP = matrix.flatMap((row, i) => i === 0 ? [] : [row[0]]).reduce((a, b) => a + b, 0);
-	const class0Precision = class0TP / (class0TP + class0FP) || 0;
-
-	return {
-		accuracy: labels.filter((l, i) => l === predictions[i]).length / labels.length,
-		precision,
-		recall,
-		f1,
-		speed: `${(labels.length / (elapsedTime / 1000)).toFixed(1)} samples/sec`,
-		"Class 0 Prec": class0Precision,
-	};
-}
-
-// Reworked evaluation function
-async function evaluateModel(session: ort.InferenceSession): Promise<{
-	accuracy: number;
-	precision: number;
-	recall: number;
-	f1: number;
-	"Class 0 Prec": number;
-}> {
-	const data = await Deno.readTextFile(testDataPath);
-	const samples = data.split("\n")
-		.map((line) => {
-			try {
-				return JSON.parse(line);
-			} catch {
-				return null;
-			}
-		})
-		.filter(Boolean);
-
-	const allPredictions: number[] = [];
-	const allLabels: number[] = [];
-
-	const t = new Date().getTime();
-	for (const sample of samples) {
-		try {
-			const embeddings = await getONNXEmbeddings([
-				sample.title,
-				sample.description,
-				sample.tags.join(","),
-			], session);
-
-			const probabilities = await runClassification(embeddings);
-			allPredictions.push(probabilities.indexOf(Math.max(...probabilities)));
-			allLabels.push(sample.label);
-		} catch (error) {
-			console.error("Processing error:", error);
-		}
-	}
-	const elapsed = new Date().getTime() - t;
-
-	return calculateMetrics(allLabels, allPredictions, elapsed);
-}
-
-// Main function
-async function main() {
-	await loadTokenizer();
-
-	const metrics = await evaluateModel(sessionEmbedding);
-	console.log("Model Metrics:");
-	console.table(metrics);
-}
-
-await main();
diff --git a/packages/crawler/ml/quant_benchmark.ts b/packages/crawler/ml/quant_benchmark.ts
deleted file mode 100644
index aab6308..0000000
--- a/packages/crawler/ml/quant_benchmark.ts
+++ /dev/null
@@ -1,171 +0,0 @@
-import { AutoTokenizer, PreTrainedTokenizer } from "@huggingface/transformers";
-import * as ort from "onnxruntime";
-
-function softmax(logits: Float32Array): number[] {
-	const maxLogit = Math.max(...logits);
-	const exponents = logits.map((logit) => Math.exp(logit - maxLogit));
-	const sumOfExponents = exponents.reduce((sum, exp) => sum + exp, 0);
-	return Array.from(exponents.map((exp) => exp / sumOfExponents));
-}
-
-// Configuration parameters
-const sentenceTransformerModelName = "alikia2x/jina-embedding-v3-m2v-1024";
-const onnxClassifierPath = "./model/video_classifier_v3_11.onnx";
-const onnxEmbeddingOriginalPath = "./model/embedding_original.onnx";
-const onnxEmbeddingQuantizedPath = "./model/embedding_original.onnx";
-
-// Initialize the sessions
-const [sessionClassifier, sessionEmbeddingOriginal, sessionEmbeddingQuantized] = await Promise.all([
-	ort.InferenceSession.create(onnxClassifierPath),
-	ort.InferenceSession.create(onnxEmbeddingOriginalPath),
-	ort.InferenceSession.create(onnxEmbeddingQuantizedPath),
-]);
-
-let tokenizer: PreTrainedTokenizer;
-
-// Initialize the tokenizer
-async function loadTokenizer() {
-	const tokenizerConfig = { local_files_only: true };
-	tokenizer = await AutoTokenizer.from_pretrained(sentenceTransformerModelName, tokenizerConfig);
-}
-
-// New embedding generation function (using ONNX)
-async function getONNXEmbeddings(texts: string[], session: ort.InferenceSession): Promise<number[]> {
-	const { input_ids } = await tokenizer(texts, {
-		add_special_tokens: false,
-		return_tensor: false,
-	});
-
-	// Build the input parameters
-	const cumsum = (arr: number[]): number[] =>
-		arr.reduce((acc: number[], num: number, i: number) => [...acc, num + (acc[i - 1] || 0)], []);
-
-	const offsets: number[] = [0, ...cumsum(input_ids.slice(0, -1).map((x: string) => x.length))];
-	const flattened_input_ids = input_ids.flat();
-
-	// Prepare the ONNX inputs
-	const inputs = {
-		input_ids: new ort.Tensor("int64", new BigInt64Array(flattened_input_ids.map(BigInt)), [
-			flattened_input_ids.length,
-		]),
-		offsets: new ort.Tensor("int64", new BigInt64Array(offsets.map(BigInt)), [offsets.length]),
-	};
-
-	// Run inference
-	const { embeddings } = await session.run(inputs);
-	return Array.from(embeddings.data as Float32Array);
-}
-
-// Classification inference function
-async function runClassification(embeddings: number[]): Promise<number[]> {
-	const inputTensor = new ort.Tensor(
-		Float32Array.from(embeddings),
-		[1, 4, 1024],
-	);
-
-	const { logits } = await sessionClassifier.run({ channel_features: inputTensor });
-	return softmax(logits.data as Float32Array);
-}
-
-// Metric calculation function
-function calculateMetrics(labels: number[], predictions: number[], elapsedTime: number): {
-	accuracy: number;
-	precision: number;
-	recall: number;
-	f1: number;
-	speed: string;
-} {
-	// Initialize the confusion matrix
-	const classCount = Math.max(...labels, ...predictions) + 1;
-	const matrix = Array.from({ length: classCount }, () => Array.from({ length: classCount }, () => 0));
-
-	// Fill the matrix
-	labels.forEach((trueLabel, i) => {
-		matrix[trueLabel][predictions[i]]++;
-	});
-
-	// Compute the metrics
-	let totalTP = 0, totalFP = 0, totalFN = 0;
-
-	for (let c = 0; c < classCount; c++) {
-		const TP = matrix[c][c];
-		const FP = matrix.flatMap((row, i) => i === c ? [] : [row[c]]).reduce((a, b) => a + b, 0);
-		const FN = matrix[c].filter((_, i) => i !== c).reduce((a, b) => a + b, 0);
-
-		totalTP += TP;
-		totalFP += FP;
-		totalFN += FN;
-	}
-
-	const precision = totalTP / (totalTP + totalFP);
-	const recall = totalTP / (totalTP + totalFN);
-	const f1 = 2 * (precision * recall) / (precision + recall) || 0;
-
-	return {
-		accuracy: labels.filter((l, i) => l === predictions[i]).length / labels.length,
-		precision,
-		recall,
-		f1,
-		speed: `${(labels.length / (elapsedTime / 1000)).toFixed(1)} samples/sec`,
-	};
-}
-
-// Reworked evaluation function
-async function evaluateModel(session: ort.InferenceSession): Promise<{
-	accuracy: number;
-	precision: number;
-	recall: number;
-	f1: number;
-}> {
-	const data = await Deno.readTextFile("./data/filter/test1.jsonl");
-	const samples = data.split("\n")
-		.map((line) => {
-			try {
-				return JSON.parse(line);
-			} catch {
-				return null;
-			}
-		})
-		.filter(Boolean);
-
-	const allPredictions: number[] = [];
-	const allLabels: number[] = [];
-
-	const t = new Date().getTime();
-	for (const sample of samples) {
-		try {
-			const embeddings = await getONNXEmbeddings([
-				sample.title,
-				sample.description,
-				sample.tags.join(","),
-				sample.author_info,
-			], session);
-
-			const probabilities = await runClassification(embeddings);
-			allPredictions.push(probabilities.indexOf(Math.max(...probabilities)));
-			allLabels.push(sample.label);
-		} catch (error) {
-			console.error("Processing error:", error);
-		}
-	}
-	const elapsed = new Date().getTime() - t;
-
-	return calculateMetrics(allLabels, allPredictions, elapsed);
-}
-
-// Main function
-async function main() {
-	await loadTokenizer();
-
-	// Evaluate the original model
-	const originalMetrics = await evaluateModel(sessionEmbeddingOriginal);
-	console.log("Original Model Metrics:");
-	console.table(originalMetrics);
-
-	// Evaluate the quantized model
-	const quantizedMetrics = await evaluateModel(sessionEmbeddingQuantized);
-	console.log("Quantized Model Metrics:");
-	console.table(quantizedMetrics);
-}
-
-await main();
diff --git a/packages/crawler/mq/scheduling.ts b/packages/crawler/mq/scheduling.ts
index cf7427f..3d25515 100644
--- a/packages/crawler/mq/scheduling.ts
+++ b/packages/crawler/mq/scheduling.ts
@@ -2,7 +2,7 @@ import { findClosestSnapshot, getLatestSnapshot, hasAtLeast2Snapshots } from "db
 import { truncate } from "utils/truncate.ts";
 import { closetMilestone } from "./exec/snapshotTick.ts";
 import { HOUR, MINUTE } from "@core/const/time.ts";
-import type { Psql } from "@core/db/global.d.ts";
+import type { Psql } from "@core/db/psql.d.ts";
 
 const log = (value: number, base: number = 10) => Math.log(value) / Math.log(base);
 
@@ -12,13 +12,12 @@ const getFactor = (x: number) => {
 	const c = 100;
 	const u = 0.601;
 	const g = 455;
-	if (x>g) {
-		return log(b/log(x+1),a);
+	if (x > g) {
+		return log(b / log(x + 1), a);
+	} else {
+		return log(b / log(x + c), a) + u;
 	}
-	else {
-		return log(b/log(x+c),a)+u;
-	}
-}
+};
 
 /*
 * Returns the minimum ETA in hours for the next snapshot
diff --git a/packages/crawler/mq/task/regularSnapshotInterval.ts b/packages/crawler/mq/task/regularSnapshotInterval.ts
index 852d401..e7db224 100644
--- a/packages/crawler/mq/task/regularSnapshotInterval.ts
+++ b/packages/crawler/mq/task/regularSnapshotInterval.ts
@@ -1,6 +1,6 @@
 import { findClosestSnapshot, findSnapshotBefore, getLatestSnapshot } from "db/snapshotSchedule.ts";
 import { HOUR } from "@core/const/time.ts";
-import type { Psql } from "@core/db/global.d.ts";
+import type { Psql } from "@core/db/psql";
 
 export const getRegularSnapshotInterval = async (sql: Psql, aid: number) => {
 	const now = Date.now();
@@ -14,7 +14,7 @@ export const getRegularSnapshotInterval = async (sql: Psql, aid: number) => {
 	if (hoursDiff < 8) return 24;
 	const viewsDiff = latestSnapshot.views - oldSnapshot.views;
 	if (viewsDiff === 0) return 72;
-	const speedPerDay = viewsDiff / (hoursDiff + 0.001) * 24;
+	const speedPerDay = (viewsDiff / (hoursDiff + 0.001)) * 24;
 	if (speedPerDay < 6) return 36;
 	if (speedPerDay < 120) return 24;
 	if (speedPerDay < 320) return 12;
diff --git a/packages/next/bun.lock b/packages/next/bun.lock
index fa574a2..dfcb802 100644
--- a/packages/next/bun.lock
+++ b/packages/next/bun.lock
@@ -4,7 +4,7 @@
     "": {
       "name": "next",
       "dependencies": {
-        "@cvsa/core": "^0.0.1",
+        "@cvsa/core": "^0.0.2",
         "axios": "^1.9.0",
         "framer-motion": "^12.15.0",
         "i18next": "^25.2.1",
@@ -34,7 +34,7 @@
 
     "@colors/colors": ["@colors/colors@1.6.0", "", {}, "sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA=="],
 
-    "@cvsa/core": ["@cvsa/core@0.0.1", "", { "dependencies": { "@koshnic/ratelimit": "^1.0.3", "chalk": "^5.4.1", "ioredis": "^5.6.1", "logform": "^2.7.0", "postgres": "^3.4.5", "winston": "^3.17.0" } }, "sha512-h7p2AHcvdIA7GCJq4k1sOSGGbs/qjdHa4WlcCh6p1rVgpkpXp6v1Q9lvXca3uqAkInwzXctDSGwKiQp65K5XOg=="],
+    "@cvsa/core": ["@cvsa/core@0.0.2", "", { "dependencies": { "@koshnic/ratelimit": "^1.0.3", "chalk": "^5.4.1", "ioredis": "^5.6.1", "logform": "^2.7.0", "postgres": "^3.4.5", "winston": "^3.17.0" } }, "sha512-SKiFZYk3+DUCx31R+yFlcMb9S6tbdQdSSV2H+cPNgmCoOcEbBcZvB99iG4vy7wpKOcPy1bDVvUEmIMo3nIxCbQ=="],
 
     "@dabh/diagnostics": ["@dabh/diagnostics@2.0.3", "", { "dependencies": { "colorspace": "1.1.x", "enabled": "2.0.x", "kuler": "^2.0.0" } }, "sha512-hrlQOIi7hAfzsMqlGSFyVucrx38O+j6wiGOf//H2ecvIEqYN4ADBSS2iLMh5UFyDunCNniUIPk/q3riFv45xRA=="],
 
diff --git a/packages/next/package.json b/packages/next/package.json
index 9741dcf..a46a3fb 100644
--- a/packages/next/package.json
+++ b/packages/next/package.json
@@ -10,7 +10,7 @@
 		"format": "prettier --write ."
 	},
 	"dependencies": {
-		"@cvsa/core": "^0.0.1",
+		"@cvsa/core": "^0.0.2",
 		"axios": "^1.9.0",
 		"framer-motion": "^12.15.0",
 		"i18next": "^25.2.1",
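
Note (illustrative, not part of the commit): the heart of the fix is the single added line in packages/core/types.d.ts, export * from "./index";. Before it, the type declaration entry of @cvsa/core only re-exported ./db/schema, so types declared in the package index were presumably not visible to downstream packages such as packages/next and packages/backend. A minimal consumer-side sketch of what the re-export enables, assuming ErrorResponse is one of the types exported from that index (the object shape mirrors the error bodies constructed in the handlers patched in this commit):

    import type { ErrorResponse } from "@cvsa/core";

    // Mirrors the patched handlers, which now populate an errors array
    // alongside code and message when building an error body.
    const notFound: ErrorResponse = {
        code: "ENTITY_NOT_FOUND",
        message: "No difficulty configs found for this route.",
        errors: []
    };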