update: crawling

This commit is contained in:
alikia2x (寒寒) 2025-01-24 14:35:53 +08:00
parent 8da76839db
commit febe2157cb
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
2 changed files with 51 additions and 21 deletions

View File

@ -4,7 +4,7 @@ import { getBiliBiliVideoInfo } from "./videoInfo.ts";
import { ensureDir } from "https://deno.land/std@0.113.0/fs/mod.ts";
const aidPath = "./data/2025010104_c30_aids.txt";
const db = new Database("./data/main.db");
const db = new Database("./data/main.db", { int64: true });
const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"];
const logDir = "./logs/bili-info-crawl";
const logFile = path.join(logDir, `run-${Date.now() / 1000}.log`);
@ -27,11 +27,13 @@ async function setupLogging() {
await ensureDir(logDir);
const logStream = await Deno.open(logFile, { write: true, create: true, append: true });
const redirectConsole = (originalConsole: (...args: any[]) => void) => (...args: any[]) => {
const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" ");
originalConsole(message);
logStream.write(new TextEncoder().encode(message + "\n"));
};
// Builds a replacement for a console method: each call is forwarded to the
// original method and the same text is also written to logStream.
const redirectConsole =
(originalConsole: (...args: any[]) => void) =>
(...args: any[]) => {
// Arguments whose typeof is "object" are serialized with JSON.stringify; all arguments are then joined with single spaces.
const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" ");
originalConsole(message);
// NOTE(review): the return value of write() is neither awaited nor checked here — confirm this best-effort logging is intended.
logStream.write(new TextEncoder().encode(message + "\n"));
};
console.log = redirectConsole(console.log);
console.error = redirectConsole(console.error);
@ -49,14 +51,36 @@ function isRateLimited(): boolean {
async function insertAidsToDB() {
const aidRawcontent = await Deno.readTextFile(aidPath);
const aids = aidRawcontent.split("\n").filter((line) => line.length > 0).map((line) => parseInt(line));
const aids = aidRawcontent
.split("\n")
.filter((line) => line.length > 0)
.map((line) => parseInt(line));
if (!db.prepare("SELECT COUNT(*) FROM bili_info_crawl").get()) {
const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
aids.forEach((aid) => insertStmt.run(aid));
}
// if (!db.prepare("SELECT COUNT(*) FROM bili_info_crawl").get()) {
// const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
// aids.forEach((aid) => insertStmt.run(aid));
// }
const aidsInDB = db.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
// Query the aids that already exist in the database
const existingAids = db
.prepare("SELECT aid FROM bili_info_crawl")
.all()
.map((row) => row.aid);
console.log(existingAids.length);
// Convert existingAids to a Set for faster lookups
const existingAidsSet = new Set(existingAids);
// Find the entries in the aids array that are not yet in the database
const newAids = aids.filter((aid) => !existingAidsSet.has(aid));
console.log(newAids.length);
// Insert these new entries
const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
newAids.forEach((aid) => insertStmt.run(aid));
const aidsInDB = db
.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
.all()
.map((row) => row.aid) as number[];
@ -68,18 +92,18 @@ async function insertAidsToDB() {
try {
const res = await getBiliBiliVideoInfo(aid, regions[processedAids % regions.length]);
if (res === null) {
updateAidStatus(aid, 'failed');
updateAidStatus(aid, "failed");
} else {
const rawData = JSON.parse(res);
if (rawData.code === 0) {
updateAidStatus(aid, 'success', rawData.data.View.bvid, JSON.stringify(rawData.data));
updateAidStatus(aid, "success", rawData.data.View.bvid, JSON.stringify(rawData.data));
} else {
updateAidStatus(aid, 'error', undefined, res);
updateAidStatus(aid, "error", undefined, res);
}
}
} catch (error) {
console.error(`Error updating aid ${aid}: ${error}`);
updateAidStatus(aid, 'failed');
updateAidStatus(aid, "failed");
} finally {
processedAids++;
logProgress(aid, processedAids, totalAids, startTime);
@ -108,8 +132,8 @@ function updateAidStatus(aid: number, status: string, bvid?: string, data?: stri
const stmt = db.prepare(`
UPDATE bili_info_crawl
SET status = ?,
${bvid ? 'bvid = ?,' : ''}
${data ? 'data = ?,' : ''}
${bvid ? "bvid = ?," : ""}
${data ? "data = ?," : ""}
timestamp = ?
WHERE aid = ?
`);
@ -130,7 +154,13 @@ function logProgress(aid: number, processedAids: number, totalAids: number, star
const etaMinutes = Math.floor(etaSeconds / 60);
const etaHours = Math.floor(etaMinutes / 60);
const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(2)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(elapsedSeconds % 60).toString().padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(
2
)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(
elapsedSeconds % 60
)
.toString()
.padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
console.log(`Updated aid ${aid}, ${progress}`);
}

View File

@ -47,7 +47,7 @@ async function proxyRequestWithRegion(url: string, region: string): Promise<any
}
}
catch (e){
console.error(`Error proxying requestt ${url} to ${region}: ${e}`);
console.error(`Error proxying request ${url} to ${region}: ${e}`);
return null;
}
}