update: crawling
This commit is contained in:
parent
8da76839db
commit
febe2157cb
@ -4,7 +4,7 @@ import { getBiliBiliVideoInfo } from "./videoInfo.ts";
|
||||
import { ensureDir } from "https://deno.land/std@0.113.0/fs/mod.ts";
|
||||
|
||||
const aidPath = "./data/2025010104_c30_aids.txt";
|
||||
const db = new Database("./data/main.db");
|
||||
const db = new Database("./data/main.db", { int64: true });
|
||||
const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"];
|
||||
const logDir = "./logs/bili-info-crawl";
|
||||
const logFile = path.join(logDir, `run-${Date.now() / 1000}.log`);
|
||||
@ -27,11 +27,13 @@ async function setupLogging() {
|
||||
await ensureDir(logDir);
|
||||
const logStream = await Deno.open(logFile, { write: true, create: true, append: true });
|
||||
|
||||
const redirectConsole = (originalConsole: (...args: any[]) => void) => (...args: any[]) => {
|
||||
const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" ");
|
||||
originalConsole(message);
|
||||
logStream.write(new TextEncoder().encode(message + "\n"));
|
||||
};
|
||||
const redirectConsole =
|
||||
(originalConsole: (...args: any[]) => void) =>
|
||||
(...args: any[]) => {
|
||||
const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" ");
|
||||
originalConsole(message);
|
||||
logStream.write(new TextEncoder().encode(message + "\n"));
|
||||
};
|
||||
|
||||
console.log = redirectConsole(console.log);
|
||||
console.error = redirectConsole(console.error);
|
||||
@ -49,14 +51,36 @@ function isRateLimited(): boolean {
|
||||
|
||||
async function insertAidsToDB() {
|
||||
const aidRawcontent = await Deno.readTextFile(aidPath);
|
||||
const aids = aidRawcontent.split("\n").filter((line) => line.length > 0).map((line) => parseInt(line));
|
||||
const aids = aidRawcontent
|
||||
.split("\n")
|
||||
.filter((line) => line.length > 0)
|
||||
.map((line) => parseInt(line));
|
||||
|
||||
if (!db.prepare("SELECT COUNT(*) FROM bili_info_crawl").get()) {
|
||||
const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
|
||||
aids.forEach((aid) => insertStmt.run(aid));
|
||||
}
|
||||
// if (!db.prepare("SELECT COUNT(*) FROM bili_info_crawl").get()) {
|
||||
// const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
|
||||
// aids.forEach((aid) => insertStmt.run(aid));
|
||||
// }
|
||||
|
||||
const aidsInDB = db.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
|
||||
// 查询数据库中已经存在的 aid
|
||||
const existingAids = db
|
||||
.prepare("SELECT aid FROM bili_info_crawl")
|
||||
.all()
|
||||
.map((row) => row.aid);
|
||||
console.log(existingAids.length);
|
||||
|
||||
// 将 existingAids 转换为 Set 以提高查找效率
|
||||
const existingAidsSet = new Set(existingAids);
|
||||
|
||||
// 找出 aids 数组中不存在于数据库的条目
|
||||
const newAids = aids.filter((aid) => !existingAidsSet.has(aid));
|
||||
console.log(newAids.length);
|
||||
|
||||
// 插入这些新条目
|
||||
const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
|
||||
newAids.forEach((aid) => insertStmt.run(aid));
|
||||
|
||||
const aidsInDB = db
|
||||
.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
|
||||
.all()
|
||||
.map((row) => row.aid) as number[];
|
||||
|
||||
@ -68,18 +92,18 @@ async function insertAidsToDB() {
|
||||
try {
|
||||
const res = await getBiliBiliVideoInfo(aid, regions[processedAids % regions.length]);
|
||||
if (res === null) {
|
||||
updateAidStatus(aid, 'failed');
|
||||
updateAidStatus(aid, "failed");
|
||||
} else {
|
||||
const rawData = JSON.parse(res);
|
||||
if (rawData.code === 0) {
|
||||
updateAidStatus(aid, 'success', rawData.data.View.bvid, JSON.stringify(rawData.data));
|
||||
updateAidStatus(aid, "success", rawData.data.View.bvid, JSON.stringify(rawData.data));
|
||||
} else {
|
||||
updateAidStatus(aid, 'error', undefined, res);
|
||||
updateAidStatus(aid, "error", undefined, res);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error updating aid ${aid}: ${error}`);
|
||||
updateAidStatus(aid, 'failed');
|
||||
updateAidStatus(aid, "failed");
|
||||
} finally {
|
||||
processedAids++;
|
||||
logProgress(aid, processedAids, totalAids, startTime);
|
||||
@ -108,8 +132,8 @@ function updateAidStatus(aid: number, status: string, bvid?: string, data?: stri
|
||||
const stmt = db.prepare(`
|
||||
UPDATE bili_info_crawl
|
||||
SET status = ?,
|
||||
${bvid ? 'bvid = ?,' : ''}
|
||||
${data ? 'data = ?,' : ''}
|
||||
${bvid ? "bvid = ?," : ""}
|
||||
${data ? "data = ?," : ""}
|
||||
timestamp = ?
|
||||
WHERE aid = ?
|
||||
`);
|
||||
@ -130,7 +154,13 @@ function logProgress(aid: number, processedAids: number, totalAids: number, star
|
||||
const etaMinutes = Math.floor(etaSeconds / 60);
|
||||
const etaHours = Math.floor(etaMinutes / 60);
|
||||
|
||||
const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(2)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(elapsedSeconds % 60).toString().padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
|
||||
const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(
|
||||
2
|
||||
)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(
|
||||
elapsedSeconds % 60
|
||||
)
|
||||
.toString()
|
||||
.padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
|
||||
console.log(`Updated aid ${aid}, ${progress}`);
|
||||
}
|
||||
|
||||
|
@ -47,7 +47,7 @@ async function proxyRequestWithRegion(url: string, region: string): Promise<any
|
||||
}
|
||||
}
|
||||
catch (e){
|
||||
console.error(`Error proxying requestt ${url} to ${region}: ${e}`);
|
||||
console.error(`Error proxying request ${url} to ${region}: ${e}`);
|
||||
return null;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user