update: crawling
This commit is contained in:
parent
8da76839db
commit
febe2157cb
@ -4,7 +4,7 @@ import { getBiliBiliVideoInfo } from "./videoInfo.ts";
|
|||||||
import { ensureDir } from "https://deno.land/std@0.113.0/fs/mod.ts";
|
import { ensureDir } from "https://deno.land/std@0.113.0/fs/mod.ts";
|
||||||
|
|
||||||
const aidPath = "./data/2025010104_c30_aids.txt";
|
const aidPath = "./data/2025010104_c30_aids.txt";
|
||||||
const db = new Database("./data/main.db");
|
const db = new Database("./data/main.db", { int64: true });
|
||||||
const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"];
|
const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"];
|
||||||
const logDir = "./logs/bili-info-crawl";
|
const logDir = "./logs/bili-info-crawl";
|
||||||
const logFile = path.join(logDir, `run-${Date.now() / 1000}.log`);
|
const logFile = path.join(logDir, `run-${Date.now() / 1000}.log`);
|
||||||
@ -27,11 +27,13 @@ async function setupLogging() {
|
|||||||
await ensureDir(logDir);
|
await ensureDir(logDir);
|
||||||
const logStream = await Deno.open(logFile, { write: true, create: true, append: true });
|
const logStream = await Deno.open(logFile, { write: true, create: true, append: true });
|
||||||
|
|
||||||
const redirectConsole = (originalConsole: (...args: any[]) => void) => (...args: any[]) => {
|
const redirectConsole =
|
||||||
const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" ");
|
(originalConsole: (...args: any[]) => void) =>
|
||||||
originalConsole(message);
|
(...args: any[]) => {
|
||||||
logStream.write(new TextEncoder().encode(message + "\n"));
|
const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" ");
|
||||||
};
|
originalConsole(message);
|
||||||
|
logStream.write(new TextEncoder().encode(message + "\n"));
|
||||||
|
};
|
||||||
|
|
||||||
console.log = redirectConsole(console.log);
|
console.log = redirectConsole(console.log);
|
||||||
console.error = redirectConsole(console.error);
|
console.error = redirectConsole(console.error);
|
||||||
@ -49,14 +51,36 @@ function isRateLimited(): boolean {
|
|||||||
|
|
||||||
async function insertAidsToDB() {
|
async function insertAidsToDB() {
|
||||||
const aidRawcontent = await Deno.readTextFile(aidPath);
|
const aidRawcontent = await Deno.readTextFile(aidPath);
|
||||||
const aids = aidRawcontent.split("\n").filter((line) => line.length > 0).map((line) => parseInt(line));
|
const aids = aidRawcontent
|
||||||
|
.split("\n")
|
||||||
|
.filter((line) => line.length > 0)
|
||||||
|
.map((line) => parseInt(line));
|
||||||
|
|
||||||
if (!db.prepare("SELECT COUNT(*) FROM bili_info_crawl").get()) {
|
// if (!db.prepare("SELECT COUNT(*) FROM bili_info_crawl").get()) {
|
||||||
const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
|
// const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
|
||||||
aids.forEach((aid) => insertStmt.run(aid));
|
// aids.forEach((aid) => insertStmt.run(aid));
|
||||||
}
|
// }
|
||||||
|
|
||||||
const aidsInDB = db.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
|
// 查询数据库中已经存在的 aid
|
||||||
|
const existingAids = db
|
||||||
|
.prepare("SELECT aid FROM bili_info_crawl")
|
||||||
|
.all()
|
||||||
|
.map((row) => row.aid);
|
||||||
|
console.log(existingAids.length);
|
||||||
|
|
||||||
|
// 将 existingAids 转换为 Set 以提高查找效率
|
||||||
|
const existingAidsSet = new Set(existingAids);
|
||||||
|
|
||||||
|
// 找出 aids 数组中不存在于数据库的条目
|
||||||
|
const newAids = aids.filter((aid) => !existingAidsSet.has(aid));
|
||||||
|
console.log(newAids.length);
|
||||||
|
|
||||||
|
// 插入这些新条目
|
||||||
|
const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
|
||||||
|
newAids.forEach((aid) => insertStmt.run(aid));
|
||||||
|
|
||||||
|
const aidsInDB = db
|
||||||
|
.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
|
||||||
.all()
|
.all()
|
||||||
.map((row) => row.aid) as number[];
|
.map((row) => row.aid) as number[];
|
||||||
|
|
||||||
@ -68,18 +92,18 @@ async function insertAidsToDB() {
|
|||||||
try {
|
try {
|
||||||
const res = await getBiliBiliVideoInfo(aid, regions[processedAids % regions.length]);
|
const res = await getBiliBiliVideoInfo(aid, regions[processedAids % regions.length]);
|
||||||
if (res === null) {
|
if (res === null) {
|
||||||
updateAidStatus(aid, 'failed');
|
updateAidStatus(aid, "failed");
|
||||||
} else {
|
} else {
|
||||||
const rawData = JSON.parse(res);
|
const rawData = JSON.parse(res);
|
||||||
if (rawData.code === 0) {
|
if (rawData.code === 0) {
|
||||||
updateAidStatus(aid, 'success', rawData.data.View.bvid, JSON.stringify(rawData.data));
|
updateAidStatus(aid, "success", rawData.data.View.bvid, JSON.stringify(rawData.data));
|
||||||
} else {
|
} else {
|
||||||
updateAidStatus(aid, 'error', undefined, res);
|
updateAidStatus(aid, "error", undefined, res);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(`Error updating aid ${aid}: ${error}`);
|
console.error(`Error updating aid ${aid}: ${error}`);
|
||||||
updateAidStatus(aid, 'failed');
|
updateAidStatus(aid, "failed");
|
||||||
} finally {
|
} finally {
|
||||||
processedAids++;
|
processedAids++;
|
||||||
logProgress(aid, processedAids, totalAids, startTime);
|
logProgress(aid, processedAids, totalAids, startTime);
|
||||||
@ -108,8 +132,8 @@ function updateAidStatus(aid: number, status: string, bvid?: string, data?: stri
|
|||||||
const stmt = db.prepare(`
|
const stmt = db.prepare(`
|
||||||
UPDATE bili_info_crawl
|
UPDATE bili_info_crawl
|
||||||
SET status = ?,
|
SET status = ?,
|
||||||
${bvid ? 'bvid = ?,' : ''}
|
${bvid ? "bvid = ?," : ""}
|
||||||
${data ? 'data = ?,' : ''}
|
${data ? "data = ?," : ""}
|
||||||
timestamp = ?
|
timestamp = ?
|
||||||
WHERE aid = ?
|
WHERE aid = ?
|
||||||
`);
|
`);
|
||||||
@ -130,9 +154,15 @@ function logProgress(aid: number, processedAids: number, totalAids: number, star
|
|||||||
const etaMinutes = Math.floor(etaSeconds / 60);
|
const etaMinutes = Math.floor(etaSeconds / 60);
|
||||||
const etaHours = Math.floor(etaMinutes / 60);
|
const etaHours = Math.floor(etaMinutes / 60);
|
||||||
|
|
||||||
const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(2)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(elapsedSeconds % 60).toString().padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
|
const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(
|
||||||
|
2
|
||||||
|
)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(
|
||||||
|
elapsedSeconds % 60
|
||||||
|
)
|
||||||
|
.toString()
|
||||||
|
.padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
|
||||||
console.log(`Updated aid ${aid}, ${progress}`);
|
console.log(`Updated aid ${aid}, ${progress}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
await setupLogging();
|
await setupLogging();
|
||||||
insertAidsToDB();
|
insertAidsToDB();
|
||||||
|
@ -47,7 +47,7 @@ async function proxyRequestWithRegion(url: string, region: string): Promise<any
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (e){
|
catch (e){
|
||||||
console.error(`Error proxying requestt ${url} to ${region}: ${e}`);
|
console.error(`Error proxying request ${url} to ${region}: ${e}`);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user