From 21be7883c43519a81fd15d7c73948ea81cf3a492 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Sun, 27 Jul 2025 15:28:02 +0800
Subject: [PATCH] update: metadata archive script

---
 packages/crawler/net/getVideoDetails.ts |   4 +-
 src/metadataArchive.ts                  | 147 +++++++++++++++++++++++-
 2 files changed, 146 insertions(+), 5 deletions(-)

diff --git a/packages/crawler/net/getVideoDetails.ts b/packages/crawler/net/getVideoDetails.ts
index 13c942e..432cbf2 100644
--- a/packages/crawler/net/getVideoDetails.ts
+++ b/packages/crawler/net/getVideoDetails.ts
@@ -2,9 +2,9 @@ import networkDelegate from "@core/net/delegate.ts";
 import type { VideoDetailsData, VideoDetailsResponse } from "@core/net/bilibili.d.ts";
 import logger from "@core/log/logger.ts";
 
-export async function getVideoDetails(aid: number): Promise {
+export async function getVideoDetails(aid: number, archive: boolean = false): Promise {
 	const url = `https://api.bilibili.com/x/web-interface/view/detail?aid=${aid}`;
-	const data = await networkDelegate.request(url, "getVideoInfo");
+	const data = await networkDelegate.request(url, archive ? "" : "getVideoInfo");
 	const errMessage = `Error fetching metadata for ${aid}:`;
 	if (data.code !== 0) {
 		logger.error(errMessage + data.code + "-" + data.message, "net", "fn:getVideoInfo");
diff --git a/src/metadataArchive.ts b/src/metadataArchive.ts
index 1f6ad77..bb2bffb 100644
--- a/src/metadataArchive.ts
+++ b/src/metadataArchive.ts
@@ -1,24 +1,92 @@
 import arg from "arg";
-//import { getVideoDetails } from "@crawler/net/getVideoDetails";
+import { Database } from "bun:sqlite";
+import { getVideoDetails } from "@crawler/net/getVideoDetails";
 import logger from "@core/log/logger";
 
+const SECOND = 1000;
+const SECONDS = SECOND;
+const MINUTE = 60 * SECONDS;
+const MINUTES = MINUTE;
+const IPs = 6;
+
+// Sliding-window limits, each scaled by `IPs`; a request may start only while none of them is reached.
+const rateLimits = [
+	{ window: 5 * MINUTES, maxRequests: 160 * IPs },
+	{ window: 30 * SECONDS, maxRequests: 20 * IPs },
+	{ window: 1.2 * SECOND, maxRequests: 1 * IPs }
+];
+const longestWindow = Math.max(...rateLimits.map(({ window }) => window));
+
+// Start times (ms since epoch) of recent requests, oldest first.
+const requestQueue: number[] = [];
+
+/**
+ * Reports whether starting another request now would reach any of the rate limits.
+ * Also drops timestamps older than the longest window so the queue stays bounded.
+ */
+function isRateLimited(): boolean {
+	const now = Date.now();
+	const cutoff = now - longestWindow;
+	const firstLive = requestQueue.findIndex((timestamp) => timestamp >= cutoff);
+	requestQueue.splice(0, firstLive === -1 ? requestQueue.length : firstLive);
+	return rateLimits.some(({ window, maxRequests }) => {
+		const windowStart = now - window;
+		const requestsInWindow = requestQueue.filter((timestamp) => timestamp >= windowStart).length;
+		return requestsInWindow >= maxRequests;
+	});
+}
+
+/**
+ * Prints one progress line after an aid is handled: count, percentage, elapsed time
+ * and an ETA extrapolated from the average time per processed aid.
+ */
+function logProgress(aid: number, processedAids: number, totalAids: number, startTime: number) {
+	const elapsedTime = Date.now() - startTime;
+	const elapsedSeconds = Math.floor(elapsedTime / 1000);
+	const elapsedMinutes = Math.floor(elapsedSeconds / 60);
+	const elapsedHours = Math.floor(elapsedMinutes / 60);
+
+	const remainingAids = totalAids - processedAids;
+	const averageTimePerAid = elapsedTime / processedAids;
+	const eta = remainingAids * averageTimePerAid;
+	const etaSeconds = Math.floor(eta / 1000);
+	const etaMinutes = Math.floor(etaSeconds / 60);
+	const etaHours = Math.floor(etaMinutes / 60);
+
+	const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(
+		2
+	)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(
+		elapsedSeconds % 60
+	)
+		.toString()
+		.padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
+	console.log(`Updated aid ${aid}, ${progress}`);
+}
+
 const quit = (reason: string) => {
 	logger.error(reason);
 	process.exit();
 };
 
 const args = arg({
-	"--aids": String // --port or --port=
+	"--aids": String,
+	"--db": String
 });
 
 const aidsFileName = args["--aids"];
+const dbPath = args["--db"];
 
 if (!aidsFileName) {
 	quit("Missing --aids ");
 }
 
+if (!dbPath) {
+	quit("Missing --db ");
+}
+
 const aidsFile = Bun.file(aidsFileName!);
 const fileExists = await aidsFile.exists();
+
 if (!fileExists) {
-	quit(`${aidsFile} does not exist.`);
+	quit(`${aidsFileName} does not exist.`);
 }
@@ -30,3 +98,76 @@ const aids = aidsText
 	.filter((num) => !Number.isNaN(num));
 
 logger.log(`Read ${aids.length} aids.`);
+
+const db = new Database(dbPath);
+const existingAids = db.query<{ aid: number }, []>(`SELECT aid from bili_info_crawl`).all();
+logger.log(`Existing Aids: ${existingAids.length}`);
+const existingAidsSet = new Set(existingAids.map((a) => a.aid));
+const newAids = aids.filter((aid) => !existingAidsSet.has(aid));
+logger.log(`New Aids: ${newAids.length}`);
+
+const totalAids = newAids.length;
+let processedAids = 0;
+const startTime = Date.now();
+
+/**
+ * Fetches details for one aid and records the outcome ("success" or "failed") in the database.
+ * Progress is counted and logged whether or not the fetch succeeded.
+ */
+const processAid = async (aid: number) => {
+	try {
+		// NOTE(review): the `archive` flag added to getVideoDetails is not passed here;
+		// confirm whether this script should call getVideoDetails(aid, true).
+		const data = await getVideoDetails(aid);
+		if (data === null) {
+			updateAidStatus(aid, "failed");
+		} else {
+			updateAidStatus(aid, "success", data.View.bvid, JSON.stringify(data));
+		}
+	} catch (error) {
+		console.error(`Error updating aid ${aid}: ${error}`);
+		updateAidStatus(aid, "failed");
+	} finally {
+		processedAids++;
+		logProgress(aid, processedAids, totalAids, startTime);
+	}
+};
+
+// Every 50 ms, start one more request if the rate limits allow it.
+const interval = setInterval(async () => {
+	if (newAids.length === 0) {
+		// The queue is drained, but requests may still be in flight; finish only once all are counted.
+		if (processedAids < totalAids) {
+			return;
+		}
+		clearInterval(interval);
+		console.log("All aids processed.");
+		return;
+	}
+	if (!isRateLimited()) {
+		const aid = newAids.shift();
+		if (aid !== undefined) {
+			requestQueue.push(Date.now());
+			await processAid(aid);
+		}
+	}
+}, 50);
+
+/**
+ * Inserts one row into `bili_info_crawl` for the given aid, stamped with the current time in seconds.
+ * `bvid` and `data` are stored as NULL when omitted or empty.
+ */
+function updateAidStatus(aid: number, status: string, bvid?: string, data?: string) {
+	const query = db.query(`
+		INSERT INTO bili_info_crawl
+		(aid, bvid, status, data, timestamp)
+		VALUES ($aid, $bvid, $status, $data, $timestamp)
+	`);
+	query.run({
+		$aid: aid,
+		$bvid: bvid || null,
+		$status: status || null,
+		$data: data || null,
+		$timestamp: Date.now() / 1000
+	});
+}