diff --git a/.gitignore b/.gitignore index bd7f803..dc60466 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,6 @@ package-lock.json /lib/ # project specific -data/main.db \ No newline at end of file +data/main.db +.env +logs/ \ No newline at end of file diff --git a/deno.json b/deno.json index 94fdc6f..e326907 100644 --- a/deno.json +++ b/deno.json @@ -1,6 +1,6 @@ { "tasks": { - "crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net src/db/raw/insertAidsToDB.ts" + "crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts" }, "imports": { "@std/assert": "jsr:@std/assert@1", diff --git a/deno.lock b/deno.lock index bb362f9..0d4d54e 100644 --- a/deno.lock +++ b/deno.lock @@ -331,6 +331,32 @@ "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" } }, + "remote": { + "https://deno.land/std@0.113.0/_util/assert.ts": "2f868145a042a11d5ad0a3c748dcf580add8a0dbc0e876eaa0026303a5488f58", + "https://deno.land/std@0.113.0/_util/os.ts": "dfb186cc4e968c770ab6cc3288bd65f4871be03b93beecae57d657232ecffcac", + "https://deno.land/std@0.113.0/fs/_util.ts": "f2ce811350236ea8c28450ed822a5f42a0892316515b1cd61321dec13569c56b", + "https://deno.land/std@0.113.0/fs/copy.ts": "631bbafbfe6cba282158abc8aeb7e8251cc69a7ec28ce12878ea1b75fec2add4", + "https://deno.land/std@0.113.0/fs/empty_dir.ts": "5f08b263dd064dc7917c4bbeb13de0f5505a664b9cdfe312fa86e7518cfaeb84", + "https://deno.land/std@0.113.0/fs/ensure_dir.ts": "b7c103dc41a3d1dbbb522bf183c519c37065fdc234831a4a0f7d671b1ed5fea7", + "https://deno.land/std@0.113.0/fs/ensure_file.ts": "c06031af24368e80c330897e4b8e9109efc8602ffabc8f3e2306be07529e1d13", + "https://deno.land/std@0.113.0/fs/ensure_link.ts": "26e54363508b822afd87a3f6e873bbbcd6b5993dd638f8170758c16262a75065", + "https://deno.land/std@0.113.0/fs/ensure_symlink.ts": "c07b6d19ef58b6f5c671ffa942e7f9be50315f4f78e2f9f511626fd2e13beccc", + "https://deno.land/std@0.113.0/fs/eol.ts": "afaebaaac36f48c423b920c836551997715672b80a0fee9aa7667c181a94f2df", + "https://deno.land/std@0.113.0/fs/exists.ts": "c3c3335a212bd945bb75df379096ab57fb6c86598fa273dfb24da3b3939a951e", + "https://deno.land/std@0.113.0/fs/expand_glob.ts": "7c9173f93044051456b829a3f5a3676e58ba70b6ce4aae62cf24757b58556205", + "https://deno.land/std@0.113.0/fs/mod.ts": "26eee4b52a8c516e37d464094b080ff6822883e7f01ff0ba0a72b8dcd54b9927", + "https://deno.land/std@0.113.0/fs/move.ts": "4623058e39bbbeb3ad30aeff9c974c55d2d574ad7c480295c12b04c244686a99", + "https://deno.land/std@0.113.0/fs/walk.ts": "f633829f967d2979ab285dbfb09eb0d7d000fd175b95156b63fcede435d1a807", + "https://deno.land/std@0.113.0/path/_constants.ts": "1247fee4a79b70c89f23499691ef169b41b6ccf01887a0abd131009c5581b853", + "https://deno.land/std@0.113.0/path/_interface.ts": "1fa73b02aaa24867e481a48492b44f2598cd9dfa513c7b34001437007d3642e4", + "https://deno.land/std@0.113.0/path/_util.ts": "2e06a3b9e79beaf62687196bd4b60a4c391d862cfa007a20fc3a39f778ba073b", + "https://deno.land/std@0.113.0/path/common.ts": "f41a38a0719a1e85aa11c6ba3bea5e37c15dd009d705bd8873f94c833568cbc4", + "https://deno.land/std@0.113.0/path/glob.ts": "ea87985765b977cc284b92771003b2070c440e0807c90e1eb0ff3e095911a820", + "https://deno.land/std@0.113.0/path/mod.ts": "4465dc494f271b02569edbb4a18d727063b5dbd6ed84283ff906260970a15d12", + "https://deno.land/std@0.113.0/path/posix.ts": "34349174b9cd121625a2810837a82dd8b986bbaaad5ade690d1de75bbb4555b2", + "https://deno.land/std@0.113.0/path/separator.ts": "8fdcf289b1b76fd726a508f57d3370ca029ae6976fcde5044007f062e643ff1c", + "https://deno.land/std@0.113.0/path/win32.ts": "11549e8c6df8307a8efcfa47ad7b2a75da743eac7d4c89c9723a944661c8bd2e" + }, "workspace": { "dependencies": [ "jsr:@std/assert@1", diff --git a/src/crawler/bilibili/videoInfo.ts b/src/crawler/bilibili/videoInfo.ts deleted file mode 100644 index 4fb90d9..0000000 --- a/src/crawler/bilibili/videoInfo.ts +++ /dev/null @@ -1,42 +0,0 @@ -import axios from "axios"; - -// 随机选择 User-Agent -function getRandomUserAgent() { - const userAgents = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", - "Mozilla/5.0 (Linux; Android 10; Pixel 3 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Mobile Safari/537.36", - "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1", - // 可以添加更多的 User-Agent 字符串 - ]; - const randomIndex = Math.floor(Math.random() * userAgents.length); - return userAgents[randomIndex]; -} - -export function getBiliBiliVideoInfo(bvidORaid?: string | number) { - const bvid = typeof bvidORaid === "string" ? bvidORaid : undefined; - const aid = typeof bvidORaid === "number" ? bvidORaid : undefined; - if (!bvid && !aid) { - return null; - } - const baseURL = "https://api.bilibili.com/x/web-interface/view/detail"; - const headers = { - 'User-Agent': getRandomUserAgent(), // 添加随机 User-Agent - }; - - if (aid) { - return axios.get(baseURL, { - params: { - aid: aid, - }, - headers: headers, // 将 headers 添加到请求中 - }); - } else { - return axios.get(baseURL, { - params: { - bvid: bvid, - }, - headers: headers, // 将 headers 添加到请求中 - }); - } -} diff --git a/src/db/raw/aliyun-fc.mjs b/src/db/raw/aliyun-fc.mjs new file mode 100644 index 0000000..6b54405 --- /dev/null +++ b/src/db/raw/aliyun-fc.mjs @@ -0,0 +1,77 @@ +'use strict'; + +export const handler = async (event, context) => { + const eventObj = JSON.parse(event); + console.log(`receive event: ${JSON.stringify(eventObj)}`); + + let body = 'Missing parameter: URL'; + let statusCode = 400; + + // User-Agent list + const userAgents = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15', + 'Mozilla/5.0 (Linux; Android 10; Pixel 3 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Mobile Safari/537.36', + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/89.0' + ]; + + // get http request body + if ("body" in eventObj) { + body = eventObj.body; + if (eventObj.isBase64Encoded) { + body = Buffer.from(body, 'base64').toString('utf-8'); + } + } + console.log(`receive http body: ${body}`); + + // proxy the URL if it exists in eventObj + const refererUrl = 'https://www.bilibili.com/'; // Replace with your desired referer and origin + + if ("url" in eventObj) { + try { + const randomUserAgent = userAgents[Math.floor(Math.random() * userAgents.length)]; + const response = await fetch(eventObj.url, { + headers: { + 'User-Agent': randomUserAgent, + 'Referer': refererUrl + } + }); + statusCode = response.status; + body = await response.text(); + } catch (error) { + statusCode = 500; + body = `Error fetching URL: ${error.message}`; + } + } else if ("urls" in eventObj && Array.isArray(eventObj.urls)) { + const requests = eventObj.urls.map(async url => { + try { + const randomUserAgent = userAgents[Math.floor(Math.random() * userAgents.length)]; + const response = await fetch(url, { + headers: { + 'User-Agent': randomUserAgent, + 'Referer': refererUrl + } + }); + const responseBody = await response.text(); + return { + statusCode: response.status, + body: responseBody + }; + } catch (error) { + return { + statusCode: 500, + body: `Error fetching URL: ${error.message}` + }; + } + }); + + body = await Promise.all(requests); + statusCode = 200; // Assuming all URLs were processed successfully + } + + return { + 'statusCode': statusCode, + 'body': JSON.stringify(body) + }; +}; diff --git a/src/db/raw/insertAidsToDB.ts b/src/db/raw/insertAidsToDB.ts index fd6697e..bbc8ffa 100644 --- a/src/db/raw/insertAidsToDB.ts +++ b/src/db/raw/insertAidsToDB.ts @@ -1,32 +1,62 @@ import path from "node:path"; import { Database } from "jsr:@db/sqlite@0.12"; -import { getBiliBiliVideoInfo } from "../../crawler/bilibili/videoInfo.ts"; - -const aidPath = path.join("./data/2025010104_c30_aids.txt"); +import { getBiliBiliVideoInfo } from "./videoInfo.ts"; +import { ensureDir } from "https://deno.land/std@0.113.0/fs/mod.ts"; +const aidPath = "./data/2025010104_c30_aids.txt"; const db = new Database("./data/main.db"); +const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"]; +const logDir = "./logs/bili-info-crawl"; +const logFile = path.join(logDir, `run-${Date.now() / 1000}.log`); + +const SECOND = 1000; +const SECONDS = SECOND; +const MINUTE = 60 * SECONDS; +const MINUTES = MINUTE; +const IPs = regions.length; + +const rateLimits = [ + { window: 5 * MINUTES, maxRequests: 160 * IPs }, + { window: 30 * SECONDS, maxRequests: 20 * IPs }, + { window: 1.2 * SECOND, maxRequests: 1 * IPs }, +]; + +const requestQueue: number[] = []; + +async function setupLogging() { + await ensureDir(logDir); + const logStream = await Deno.open(logFile, { write: true, create: true, append: true }); + + const redirectConsole = (originalConsole: (...args: any[]) => void) => (...args: any[]) => { + const message = args.map((arg) => (typeof arg === "object" ? JSON.stringify(arg) : arg)).join(" "); + originalConsole(message); + logStream.write(new TextEncoder().encode(message + "\n")); + }; + + console.log = redirectConsole(console.log); + console.error = redirectConsole(console.error); + console.warn = redirectConsole(console.warn); +} + +function isRateLimited(): boolean { + const now = Date.now(); + return rateLimits.some(({ window, maxRequests }) => { + const windowStart = now - window; + const requestsInWindow = requestQueue.filter((timestamp) => timestamp >= windowStart).length; + return requestsInWindow >= maxRequests; + }); +} async function insertAidsToDB() { const aidRawcontent = await Deno.readTextFile(aidPath); - const aids = aidRawcontent - .split("\n") - .filter((line) => line.length > 0) - .map((line) => parseInt(line)); + const aids = aidRawcontent.split("\n").filter((line) => line.length > 0).map((line) => parseInt(line)); - // Insert aids into database with status 'pending' - // if bili_info_crawl contains at least one row, skip it - if (db.prepare("SELECT * FROM bili_info_crawl").all().length == 0) { - const insertStmt = db.prepare(` - INSERT OR IGNORE INTO bili_info_crawl (aid, status) - VALUES (?, 'pending') - `); - for (const aid of aids) { - insertStmt.run(aid); - } + if (!db.prepare("SELECT COUNT(*) FROM bili_info_crawl").get()) { + const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')"); + aids.forEach((aid) => insertStmt.run(aid)); } - const aidsInDB = db - .prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'") + const aidsInDB = db.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'") .all() .map((row) => row.aid) as number[]; @@ -34,72 +64,75 @@ async function insertAidsToDB() { let processedAids = 0; const startTime = Date.now(); - // Update database with video info - for (const aid of aidsInDB) { + const processAid = async (aid: number) => { try { - const res = await getBiliBiliVideoInfo(aid); - if (res?.data.code !== 0) { - const data = res?.data; - db.prepare( - ` - UPDATE bili_info_crawl - SET status = 'error', - data = ? - WHERE aid = ? - ` - ).run(aid, JSON.stringify(data)); + const res = await getBiliBiliVideoInfo(aid, regions[processedAids % regions.length]); + if (res === null) { + updateAidStatus(aid, 'failed'); } else { - const data = res.data.data; - db.prepare( - ` - UPDATE bili_info_crawl - SET status = 'success', - bvid = ?, - data = ? - WHERE aid = ? - ` - ).run(data.View.bvid, JSON.stringify(data), aid); + const rawData = JSON.parse(res); + if (rawData.code === 0) { + updateAidStatus(aid, 'success', rawData.data.View.bvid, JSON.stringify(rawData.data)); + } else { + updateAidStatus(aid, 'error', undefined, res); + } } } catch (error) { console.error(`Error updating aid ${aid}: ${error}`); - try { - db.prepare( - ` - UPDATE bili_info_crawl - SET status = 'failed' - WHERE aid = ? - ` - ).run(aid); - } - catch (error) { - console.error(`Error wrting to db for aid ${aid}: ${error}`); - } + updateAidStatus(aid, 'failed'); } finally { processedAids++; - const elapsedTime = Date.now() - startTime; - const elapsedSeconds = Math.floor(elapsedTime / 1000); - const elapsedMinutes = Math.floor(elapsedSeconds / 60); - const elapsedHours = Math.floor(elapsedMinutes / 60); - const remainingAids = totalAids - processedAids; - - // Calculate ETA - const averageTimePerAid = elapsedTime / processedAids; - const eta = remainingAids * averageTimePerAid; - const etaSeconds = Math.floor(eta / 1000); - const etaMinutes = Math.floor(etaSeconds / 60); - const etaHours = Math.floor(etaMinutes / 60); - - // Output progress - const progress = `${processedAids}/${totalAids}, ${(processedAids / totalAids * 100).toFixed(2)}%, elapsed ${elapsedHours.toString().padStart(2, '0')}:${(elapsedMinutes % 60).toString().padStart(2, '0')}:${(elapsedSeconds % 60).toString().padStart(2, '0')}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, '0')}m`; - - if (Math.random() > 0.95) { - console.log("Sleeping..."); - const time = Math.random() * 5 * 1000; - await new Promise((resolve) => setTimeout(resolve, time)); - } - console.log(`Updated aid ${aid}, ${progress}`); + logProgress(aid, processedAids, totalAids, startTime); } - } + }; + + const interval = setInterval(async () => { + if (aidsInDB.length === 0) { + clearInterval(interval); + console.log("All aids processed."); + return; + } + if (!isRateLimited()) { + const aid = aidsInDB.shift(); + if (aid !== undefined) { + requestQueue.push(Date.now()); + await processAid(aid); + } + } + }, 50); + + console.log("Starting to process aids..."); } -insertAidsToDB(); +function updateAidStatus(aid: number, status: string, bvid?: string, data?: string) { + const stmt = db.prepare(` + UPDATE bili_info_crawl + SET status = ?, + ${bvid ? 'bvid = ?,' : ''} + ${data ? 'data = ?,' : ''} + timestamp = ? + WHERE aid = ? + `); + const params = [status, ...(bvid ? [bvid] : []), ...(data ? [data] : []), Date.now() / 1000, aid]; + stmt.run(...params); +} + +function logProgress(aid: number, processedAids: number, totalAids: number, startTime: number) { + const elapsedTime = Date.now() - startTime; + const elapsedSeconds = Math.floor(elapsedTime / 1000); + const elapsedMinutes = Math.floor(elapsedSeconds / 60); + const elapsedHours = Math.floor(elapsedMinutes / 60); + + const remainingAids = totalAids - processedAids; + const averageTimePerAid = elapsedTime / processedAids; + const eta = remainingAids * averageTimePerAid; + const etaSeconds = Math.floor(eta / 1000); + const etaMinutes = Math.floor(etaSeconds / 60); + const etaHours = Math.floor(etaMinutes / 60); + + const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(2)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(elapsedSeconds % 60).toString().padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`; + console.log(`Updated aid ${aid}, ${progress}`); +} + +await setupLogging(); +insertAidsToDB(); \ No newline at end of file diff --git a/src/db/raw/videoInfo.ts b/src/db/raw/videoInfo.ts new file mode 100644 index 0000000..70b898e --- /dev/null +++ b/src/db/raw/videoInfo.ts @@ -0,0 +1,53 @@ +export async function getBiliBiliVideoInfo(bvidORaid?: string | number, region: string = "hangzhou") { + const bvid = typeof bvidORaid === "string" ? bvidORaid : undefined; + const aid = typeof bvidORaid === "number" ? bvidORaid : undefined; + + const baseURL = "https://api.bilibili.com/x/web-interface/view/detail"; + const urlObject = new URL(baseURL); + + if (aid) { + urlObject.searchParams.append("aid", aid.toString()); + const finalURL = urlObject.toString(); + return await proxyRequestWithRegion(finalURL, region); + } else if (bvid) { + urlObject.searchParams.append("bvid", bvid); + const finalURL = urlObject.toString(); + return await proxyRequestWithRegion(finalURL, region); + } else { + return null; + } +} + +async function proxyRequestWithRegion(url: string, region: string): Promise { + const td = new TextDecoder(); + const p = await new Deno.Command("aliyun", { + args: [ + "fc", + "POST", + `/2023-03-30/functions/proxy-${region}/invocations`, + "--qualifier", + "LATEST", + "--header", + "Content-Type=application/json;x-fc-invocation-type=Sync;x-fc-log-type=None;", + "--body", + JSON.stringify({url: url}), + "--profile", + `CVSA-${region}`, + ], + }).output(); + try { + const out = td.decode(p.stdout); + const rawData = JSON.parse(out); + if (rawData.statusCode !== 200) { + console.error(`Error proxying request ${url} to ${region} , statusCode: ${rawData.statusCode}`); + return null; + } + else { + return JSON.parse(rawData.body); + } + } + catch (e){ + console.error(`Error proxying requestt ${url} to ${region}: ${e}`); + return null; + } +} \ No newline at end of file