update: use aliyun FC to proxy requests

This commit is contained in:
alikia2x (寒寒) 2025-01-12 01:07:45 +08:00
parent 75110be3c1
commit 8da76839db
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
7 changed files with 271 additions and 122 deletions

2
.gitignore vendored
View File

@ -67,3 +67,5 @@ package-lock.json
# project specific
data/main.db
.env
logs/

View File

@ -1,6 +1,6 @@
{
"tasks": {
"crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net src/db/raw/insertAidsToDB.ts"
"crawl-raw-bili": "deno --allow-env --allow-ffi --allow-read --allow-net --allow-write --allow-run src/db/raw/insertAidsToDB.ts"
},
"imports": {
"@std/assert": "jsr:@std/assert@1",

View File

@ -331,6 +331,32 @@
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="
}
},
"remote": {
"https://deno.land/std@0.113.0/_util/assert.ts": "2f868145a042a11d5ad0a3c748dcf580add8a0dbc0e876eaa0026303a5488f58",
"https://deno.land/std@0.113.0/_util/os.ts": "dfb186cc4e968c770ab6cc3288bd65f4871be03b93beecae57d657232ecffcac",
"https://deno.land/std@0.113.0/fs/_util.ts": "f2ce811350236ea8c28450ed822a5f42a0892316515b1cd61321dec13569c56b",
"https://deno.land/std@0.113.0/fs/copy.ts": "631bbafbfe6cba282158abc8aeb7e8251cc69a7ec28ce12878ea1b75fec2add4",
"https://deno.land/std@0.113.0/fs/empty_dir.ts": "5f08b263dd064dc7917c4bbeb13de0f5505a664b9cdfe312fa86e7518cfaeb84",
"https://deno.land/std@0.113.0/fs/ensure_dir.ts": "b7c103dc41a3d1dbbb522bf183c519c37065fdc234831a4a0f7d671b1ed5fea7",
"https://deno.land/std@0.113.0/fs/ensure_file.ts": "c06031af24368e80c330897e4b8e9109efc8602ffabc8f3e2306be07529e1d13",
"https://deno.land/std@0.113.0/fs/ensure_link.ts": "26e54363508b822afd87a3f6e873bbbcd6b5993dd638f8170758c16262a75065",
"https://deno.land/std@0.113.0/fs/ensure_symlink.ts": "c07b6d19ef58b6f5c671ffa942e7f9be50315f4f78e2f9f511626fd2e13beccc",
"https://deno.land/std@0.113.0/fs/eol.ts": "afaebaaac36f48c423b920c836551997715672b80a0fee9aa7667c181a94f2df",
"https://deno.land/std@0.113.0/fs/exists.ts": "c3c3335a212bd945bb75df379096ab57fb6c86598fa273dfb24da3b3939a951e",
"https://deno.land/std@0.113.0/fs/expand_glob.ts": "7c9173f93044051456b829a3f5a3676e58ba70b6ce4aae62cf24757b58556205",
"https://deno.land/std@0.113.0/fs/mod.ts": "26eee4b52a8c516e37d464094b080ff6822883e7f01ff0ba0a72b8dcd54b9927",
"https://deno.land/std@0.113.0/fs/move.ts": "4623058e39bbbeb3ad30aeff9c974c55d2d574ad7c480295c12b04c244686a99",
"https://deno.land/std@0.113.0/fs/walk.ts": "f633829f967d2979ab285dbfb09eb0d7d000fd175b95156b63fcede435d1a807",
"https://deno.land/std@0.113.0/path/_constants.ts": "1247fee4a79b70c89f23499691ef169b41b6ccf01887a0abd131009c5581b853",
"https://deno.land/std@0.113.0/path/_interface.ts": "1fa73b02aaa24867e481a48492b44f2598cd9dfa513c7b34001437007d3642e4",
"https://deno.land/std@0.113.0/path/_util.ts": "2e06a3b9e79beaf62687196bd4b60a4c391d862cfa007a20fc3a39f778ba073b",
"https://deno.land/std@0.113.0/path/common.ts": "f41a38a0719a1e85aa11c6ba3bea5e37c15dd009d705bd8873f94c833568cbc4",
"https://deno.land/std@0.113.0/path/glob.ts": "ea87985765b977cc284b92771003b2070c440e0807c90e1eb0ff3e095911a820",
"https://deno.land/std@0.113.0/path/mod.ts": "4465dc494f271b02569edbb4a18d727063b5dbd6ed84283ff906260970a15d12",
"https://deno.land/std@0.113.0/path/posix.ts": "34349174b9cd121625a2810837a82dd8b986bbaaad5ade690d1de75bbb4555b2",
"https://deno.land/std@0.113.0/path/separator.ts": "8fdcf289b1b76fd726a508f57d3370ca029ae6976fcde5044007f062e643ff1c",
"https://deno.land/std@0.113.0/path/win32.ts": "11549e8c6df8307a8efcfa47ad7b2a75da743eac7d4c89c9723a944661c8bd2e"
},
"workspace": {
"dependencies": [
"jsr:@std/assert@1",

View File

@ -1,42 +0,0 @@
import axios from "axios";
/**
 * Picks one browser User-Agent string uniformly at random from a fixed list.
 *
 * @returns One of the User-Agent strings listed below.
 */
function getRandomUserAgent() {
    const candidates = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
        "Mozilla/5.0 (Linux; Android 10; Pixel 3 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Mobile Safari/537.36",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
        // More User-Agent strings can be added here.
    ];
    return candidates[Math.floor(Math.random() * candidates.length)];
}
/**
 * Requests the bilibili "view/detail" endpoint for a single video.
 *
 * @param bvidORaid A numeric aid or a string bvid identifying the video.
 * @returns The pending axios request, or `null` when no usable identifier was
 *          given (undefined, an empty string, `0` or `NaN`).
 */
export function getBiliBiliVideoInfo(bvidORaid?: string | number) {
    const hasAid = typeof bvidORaid === "number" && Boolean(bvidORaid);
    const hasBvid = typeof bvidORaid === "string" && Boolean(bvidORaid);
    if (!hasAid && !hasBvid) {
        return null;
    }

    // A numeric identifier is sent as `aid`, a string identifier as `bvid`.
    const params = hasAid ? { aid: bvidORaid } : { bvid: bvidORaid };
    const headers = {
        'User-Agent': getRandomUserAgent(), // send a randomly chosen User-Agent
    };
    return axios.get("https://api.bilibili.com/x/web-interface/view/detail", {
        params: params,
        headers: headers,
    });
}

77
src/db/raw/aliyun-fc.mjs Normal file
View File

@ -0,0 +1,77 @@
'use strict';
export const handler = async (event, context) => {
const eventObj = JSON.parse(event);
console.log(`receive event: ${JSON.stringify(eventObj)}`);
let body = 'Missing parameter: URL';
let statusCode = 400;
// User-Agent list
const userAgents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
'Mozilla/5.0 (Linux; Android 10; Pixel 3 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Mobile Safari/537.36',
'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/89.0'
];
// get http request body
if ("body" in eventObj) {
body = eventObj.body;
if (eventObj.isBase64Encoded) {
body = Buffer.from(body, 'base64').toString('utf-8');
}
}
console.log(`receive http body: ${body}`);
// proxy the URL if it exists in eventObj
const refererUrl = 'https://www.bilibili.com/'; // Replace with your desired referer and origin
if ("url" in eventObj) {
try {
const randomUserAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
const response = await fetch(eventObj.url, {
headers: {
'User-Agent': randomUserAgent,
'Referer': refererUrl
}
});
statusCode = response.status;
body = await response.text();
} catch (error) {
statusCode = 500;
body = `Error fetching URL: ${error.message}`;
}
} else if ("urls" in eventObj && Array.isArray(eventObj.urls)) {
const requests = eventObj.urls.map(async url => {
try {
const randomUserAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
const response = await fetch(url, {
headers: {
'User-Agent': randomUserAgent,
'Referer': refererUrl
}
});
const responseBody = await response.text();
return {
statusCode: response.status,
body: responseBody
};
} catch (error) {
return {
statusCode: 500,
body: `Error fetching URL: ${error.message}`
};
}
});
body = await Promise.all(requests);
statusCode = 200; // Assuming all URLs were processed successfully
}
return {
'statusCode': statusCode,
'body': JSON.stringify(body)
};
};

View File

@ -1,32 +1,62 @@
import path from "node:path";
import { Database } from "jsr:@db/sqlite@0.12";
import { getBiliBiliVideoInfo } from "../../crawler/bilibili/videoInfo.ts";
const aidPath = path.join("./data/2025010104_c30_aids.txt");
import { getBiliBiliVideoInfo } from "./videoInfo.ts";
import { ensureDir } from "https://deno.land/std@0.113.0/fs/mod.ts";
const aidPath = "./data/2025010104_c30_aids.txt";
// SQLite database that holds the bili_info_crawl table used as the crawl work queue.
const db = new Database("./data/main.db");
// Region names; one entry is passed as the `region` argument of getBiliBiliVideoInfo for each request.
const regions = ["shanghai", "hangzhou", "qingdao", "beijing", "zhangjiakou", "chengdu", "shenzhen", "hohhot"];
// Directory and file receiving a copy of everything written through console.log/error/warn.
const logDir = "./logs/bili-info-crawl";
// The file name embeds the start time in epoch seconds; the value is not rounded, so it keeps a fractional part.
const logFile = path.join(logDir, `run-${Date.now() / 1000}.log`);
// Time units expressed in milliseconds; the plural names are aliases so limits read naturally.
const SECOND = 1000;
const SECONDS = SECOND;
const MINUTE = 60 * SECONDS;
const MINUTES = MINUTE;
// Multiplier applied to every limit below.
// NOTE(review): presumably each region's proxy egresses from its own IP address — confirm.
const IPs = regions.length;
// Sliding-window limits: at most `maxRequests` dispatches within any `window` milliseconds.
// isRateLimited() reports true as soon as any one of them is reached.
const rateLimits = [
{ window: 5 * MINUTES, maxRequests: 160 * IPs },
{ window: 30 * SECONDS, maxRequests: 20 * IPs },
{ window: 1.2 * SECOND, maxRequests: 1 * IPs },
];
// Date.now() timestamps of dispatched requests, appended in dispatch order.
const requestQueue: number[] = [];
/**
 * Mirrors console.log / console.error / console.warn into `logFile`.
 *
 * Creates `logDir` if needed, opens the log file in append mode and replaces the
 * three console methods with wrappers that print the message as before and then
 * append the same line to the file. The file handle stays open for the lifetime
 * of the process.
 */
async function setupLogging() {
    await ensureDir(logDir);
    const logStream = await Deno.open(logFile, { write: true, create: true, append: true });
    // One encoder is enough; it is stateless across encode() calls.
    const encoder = new TextEncoder();

    // Writes synchronously so lines land in call order and no promise is left
    // floating; a single write may be short, so loop until the buffer is drained.
    const appendLine = (line: string) => {
        let pending = encoder.encode(line + "\n");
        while (pending.length > 0) {
            const written = logStream.writeSync(pending);
            if (written <= 0) {
                break; // avoid spinning forever if the file stops accepting data
            }
            pending = pending.subarray(written);
        }
    };

    // Errors have no enumerable fields, so JSON.stringify would render them as "{}".
    const formatArg = (arg: unknown): unknown => {
        if (arg instanceof Error) {
            return arg.stack ?? arg.message;
        }
        return typeof arg === "object" ? JSON.stringify(arg) : arg;
    };

    const redirectConsole = (originalConsole: (...args: unknown[]) => void) => (...args: unknown[]) => {
        const message = args.map(formatArg).join(" ");
        originalConsole(message);
        appendLine(message);
    };

    console.log = redirectConsole(console.log);
    console.error = redirectConsole(console.error);
    console.warn = redirectConsole(console.warn);
}
/**
 * Reports whether dispatching another request now would reach any of the
 * sliding-window limits in `rateLimits`.
 *
 * As a side effect, timestamps older than the widest window are removed from
 * the front of `requestQueue`. They can no longer count against any limit, and
 * leaving them in place makes every call re-scan the entire request history.
 *
 * @returns `true` when at least one window already holds `maxRequests` or more
 *          recorded requests.
 */
function isRateLimited(): boolean {
    const now = Date.now();

    // Drop the stale prefix. Timestamps are appended in dispatch order, so
    // everything before the first still-relevant entry is out of every window.
    const widestWindow = Math.max(...rateLimits.map(({ window }) => window));
    const firstRelevant = requestQueue.findIndex((timestamp) => timestamp >= now - widestWindow);
    requestQueue.splice(0, firstRelevant === -1 ? requestQueue.length : firstRelevant);

    return rateLimits.some(({ window, maxRequests }) => {
        const windowStart = now - window;
        const requestsInWindow = requestQueue.filter((timestamp) => timestamp >= windowStart).length;
        return requestsInWindow >= maxRequests;
    });
}
async function insertAidsToDB() {
const aidRawcontent = await Deno.readTextFile(aidPath);
const aids = aidRawcontent
.split("\n")
.filter((line) => line.length > 0)
.map((line) => parseInt(line));
const aids = aidRawcontent.split("\n").filter((line) => line.length > 0).map((line) => parseInt(line));
// Insert aids into database with status 'pending'
// if bili_info_crawl contains at least one row, skip it
if (db.prepare("SELECT * FROM bili_info_crawl").all().length == 0) {
const insertStmt = db.prepare(`
INSERT OR IGNORE INTO bili_info_crawl (aid, status)
VALUES (?, 'pending')
`);
for (const aid of aids) {
insertStmt.run(aid);
}
// Seed the queue only when the table is still empty. A COUNT(*) query always
// yields exactly one row, so negating its `.get()` result is always false and
// the seeding never ran; probing for a single row instead yields `undefined`
// exactly when the table has no rows.
if (db.prepare("SELECT 1 FROM bili_info_crawl LIMIT 1").get() === undefined) {
    const insertStmt = db.prepare("INSERT OR IGNORE INTO bili_info_crawl (aid, status) VALUES (?, 'pending')");
    aids.forEach((aid) => insertStmt.run(aid));
}
const aidsInDB = db
.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
const aidsInDB = db.prepare("SELECT aid FROM bili_info_crawl WHERE status = 'pending' OR status = 'failed'")
.all()
.map((row) => row.aid) as number[];
@ -34,72 +64,75 @@ async function insertAidsToDB() {
let processedAids = 0;
const startTime = Date.now();
// Update database with video info
for (const aid of aidsInDB) {
const processAid = async (aid: number) => {
try {
const res = await getBiliBiliVideoInfo(aid);
if (res?.data.code !== 0) {
const data = res?.data;
db.prepare(
`
UPDATE bili_info_crawl
SET status = 'error',
data = ?
WHERE aid = ?
`
).run(aid, JSON.stringify(data));
const res = await getBiliBiliVideoInfo(aid, regions[processedAids % regions.length]);
if (res === null) {
updateAidStatus(aid, 'failed');
} else {
const data = res.data.data;
db.prepare(
`
UPDATE bili_info_crawl
SET status = 'success',
bvid = ?,
data = ?
WHERE aid = ?
`
).run(data.View.bvid, JSON.stringify(data), aid);
const rawData = JSON.parse(res);
if (rawData.code === 0) {
updateAidStatus(aid, 'success', rawData.data.View.bvid, JSON.stringify(rawData.data));
} else {
updateAidStatus(aid, 'error', undefined, res);
}
}
} catch (error) {
console.error(`Error updating aid ${aid}: ${error}`);
try {
db.prepare(
`
UPDATE bili_info_crawl
SET status = 'failed'
WHERE aid = ?
`
).run(aid);
}
catch (error) {
console.error(`Error wrting to db for aid ${aid}: ${error}`);
}
updateAidStatus(aid, 'failed');
} finally {
processedAids++;
logProgress(aid, processedAids, totalAids, startTime);
}
};
const interval = setInterval(async () => {
if (aidsInDB.length === 0) {
clearInterval(interval);
console.log("All aids processed.");
return;
}
if (!isRateLimited()) {
const aid = aidsInDB.shift();
if (aid !== undefined) {
requestQueue.push(Date.now());
await processAid(aid);
}
}
}, 50);
console.log("Starting to process aids...");
}
/**
 * Writes the crawl outcome for one aid into bili_info_crawl.
 *
 * `status` and `timestamp` (current time in epoch seconds) are always written;
 * the `bvid` and `data` columns are only touched when a non-empty value is given.
 *
 * @param aid    Row to update.
 * @param status New status value.
 * @param bvid   Optional bvid to store.
 * @param data   Optional serialized payload to store.
 */
function updateAidStatus(aid: number, status: string, bvid?: string, data?: string) {
    const bvidClause = bvid ? 'bvid = ?,' : '';
    const dataClause = data ? 'data = ?,' : '';
    const sql = `
UPDATE bili_info_crawl
SET status = ?,
${bvidClause}
${dataClause}
timestamp = ?
WHERE aid = ?
`;

    // Bindings follow the placeholder order of the statement above.
    const bindings: (string | number)[] = [status];
    if (bvid) {
        bindings.push(bvid);
    }
    if (data) {
        bindings.push(data);
    }
    bindings.push(Date.now() / 1000, aid);

    db.prepare(sql).run(...bindings);
}
function logProgress(aid: number, processedAids: number, totalAids: number, startTime: number) {
const elapsedTime = Date.now() - startTime;
const elapsedSeconds = Math.floor(elapsedTime / 1000);
const elapsedMinutes = Math.floor(elapsedSeconds / 60);
const elapsedHours = Math.floor(elapsedMinutes / 60);
const remainingAids = totalAids - processedAids;
// Calculate ETA
const remainingAids = totalAids - processedAids;
const averageTimePerAid = elapsedTime / processedAids;
const eta = remainingAids * averageTimePerAid;
const etaSeconds = Math.floor(eta / 1000);
const etaMinutes = Math.floor(etaSeconds / 60);
const etaHours = Math.floor(etaMinutes / 60);
// Output progress
const progress = `${processedAids}/${totalAids}, ${(processedAids / totalAids * 100).toFixed(2)}%, elapsed ${elapsedHours.toString().padStart(2, '0')}:${(elapsedMinutes % 60).toString().padStart(2, '0')}:${(elapsedSeconds % 60).toString().padStart(2, '0')}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, '0')}m`;
if (Math.random() > 0.95) {
console.log("Sleeping...");
const time = Math.random() * 5 * 1000;
await new Promise((resolve) => setTimeout(resolve, time));
}
const progress = `${processedAids}/${totalAids}, ${((processedAids / totalAids) * 100).toFixed(2)}%, elapsed ${elapsedHours.toString().padStart(2, "0")}:${(elapsedMinutes % 60).toString().padStart(2, "0")}:${(elapsedSeconds % 60).toString().padStart(2, "0")}, ETA ${etaHours}h${(etaMinutes % 60).toString().padStart(2, "0")}m`;
console.log(`Updated aid ${aid}, ${progress}`);
}
}
}
await setupLogging();
insertAidsToDB();

53
src/db/raw/videoInfo.ts Normal file
View File

@ -0,0 +1,53 @@
/**
 * Fetches the bilibili "view/detail" response for one video through the
 * regional proxy.
 *
 * @param bvidORaid A numeric aid or a string bvid identifying the video.
 * @param region    Region name forwarded to `proxyRequestWithRegion`.
 * @returns Whatever `proxyRequestWithRegion` resolves to, or `null` when no
 *          usable identifier was given (undefined, an empty string, `0` or `NaN`).
 */
export async function getBiliBiliVideoInfo(bvidORaid?: string | number, region: string = "hangzhou") {
    // Decide which query parameter identifies the video; a number wins as `aid`.
    let query: [string, string] | null = null;
    if (typeof bvidORaid === "number" && bvidORaid) {
        query = ["aid", bvidORaid.toString()];
    } else if (typeof bvidORaid === "string" && bvidORaid) {
        query = ["bvid", bvidORaid];
    }
    if (query === null) {
        return null;
    }

    const target = new URL("https://api.bilibili.com/x/web-interface/view/detail");
    target.searchParams.append(...query);
    return await proxyRequestWithRegion(target.toString(), region);
}
/**
 * Invokes the `proxy-<region>` function through the `aliyun` CLI and returns
 * the proxied response body.
 *
 * The CLI is run with the `CVSA-<region>` profile and a JSON body of
 * `{ url }`. Its stdout is expected to be JSON of the form
 * `{ statusCode, body }`, where `body` is itself JSON-encoded.
 *
 * Every failure mode — the CLI cannot be started, it exits unsuccessfully, its
 * output is not valid JSON, or the proxied status is not 200 — is logged and
 * reported as `null`; this function does not reject.
 *
 * @param url    Absolute URL the remote function should fetch.
 * @param region Region name selecting both the function and the CLI profile.
 * @returns The decoded `body` of the proxy response, or `null` on any failure.
 */
async function proxyRequestWithRegion(url: string, region: string): Promise<any | null> {
    try {
        // Spawning lives inside the try so a missing or unrunnable CLI also yields null.
        const result = await new Deno.Command("aliyun", {
            args: [
                "fc",
                "POST",
                `/2023-03-30/functions/proxy-${region}/invocations`,
                "--qualifier",
                "LATEST",
                "--header",
                "Content-Type=application/json;x-fc-invocation-type=Sync;x-fc-log-type=None;",
                "--body",
                JSON.stringify({ url: url }),
                "--profile",
                `CVSA-${region}`,
            ],
        }).output();
        const decoder = new TextDecoder();

        // Surface the CLI's own diagnostics instead of a JSON parse error on empty stdout.
        if (!result.success) {
            const detail = decoder.decode(result.stderr).trim();
            console.error(
                `Error proxying request ${url} to ${region}: aliyun CLI exited with code ${result.code}: ${detail}`,
            );
            return null;
        }

        const rawData = JSON.parse(decoder.decode(result.stdout));
        if (rawData.statusCode !== 200) {
            console.error(`Error proxying request ${url} to ${region} , statusCode: ${rawData.statusCode}`);
            return null;
        }
        // The remote function JSON-encodes its body, so decode one more level.
        return JSON.parse(rawData.body);
    } catch (e) {
        console.error(`Error proxying request ${url} to ${region}: ${e}`);
        return null;
    }
}