add: content fetcher for translate

alikia2x (寒寒) 2024-09-15 23:43:01 +08:00
parent ebd1113a6e
commit 4c9f411f67
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
3 changed files with 155 additions and 13 deletions

translate/fetcher.py (new file, 125 additions)

@@ -0,0 +1,125 @@
import sqlite3
import trafilatura
import hashlib
import re
import os
from trafilatura.readability_lxml import is_probably_readerable
from concurrent.futures import ThreadPoolExecutor, as_completed

# Constants
MAX_FETCH_LIMIT = 300  # Maximum number of tasks to fetch per run


# Database connection
def connect_db(db_path):
    return sqlite3.connect(db_path)


# Create the fetch_list table
def create_fetch_list_table(conn):
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS fetch_list (
            url TEXT PRIMARY KEY,
            fetched_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    conn.commit()


# Get URLs that have not been fetched yet
def get_unfetched_urls(conn, limit):
    cursor = conn.cursor()
    cursor.execute("""
        SELECT url FROM url_list
        WHERE url NOT IN (SELECT url FROM fetch_list)
        LIMIT ?
    """, (limit,))
    return [row[0] for row in cursor.fetchall()]


# Download a page and extract its main content
def fetch_and_extract_content(url):
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return None
    html_string = downloaded
    if not is_probably_readerable(html_string):
        return None
    content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
    return content


# Compute the MD5 hash of a URL
def md5_hash(url):
    return hashlib.md5(url.encode()).hexdigest()


# Segmentation rules
def split_content(content):
    sentences = re.split(r'[。!?;.!?;]', content)
    segments = []
    current_segment = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)
        if (len(current_segment) >= 10 or current_length + sentence_length > 1500):
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0
        current_segment.append(sentence)
        current_length += sentence_length

    if current_segment:
        segments.append(''.join(current_segment))

    return segments


# Save segments to files
def save_segments(url, segments, path):
    url_hash = md5_hash(url)
    for idx, segment in enumerate(segments):
        save_path = os.path.join(path, f"{url_hash}_{idx}.txt")
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(segment)


# Record a fetched URL
def record_fetched_url(conn, url):
    cursor = conn.cursor()
    cursor.execute("""
        INSERT INTO fetch_list (url, fetched_time)
        VALUES (?, CURRENT_TIMESTAMP)
    """, (url,))
    conn.commit()


# Process a single URL
def process_url(url, db_path, save_path):
    conn = connect_db(db_path)
    content = fetch_and_extract_content(url)
    if content:
        segments = split_content(content)
        save_segments(url, segments, save_path)
        record_fetched_url(conn, url)
    conn.close()


# Main function
def main():
    db_path = "crawler.db"
    save_path = "./source"
    conn = connect_db(db_path)

    # Create the fetch_list table
    create_fetch_list_table(conn)

    unfetched_urls = get_unfetched_urls(conn, MAX_FETCH_LIMIT)
    conn.close()

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_url, url, db_path, save_path) for url in unfetched_urls]
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
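As a quick local check of what fetcher.py produces, the sketch below (not part of the commit) exercises md5_hash and split_content directly; it assumes it is run from the translate/ directory so that fetcher is importable, and the sample URL and text are invented.

# Sanity-check sketch for translate/fetcher.py; sample URL and text are invented.
from fetcher import md5_hash, split_content

sample_url = "https://example.com/article"
sample_text = "这是第一句。这是第二句。This is the third sentence. " * 20

segments = split_content(sample_text)
print(md5_hash(sample_url))              # prefix used by save_segments() for file names
print(len(segments), [len(s) for s in segments])
# Each segment holds at most 10 sentences or roughly 1500 characters and
# would be written to ./source as <md5>_<index>.txt by save_segments().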


@@ -3,6 +3,7 @@ from dotenv import load_dotenv
 import json
 import threading
 from openai import OpenAI
+from pathlib import Path

 load_dotenv()
@@ -14,6 +15,12 @@ client = OpenAI(
 system_prompt = """
 The user will provide some text. Please parse the text into segments, each segment contains 1 to 5 sentences. Translate each sentence into the corresponding language. If the input is in Chinese, return the English translation, and vice versa.
+IMPORTANT:
+1. Segment should not be too long, each segment should be under 100 English words or 180 Chinese characters.
+2. For segments or sentences that appear multiple times in the original text, they are only output **once** in the returned translation.
+3. **For content with obvious semantic differences, such as different components on a web page, no matter how short it is, it should be divided into a separate segment.**
+4. **Information such as web page headers, footers, and other fixed text, such as copyright notices, website or company names, and conventional link text (such as "About Us", "Privacy Policy", etc.) will be **ignored and not translated**
 EXAMPLE INPUT:
 法律之前人人平等并有权享受法律的平等保护不受任何歧视人人有权享受平等保护以免受违反本宣言的任何歧视行为以及煽动这种歧视的任何行为之害
@@ -47,11 +54,11 @@ def process_file(input_file, output_dir):
         translation = translate_text(text)
-        output_file = os.path.join(output_dir, os.path.basename(input_file) + '.json')
-        with open(output_file, 'w', encoding='utf-8') as f:
+        output_path = os.path.join(output_dir, Path(input_file).stem + ".json")
+        with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(translation, f, ensure_ascii=False, indent=4)
-        print(f"Processed {input_file} and saved to {output_file}")
+        print(f"Successfully translated and saved to {output_path}")
     except Exception as e:
         print(f"Error processing {input_file}: {e}")
@@ -60,7 +67,12 @@ def batch_process(input_dir, output_dir, num_threads=4):
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
+    input_files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
+    output_files = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))]
+    output_stems = {Path(f).stem for f in output_files}
+    files = [os.path.join(input_dir, f) for f in input_files if Path(f).stem not in output_stems]
     threads = []
     for file in files:
@@ -79,4 +91,4 @@ def batch_process(input_dir, output_dir, num_threads=4):
 if __name__ == "__main__":
     input_dir = "./source"
     output_dir = "./output"
-    batch_process(input_dir, output_dir)
+    batch_process(input_dir, output_dir, num_threads=64)
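The new skip logic in batch_process relies on Path(...).stem: because the output name is now derived from the stem, a source segment and its translation share a stem, and anything in ./output with a matching stem is not translated again. A tiny illustration with made-up file names:

# Illustration of the stem matching used above; the file names are made up.
from pathlib import Path

source_name = "0d6cae63ac95a4a3d296c12ee8ca7d19_0.txt"   # segment written by fetcher.py
output_name = "0d6cae63ac95a4a3d296c12ee8ca7d19_0.json"  # translation written by this script

print(Path(source_name).stem)                            # 0d6cae63ac95a4a3d296c12ee8ca7d19_0
print(Path(source_name).stem == Path(output_name).stem)  # True, so this segment is skipped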


@@ -1,9 +1,10 @@
+import os
 import requests
 from bs4 import BeautifulSoup
 import sqlite3
 import urllib.robotparser as urobot
 from urllib.parse import urljoin, urlparse
+from dotenv import load_dotenv

 MAX_RECURSION_DEPTH = 5
 MAX_URLS = 1000
@@ -124,11 +125,15 @@ def main(seed_url, rp, sitemap=None):
         save_url(sitemap_url)
     crawl(seed_url, rp=rp)
+
+def get_config(key):
+    return os.getenv(key)

 # Example usage
-# if __name__ == "__main__":
-#     seed_url = "https://www.bbc.co.uk/news"
-#     rp = urobot.RobotFileParser()
-#     rp.set_url("https://www.bbc.co.uk/robots.txt")
-#     rp.read()
-#     main(seed_url, rp, "https://www.bbc.co.uk/sitemap.xml")
-#     conn.close()
+if __name__ == "__main__":
+    load_dotenv()
+    seed_url = get_config("SEED_URL")
+    rp = urobot.RobotFileParser()
+    rp.set_url(get_config("ROBOTS_URL"))
+    rp.read()
+    main(seed_url, rp, get_config("SITEMAP_URL"))
+    conn.close()
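Since the crawler now reads its configuration through get_config(), a .env file along the following lines would reproduce the previously hard-coded example; the keys match the ones read above, and the BBC URLs are simply those from the removed comment block, used here only as illustrative values.

# .env (sketch; values are illustrative)
SEED_URL=https://www.bbc.co.uk/news
ROBOTS_URL=https://www.bbc.co.uk/robots.txt
SITEMAP_URL=https://www.bbc.co.uk/sitemap.xml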