# Review notes (text before the first "diff --git" header is not part of the patch):
#
# * fetcher.py: FETCH_LIMIT and FETCH_THREADS fall back to the previous
#   hard-coded values (300 and 10) when the variable is unset or empty.
#   int(os.getenv(...)) raised TypeError whenever the variable was missing.
# * postprocess.py: segments are de-duplicated on the (chinese, english) pair
#   with a single filter, and both sides are written together or not at all.
#   Filtering each language independently could write one side of a segment
#   without the other, so the two output files stopped being line-aligned.
# * NOTE(review): this patch was recovered from a whitespace-collapsed copy.
#   Indentation and the position of blank context lines were reconstructed
#   from the hunk line counts -- verify against the working tree before
#   applying. The "index" ids are carried over from the original patch and do
#   not describe the amended post-image.
# * NOTE(review): pybloom_live's BloomFilter appears to raise IndexError once
#   more than `capacity` keys have been added -- confirm 1,000,000 is enough
#   for the corpus, or switch to ScalableBloomFilter.
diff --git a/translate/fetcher.py b/translate/fetcher.py
index 6be2641..b361525 100644
--- a/translate/fetcher.py
+++ b/translate/fetcher.py
@@ -3,11 +3,14 @@ import trafilatura
 import hashlib
 import re
 import os
+from dotenv import load_dotenv
 from trafilatura.readability_lxml import is_probably_readerable
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+load_dotenv()
+
 # 常量定义
-MAX_FETCH_LIMIT = 300 # 每次运行时获取的最大任务数量
+MAX_FETCH_LIMIT = int(os.getenv("FETCH_LIMIT") or 300)  # Max tasks fetched per run (default 300)
 
 # 数据库连接
 def connect_db(db_path):
@@ -112,7 +115,7 @@ def main():
     unfetched_urls = get_unfetched_urls(conn, MAX_FETCH_LIMIT)
     conn.close()
 
-    with ThreadPoolExecutor(max_workers=10) as executor:
+    with ThreadPoolExecutor(max_workers=int(os.getenv("FETCH_THREADS") or 10)) as executor:
         futures = [executor.submit(process_url, url, db_path, save_path) for url in unfetched_urls]
 
         for future in as_completed(futures):
diff --git a/translate/postprocess.py b/translate/postprocess.py
index bf5f1e9..a60cd4a 100644
--- a/translate/postprocess.py
+++ b/translate/postprocess.py
@@ -1,5 +1,6 @@
 import os
 import json
+from pybloom_live import BloomFilter
 
 def read_converted_files(filename):
     """读取converted.txt文件,返回一个包含已处理文件名的集合"""
@@ -16,6 +17,7 @@ def write_converted_file(filename, file_name):
 def process_json_files(directory, converted_filename):
     """处理指定目录下的所有json文件"""
     converted_files = read_converted_files(converted_filename)
+    seen_pairs = BloomFilter(capacity=1000000, error_rate=0.001)  # One filter keyed on the (chinese, english) pair
 
     for filename in os.listdir(directory):
         if filename.endswith('.json') and filename not in converted_files:
@@ -30,7 +32,12 @@ def process_json_files(directory, converted_filename):
                         chinese_text = segment.get('chinese', '').replace('\n', ' ')
                         english_text = segment.get('english', '').replace('\n', ' ')
 
-                        source_file.write(chinese_text + '\n')
-                        target_file.write(english_text + '\n')
+                        # Newlines were replaced in both texts above, so '\n' is an unambiguous separator.
+                        pair_key = chinese_text + '\n' + english_text
+                        if pair_key not in seen_pairs:
+                            seen_pairs.add(pair_key)
+                            # Write both sides or neither so the two files stay line-aligned.
+                            source_file.write(chinese_text + '\n')
+                            target_file.write(english_text + '\n')
 
             write_converted_file(converted_filename, filename)