update: fetcher and post-process

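Move the fetcher's maximum thread count and fetch limit into environment variables (FETCH_THREADS and FETCH_LIMIT, loaded via dotenv).
Update the post-process flow to skip duplicate Chinese and English lines using Bloom filters.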
This commit is contained in:
alikia2x (寒寒) 2024-09-16 00:59:58 +08:00
parent 9eeb3de828
commit 6f25183654
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
2 changed files with 15 additions and 4 deletions

View File

@ -3,11 +3,14 @@ import trafilatura
import hashlib
import re
import os
from dotenv import load_dotenv
from trafilatura.readability_lxml import is_probably_readerable
from concurrent.futures import ThreadPoolExecutor, as_completed
load_dotenv()
# 常量定义
MAX_FETCH_LIMIT = 300 # 每次运行时获取的最大任务数量
MAX_FETCH_LIMIT = int(os.getenv("FETCH_LIMIT")) # 每次运行时获取的最大任务数量
# 数据库连接
def connect_db(db_path):
@ -112,7 +115,7 @@ def main():
unfetched_urls = get_unfetched_urls(conn, MAX_FETCH_LIMIT)
conn.close()
with ThreadPoolExecutor(max_workers=10) as executor:
with ThreadPoolExecutor(max_workers=int(os.getenv("FETCH_THREADS"))) as executor:
futures = [executor.submit(process_url, url, db_path, save_path) for url in unfetched_urls]
for future in as_completed(futures):

View File

@ -1,5 +1,6 @@
import os
import json
from pybloom_live import BloomFilter
def read_converted_files(filename):
"""读取converted.txt文件返回一个包含已处理文件名的集合"""
@ -16,6 +17,8 @@ def write_converted_file(filename, file_name):
def process_json_files(directory, converted_filename):
"""处理指定目录下的所有json文件"""
converted_files = read_converted_files(converted_filename)
bloom_filter_chinese = BloomFilter(capacity=1000000, error_rate=0.001) # 初始化Bloom Filter
bloom_filter_english = BloomFilter(capacity=1000000, error_rate=0.001) # 初始化Bloom Filter
for filename in os.listdir(directory):
if filename.endswith('.json') and filename not in converted_files:
@ -30,8 +33,13 @@ def process_json_files(directory, converted_filename):
chinese_text = segment.get('chinese', '').replace('\n', ' ')
english_text = segment.get('english', '').replace('\n', ' ')
source_file.write(chinese_text + '\n')
target_file.write(english_text + '\n')
if chinese_text not in bloom_filter_chinese:
bloom_filter_chinese.add(chinese_text)
source_file.write(chinese_text + '\n')
if english_text not in bloom_filter_english:
bloom_filter_english.add(english_text)
target_file.write(english_text + '\n')
write_converted_file(converted_filename, filename)