update: fetcher and post-process
Move the fetcher's max thread count and fetch limit into environment variables. Update the post-process flow to remove duplicates.
This commit is contained in:
parent
9eeb3de828
commit
6f25183654
@ -3,11 +3,14 @@ import trafilatura
|
||||
import hashlib
|
||||
import re
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
from trafilatura.readability_lxml import is_probably_readerable
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# 常量定义
|
||||
MAX_FETCH_LIMIT = 300 # 每次运行时获取的最大任务数量
|
||||
MAX_FETCH_LIMIT = int(os.getenv("FETCH_LIMIT")) # 每次运行时获取的最大任务数量
|
||||
|
||||
# 数据库连接
|
||||
def connect_db(db_path):
|
||||
@ -112,7 +115,7 @@ def main():
|
||||
unfetched_urls = get_unfetched_urls(conn, MAX_FETCH_LIMIT)
|
||||
conn.close()
|
||||
|
||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||
with ThreadPoolExecutor(max_workers=int(os.getenv("FETCH_THREADS"))) as executor:
|
||||
futures = [executor.submit(process_url, url, db_path, save_path) for url in unfetched_urls]
|
||||
|
||||
for future in as_completed(futures):
|
||||
|
@ -1,5 +1,6 @@
|
||||
import os
|
||||
import json
|
||||
from pybloom_live import BloomFilter
|
||||
|
||||
def read_converted_files(filename):
|
||||
"""读取converted.txt文件,返回一个包含已处理文件名的集合"""
|
||||
@ -16,6 +17,8 @@ def write_converted_file(filename, file_name):
|
||||
def process_json_files(directory, converted_filename):
|
||||
"""处理指定目录下的所有json文件"""
|
||||
converted_files = read_converted_files(converted_filename)
|
||||
bloom_filter_chinese = BloomFilter(capacity=1000000, error_rate=0.001) # 初始化Bloom Filter
|
||||
bloom_filter_english = BloomFilter(capacity=1000000, error_rate=0.001) # 初始化Bloom Filter
|
||||
|
||||
for filename in os.listdir(directory):
|
||||
if filename.endswith('.json') and filename not in converted_files:
|
||||
@ -30,8 +33,13 @@ def process_json_files(directory, converted_filename):
|
||||
chinese_text = segment.get('chinese', '').replace('\n', ' ')
|
||||
english_text = segment.get('english', '').replace('\n', ' ')
|
||||
|
||||
source_file.write(chinese_text + '\n')
|
||||
target_file.write(english_text + '\n')
|
||||
if chinese_text not in bloom_filter_chinese:
|
||||
bloom_filter_chinese.add(chinese_text)
|
||||
source_file.write(chinese_text + '\n')
|
||||
|
||||
if english_text not in bloom_filter_english:
|
||||
bloom_filter_english.add(english_text)
|
||||
target_file.write(english_text + '\n')
|
||||
|
||||
write_converted_file(converted_filename, filename)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user