update: fetcher and post-process
Move the max-threads and fetch-limit settings in the fetcher into env vars; update the post-process flow to remove duplicates.
parent 9eeb3de828
commit 6f25183654
@@ -3,11 +3,14 @@ import trafilatura
 import hashlib
 import re
 import os
+from dotenv import load_dotenv
 from trafilatura.readability_lxml import is_probably_readerable
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+load_dotenv()
+
 # Constants
-MAX_FETCH_LIMIT = 300  # maximum number of tasks to fetch per run
+MAX_FETCH_LIMIT = int(os.getenv("FETCH_LIMIT"))  # maximum number of tasks to fetch per run
 
 # Database connection
 def connect_db(db_path):
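One caveat with the new constant: os.getenv returns None when FETCH_LIMIT is unset, so int(os.getenv("FETCH_LIMIT")) raises a TypeError unless the .env file is present. A minimal defensive sketch, where the fallback of 300 is simply the value that was hard-coded before this commit:

import os
from dotenv import load_dotenv

load_dotenv()

# Fall back to the previously hard-coded limit when the env var is missing.
MAX_FETCH_LIMIT = int(os.getenv("FETCH_LIMIT", "300"))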
@@ -112,7 +115,7 @@ def main():
     unfetched_urls = get_unfetched_urls(conn, MAX_FETCH_LIMIT)
     conn.close()
 
-    with ThreadPoolExecutor(max_workers=10) as executor:
+    with ThreadPoolExecutor(max_workers=int(os.getenv("FETCH_THREADS"))) as executor:
         futures = [executor.submit(process_url, url, db_path, save_path) for url in unfetched_urls]
 
         for future in as_completed(futures):
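The hunk cuts off inside the as_completed loop, so the loop body is not shown. Whatever the original body does, the usual pattern is to call future.result() so that worker exceptions surface instead of vanishing. A self-contained sketch of that pattern; process_url here is a stand-in, not the fetcher's real implementation:

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_url(url):  # hypothetical stand-in for the fetcher's worker
    if not url.startswith("http"):
        raise ValueError(f"bad url: {url}")
    return url

urls = ["http://example.com", "bad-url"]
with ThreadPoolExecutor(max_workers=int(os.getenv("FETCH_THREADS", "10"))) as executor:
    futures = [executor.submit(process_url, url) for url in urls]
    for future in as_completed(futures):
        try:
            future.result()  # re-raises any exception thrown in the worker
        except Exception as exc:
            print(f"fetch task failed: {exc}")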
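The commit does not include the .env file itself; for the two variables it reads, one would look like this, with values matching the previously hard-coded constants:

FETCH_LIMIT=300
FETCH_THREADS=10

The second file in the commit is the post-process script: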
@@ -1,5 +1,6 @@
 import os
 import json
+from pybloom_live import BloomFilter
 
 def read_converted_files(filename):
     """Read converted.txt and return a set of file names that have already been processed"""
@@ -16,6 +17,8 @@ def write_converted_file(filename, file_name):
 def process_json_files(directory, converted_filename):
     """Process all JSON files in the given directory"""
     converted_files = read_converted_files(converted_filename)
+    bloom_filter_chinese = BloomFilter(capacity=1000000, error_rate=0.001)  # initialize the Bloom filter
+    bloom_filter_english = BloomFilter(capacity=1000000, error_rate=0.001)  # initialize the Bloom filter
 
     for filename in os.listdir(directory):
         if filename.endswith('.json') and filename not in converted_files:
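For readers unfamiliar with pybloom_live: BloomFilter(capacity=1000000, error_rate=0.001) sizes each filter for about one million entries at a ~0.1% false-positive rate, and membership uses the ordinary in operator. A self-contained sketch of the semantics:

from pybloom_live import BloomFilter

seen = BloomFilter(capacity=1000000, error_rate=0.001)

for text in ["你好", "hello", "你好"]:
    if text not in seen:  # may rarely yield a false positive, never a false negative
        seen.add(text)
        print(text)       # each distinct line prints once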
@@ -30,7 +33,12 @@ def process_json_files(directory, converted_filename):
                 chinese_text = segment.get('chinese', '').replace('\n', ' ')
                 english_text = segment.get('english', '').replace('\n', ' ')
 
-                source_file.write(chinese_text + '\n')
-                target_file.write(english_text + '\n')
+                if chinese_text not in bloom_filter_chinese:
+                    bloom_filter_chinese.add(chinese_text)
+                    source_file.write(chinese_text + '\n')
+
+                if english_text not in bloom_filter_english:
+                    bloom_filter_english.add(english_text)
+                    target_file.write(english_text + '\n')
 
                 write_converted_file(converted_filename, filename)
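A note on the tradeoff this hunk makes: at the configured error rate, roughly one fresh segment in a thousand can be mistaken for a duplicate and silently dropped, in exchange for bounded memory. If the corpus fits in RAM and exact deduplication matters more than footprint, a plain set works as a drop-in alternative; a sketch, not part of the commit:

def dedup_exact(lines):
    """Exact in-memory dedup: no false positives, but memory grows with the corpus."""
    seen = set()
    for line in lines:
        if line not in seen:
            seen.add(line)
            yield line

print(list(dedup_exact(["a", "b", "a"])))  # ['a', 'b']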