update: fetching with cooldown
fix: post-process mismatch
improve: LLM-translate now requests with temperature
This commit is contained in:
parent
6f25183654
commit
a9a7430a58
@ -48,6 +48,7 @@ def fetch_and_extract_content(url):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
|
content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
|
||||||
|
print(f"Successfully extracted text for URL: {url}")
|
||||||
return content
|
return content
|
||||||
|
|
||||||
# 计算URL的MD5
|
# 计算URL的MD5
|
||||||
@ -95,6 +96,9 @@ def record_fetched_url(conn, url):
|
|||||||
|
|
||||||
# 处理单个URL的任务
|
# 处理单个URL的任务
|
||||||
def process_url(url, db_path, save_path):
|
def process_url(url, db_path, save_path):
|
||||||
|
import time, random
|
||||||
|
cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
|
||||||
|
time.sleep(random.random() * cooldown_base)
|
||||||
conn = connect_db(db_path)
|
conn = connect_db(db_path)
|
||||||
content = fetch_and_extract_content(url)
|
content = fetch_and_extract_content(url)
|
||||||
if content:
|
if content:
|
||||||
@ -102,6 +106,7 @@ def process_url(url, db_path, save_path):
|
|||||||
save_segments(url, segments, save_path)
|
save_segments(url, segments, save_path)
|
||||||
record_fetched_url(conn, url)
|
record_fetched_url(conn, url)
|
||||||
conn.close()
|
conn.close()
|
||||||
|
time.sleep(random.random() * cooldown_base)
|
||||||
|
|
||||||
# 主函数
|
# 主函数
|
||||||
def main():
|
def main():
|
||||||
|
@ -43,7 +43,8 @@ def translate_text(text):
|
|||||||
response = client.chat.completions.create(
|
response = client.chat.completions.create(
|
||||||
model=os.getenv("TRANSLATION_MODEL"),
|
model=os.getenv("TRANSLATION_MODEL"),
|
||||||
messages=messages,
|
messages=messages,
|
||||||
response_format={'type': 'json_object'}
|
response_format={'type': 'json_object'},
|
||||||
|
temperature=float(os.getenv("TRANSLATION_TEMP"))
|
||||||
)
|
)
|
||||||
|
|
||||||
return json.loads(response.choices[0].message.content)
|
return json.loads(response.choices[0].message.content)
|
||||||
|
@ -33,11 +33,9 @@ def process_json_files(directory, converted_filename):
|
|||||||
chinese_text = segment.get('chinese', '').replace('\n', ' ')
|
chinese_text = segment.get('chinese', '').replace('\n', ' ')
|
||||||
english_text = segment.get('english', '').replace('\n', ' ')
|
english_text = segment.get('english', '').replace('\n', ' ')
|
||||||
|
|
||||||
if chinese_text not in bloom_filter_chinese:
|
if chinese_text not in bloom_filter_chinese and english_text not in bloom_filter_english:
|
||||||
bloom_filter_chinese.add(chinese_text)
|
bloom_filter_chinese.add(chinese_text)
|
||||||
source_file.write(chinese_text + '\n')
|
source_file.write(chinese_text + '\n')
|
||||||
|
|
||||||
if english_text not in bloom_filter_english:
|
|
||||||
bloom_filter_english.add(english_text)
|
bloom_filter_english.add(english_text)
|
||||||
target_file.write(english_text + '\n')
|
target_file.write(english_text + '\n')
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user