From a9a7430a588abb22771cc3155611a8b7691fad6b Mon Sep 17 00:00:00 2001 From: alikia2x Date: Mon, 16 Sep 2024 04:08:33 +0800 Subject: [PATCH] update: fetching with cooldown fix: post-process unmatch improve: LLM-translate now request with temperature --- translate/fetcher.py | 5 +++++ translate/llm-translate.py | 3 ++- translate/postprocess.py | 4 +--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/translate/fetcher.py b/translate/fetcher.py index b361525..b1bb1dc 100644 --- a/translate/fetcher.py +++ b/translate/fetcher.py @@ -48,6 +48,7 @@ def fetch_and_extract_content(url): return None content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True) + print(f"Successfully extracted text for URL: {url}") return content # 计算URL的MD5 @@ -95,6 +96,9 @@ def record_fetched_url(conn, url): # 处理单个URL的任务 def process_url(url, db_path, save_path): + import time, random + cooldown_base = float(os.getenv("FETCH_COOLDOWN")) + time.sleep(random.random() * cooldown_base) conn = connect_db(db_path) content = fetch_and_extract_content(url) if content: @@ -102,6 +106,7 @@ def process_url(url, db_path, save_path): save_segments(url, segments, save_path) record_fetched_url(conn, url) conn.close() + time.sleep(random.random() * cooldown_base) # 主函数 def main(): diff --git a/translate/llm-translate.py b/translate/llm-translate.py index 3ccc235..5f9e9cb 100644 --- a/translate/llm-translate.py +++ b/translate/llm-translate.py @@ -43,7 +43,8 @@ def translate_text(text): response = client.chat.completions.create( model=os.getenv("TRANSLATION_MODEL"), messages=messages, - response_format={'type': 'json_object'} + response_format={'type': 'json_object'}, + temperature=float(os.getenv("TRANSLATION_TEMP")) ) return json.loads(response.choices[0].message.content) diff --git a/translate/postprocess.py b/translate/postprocess.py index a60cd4a..ee25fe0 100644 --- a/translate/postprocess.py +++ b/translate/postprocess.py @@ -33,11 +33,9 @@ def 
process_json_files(directory, converted_filename): chinese_text = segment.get('chinese', '').replace('\n', ' ') english_text = segment.get('english', '').replace('\n', ' ') - if chinese_text not in bloom_filter_chinese: + if chinese_text not in bloom_filter_chinese and english_text not in bloom_filter_english: bloom_filter_chinese.add(chinese_text) source_file.write(chinese_text + '\n') - - if english_text not in bloom_filter_english: bloom_filter_english.add(english_text) target_file.write(english_text + '\n')