update: fetching with cooldown

fix: post-process source/target line mismatch
improve: LLM-translate now requests with temperature
This commit is contained in:
alikia2x (寒寒) 2024-09-16 04:08:33 +08:00
parent 6f25183654
commit a9a7430a58
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
3 changed files with 8 additions and 4 deletions

View File

@ -48,6 +48,7 @@ def fetch_and_extract_content(url):
return None
content = trafilatura.extract(html_string, output_format="txt", url=url, favor_precision=True)
print(f"Successfully extracted text for URL: {url}")
return content
# 计算URL的MD5
@ -95,6 +96,9 @@ def record_fetched_url(conn, url):
# 处理单个URL的任务
def process_url(url, db_path, save_path):
import time, random
cooldown_base = float(os.getenv("FETCH_COOLDOWN"))
time.sleep(random.random() * cooldown_base)
conn = connect_db(db_path)
content = fetch_and_extract_content(url)
if content:
@ -102,6 +106,7 @@ def process_url(url, db_path, save_path):
save_segments(url, segments, save_path)
record_fetched_url(conn, url)
conn.close()
time.sleep(random.random() * cooldown_base)
# 主函数
def main():

View File

@ -43,7 +43,8 @@ def translate_text(text):
response = client.chat.completions.create(
model=os.getenv("TRANSLATION_MODEL"),
messages=messages,
response_format={'type': 'json_object'}
response_format={'type': 'json_object'},
temperature=float(os.getenv("TRANSLATION_TEMP"))
)
return json.loads(response.choices[0].message.content)

View File

@ -33,11 +33,9 @@ def process_json_files(directory, converted_filename):
chinese_text = segment.get('chinese', '').replace('\n', ' ')
english_text = segment.get('english', '').replace('\n', ' ')
if chinese_text not in bloom_filter_chinese:
if chinese_text not in bloom_filter_chinese and english_text not in bloom_filter_english:
bloom_filter_chinese.add(chinese_text)
source_file.write(chinese_text + '\n')
if english_text not in bloom_filter_english:
bloom_filter_english.add(english_text)
target_file.write(english_text + '\n')