update: fetcher, translator

Increase the split thresholds in the fetcher and improve the prompt for the LLM translator.
2024-09-16 00:48:07 +08:00 · 2024-09-16 00:48:07 +08:00 · 9eeb3de828
commit 9eeb3de828
parent 7021687e10
2 changed files with 4 additions and 3 deletions
--- a/translate/fetcher.py
+++ b/translate/fetcher.py
@ -60,7 +60,7 @@ def split_content(content):
    for sentence in sentences:
        sentence_length = len(sentence)
-        if (len(current_segment) >= 10 or current_length + sentence_length > 1500):
+        if (len(current_segment) >= 12 or current_length + sentence_length > 1800):
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0
--- a/translate/llm-translate.py
+++ b/translate/llm-translate.py
@ -20,6 +20,7 @@ IMPORTANT:
 2. For segments or sentences that appear multiple times in the original text, they are only output **once** in the returned translation.
 3. **For content with obvious semantic differences, such as different components on a web page, no matter how short it is, it should be divided into a separate segment.**
 4. **Information such as web page headers, footers, and other fixed text, such as copyright notices, website or company names, and conventional link text (such as "About Us", "Privacy Policy", etc.) will be **ignored and not translated**
 5. If the provided text lacks proper punctuation, please add proper punctuation to both the source text and the translated text in the output.
 EXAMPLE INPUT: 
 法律之前人人平等，并有权享受法律的平等保护，不受任何歧视。人人有权享受平等保护，以免受违反本宣言的任何歧视行为以及煽动这种歧视的任何行为之害。
@ -40,7 +41,7 @@ def translate_text(text):
    ]
    response = client.chat.completions.create(
-        model="deepseek-chat",
+        model=os.getenv("TRANSLATION_MODEL"),
        messages=messages,
        response_format={'type': 'json_object'}
    )
@ -91,4 +92,4 @@ def batch_process(input_dir, output_dir, num_threads=4):
 if __name__ == "__main__":
    input_dir = "./source"
    output_dir = "./output"
-    batch_process(input_dir, output_dir, num_threads=64)
+    batch_process(input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS")))