update: fetcher, translator

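Increase the split thresholds in the fetcher (max sentences per segment 10 → 12, max segment length 1500 → 1800).
Improve the prompt for the LLM translator (add a rule about restoring missing punctuation), and read the model name and thread count from environment variables.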
This commit is contained in:
alikia2x (寒寒) 2024-09-16 00:48:07 +08:00
parent 7021687e10
commit 9eeb3de828
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
2 changed files with 4 additions and 3 deletions

View File

@ -60,7 +60,7 @@ def split_content(content):
for sentence in sentences: for sentence in sentences:
sentence_length = len(sentence) sentence_length = len(sentence)
if (len(current_segment) >= 10 or current_length + sentence_length > 1500): if (len(current_segment) >= 12 or current_length + sentence_length > 1800):
segments.append(''.join(current_segment)) segments.append(''.join(current_segment))
current_segment = [] current_segment = []
current_length = 0 current_length = 0

View File

@ -20,6 +20,7 @@ IMPORTANT:
2. For segments or sentences that appear multiple times in the original text, they are only output **once** in the returned translation. 2. For segments or sentences that appear multiple times in the original text, they are only output **once** in the returned translation.
3. **For content with obvious semantic differences, such as different components on a web page, no matter how short it is, it should be divided into a separate segment.** 3. **For content with obvious semantic differences, such as different components on a web page, no matter how short it is, it should be divided into a separate segment.**
4. **Information such as web page headers, footers, and other fixed text, such as copyright notices, website or company names, and conventional link text (such as "About Us", "Privacy Policy", etc.) will be **ignored and not translated** 4. **Information such as web page headers, footers, and other fixed text, such as copyright notices, website or company names, and conventional link text (such as "About Us", "Privacy Policy", etc.) will be **ignored and not translated**
5. If the provided text lacks proper punctuation, please add proper punctuation to both the source text and the translated text in the output.
EXAMPLE INPUT: EXAMPLE INPUT:
法律之前人人平等并有权享受法律的平等保护不受任何歧视人人有权享受平等保护以免受违反本宣言的任何歧视行为以及煽动这种歧视的任何行为之害 法律之前人人平等并有权享受法律的平等保护不受任何歧视人人有权享受平等保护以免受违反本宣言的任何歧视行为以及煽动这种歧视的任何行为之害
@ -40,7 +41,7 @@ def translate_text(text):
] ]
response = client.chat.completions.create( response = client.chat.completions.create(
model="deepseek-chat", model=os.getenv("TRANSLATION_MODEL"),
messages=messages, messages=messages,
response_format={'type': 'json_object'} response_format={'type': 'json_object'}
) )
@ -91,4 +92,4 @@ def batch_process(input_dir, output_dir, num_threads=4):
if __name__ == "__main__": if __name__ == "__main__":
input_dir = "./source" input_dir = "./source"
output_dir = "./output" output_dir = "./output"
batch_process(input_dir, output_dir, num_threads=64) batch_process(input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS")))