update: latest synthetic data script

alikia2x (寒寒) 2024-10-07 23:15:25 +08:00
parent 33754146c8
commit 37d2507f10
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
5 changed files with 101 additions and 7 deletions

.gitignore vendored

@@ -9,8 +9,8 @@ token_to_id.json
 __pycache__
 .env
 .env*
-translate/output
-translate/source
+translate/output*
+translate/source*
 translate/result
 *.db
 dataset/raw


@@ -105,8 +105,8 @@ def batch_process(input_dir, output_dir, num_threads=4):
 if __name__ == "__main__":
-    input_dir = "./source"
-    output_dir = "./output"
+    input_dir = "./source-new"
+    output_dir = "./output-new"
     batch_process(
         input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
     )
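Note that the entry point converts os.getenv("TRANSLATE_THREADS") directly with int(), which raises a TypeError when the variable is unset. A minimal defensive sketch, assuming the same variable name (the fallback of 4 threads is an assumption, not part of this commit):

import os

# Hypothetical variant: fall back to 4 worker threads when
# TRANSLATE_THREADS is not set (the default of 4 is an assumption).
num_threads = int(os.getenv("TRANSLATE_THREADS", "4"))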


@@ -27,8 +27,8 @@ def process_json_files(directory, converted_filename):
             data = json.load(json_file)
             segments = data.get('segments', [])
-            with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
-                 open('./result/target.txt', 'a', encoding='utf-8') as target_file:
+            with open('./result/source-new.txt', 'a', encoding='utf-8') as source_file, \
+                 open('./result/target-new.txt', 'a', encoding='utf-8') as target_file:
                 for segment in segments:
                     chinese_text = segment.get('chinese', '').replace('\n', ' ')
                     english_text = segment.get('english', '').replace('\n', ' ')
@@ -42,7 +42,7 @@ def process_json_files(directory, converted_filename):
     write_converted_file(converted_filename, filename)
 if __name__ == "__main__":
-    json_directory = './output'  # Replace with the path to your JSON file directory
+    json_directory = './output-new'  # Replace with the path to your JSON file directory
     converted_filename = './result/converted.txt'
     process_json_files(json_directory, converted_filename)
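Judging from the fields read in process_json_files, each translated JSON file carries a top-level "segments" list whose items hold parallel "chinese"/"english" strings. A hypothetical minimal example of that shape (field names come from the code; the sentences are made up):

import json

# Hypothetical example of the input shape expected by process_json_files;
# the field names are taken from the code, the sample text is invented.
example = {
    "segments": [
        {"chinese": "你好,世界。", "english": "Hello, world."},
        {"chinese": "这是一个例子。", "english": "This is an example."},
    ]
}
print(json.dumps(example, ensure_ascii=False, indent=2))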

translate/split_source.py Normal file

@@ -0,0 +1,52 @@
import os
import re


def split_content(content):
    # Split on Chinese and ASCII sentence-ending punctuation.
    # Note: re.split drops the delimiters, so the punctuation itself
    # is not preserved in the rejoined segments.
    sentences = re.split(r'[。!?;.!?;]', content)
    segments = []
    current_segment = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence)
        # Start a new segment after 25 sentences or roughly 1200 characters.
        if len(current_segment) >= 25 or current_length + sentence_length > 1200:
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0
        current_segment.append(sentence)
        current_length += sentence_length
    if current_segment:
        segments.append(''.join(current_segment))
    return segments


def process_files_in_directory(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Only process regular files, skip directories
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            segments = split_content(content)
            if len(segments) > 1:
                # Delete the original file
                os.remove(file_path)
                # Save the split segments as separate files
                for i, segment in enumerate(segments):
                    new_filename = f"{filename}_{i+1}"
                    new_file_path = os.path.join(directory, new_filename)
                    with open(new_file_path, 'w', encoding='utf-8') as new_file:
                        new_file.write(segment)
            else:
                print(f"File {filename} does not need splitting")


# Target directory
directory = './source-new'
process_files_in_directory(directory)
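For context, a standalone sketch of the sentence-splitting step above (the sample string is made up); it shows that re.split discards the punctuation and can yield a trailing empty string:

import re

# Made-up sample mixing Chinese and ASCII sentence delimiters.
sample = "第一句。第二句!Third sentence. Fourth sentence? 第五句;"
print(re.split(r'[。!?;.!?;]', sample))
# ['第一句', '第二句', 'Third sentence', ' Fourth sentence', ' 第五句', '']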


@@ -0,0 +1,42 @@
import json
import subprocess

import evaluate
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Despite the variable name, this loads the chrF metric from the
# evaluate library, not BLEU.
bleu_cal = evaluate.load("chrf")


def translate_text(text):
    command = f'argos-translate --from zh --to en "{text}"'
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    return result.stdout.strip()


def main():
    # Read the dataset
    with open('./data/1.jsonl', 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    translations = []
    references = []
    # for entry in tqdm(data):
    #     chinese_sentence = entry['zh']
    #     translated_sentence = translate_text(chinese_sentence)
    #     with open("./data/1-inf.txt", "a") as f:
    #         f.write(translated_sentence + "\n")
    #     translations.append(translated_sentence)
    with open("./data/1-inf.txt", 'r') as f:
        translations = f.readlines()
    for entry in data:
        english_sentence = entry['en']
        references.append([english_sentence])
    # Compute the chrF score
    bleu = bleu_cal.compute(predictions=translations, references=references)
    print(bleu)


if __name__ == "__main__":
    main()
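As a quick sanity check of the metric call above, a minimal self-contained example of computing chrF with the evaluate library on made-up sentences; the exact keys in the returned dict depend on the evaluate version, but the main value is reported under "score":

import evaluate

# Toy check of the metric used above: predictions are plain strings and
# references are lists of strings, one list per prediction.
chrf = evaluate.load("chrf")
result = chrf.compute(
    predictions=["The cat sat on the mat."],
    references=[["The cat is sitting on the mat."]],
)
print(result)  # chrF value is reported under the "score" key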