update: latest synthetic data script

parent 33754146c8, commit 37d2507f10

.gitignore (vendored, 4 changed lines)
@@ -9,8 +9,8 @@ token_to_id.json
 __pycache__
-.env
+.env*
-translate/output
-translate/source
+translate/output*
+translate/source*
 translate/result
 *.db
 dataset/raw
@@ -105,8 +105,8 @@ def batch_process(input_dir, output_dir, num_threads=4):


 if __name__ == "__main__":
-    input_dir = "./source"
-    output_dir = "./output"
+    input_dir = "./source-new"
+    output_dir = "./output-new"
     batch_process(
         input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
     )
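The call int(os.getenv("TRANSLATE_THREADS")) raises a TypeError whenever the variable is not exported, because os.getenv returns None. A minimal sketch of a safer lookup, assuming the function's own default of 4 threads is an acceptable fallback:

import os

# os.getenv returns None when TRANSLATE_THREADS is unset, and int(None)
# raises TypeError; a string default keeps the int() conversion safe
num_threads = int(os.getenv("TRANSLATE_THREADS", "4"))
print(num_threads)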
@@ -27,8 +27,8 @@ def process_json_files(directory, converted_filename):
         data = json.load(json_file)
         segments = data.get('segments', [])

-    with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
-         open('./result/target.txt', 'a', encoding='utf-8') as target_file:
+    with open('./result/source-new.txt', 'a', encoding='utf-8') as source_file, \
+         open('./result/target-new.txt', 'a', encoding='utf-8') as target_file:
         for segment in segments:
             chinese_text = segment.get('chinese', '').replace('\n', ' ')
             english_text = segment.get('english', '').replace('\n', ' ')
@@ -42,7 +42,7 @@ def process_json_files(directory, converted_filename):
     write_converted_file(converted_filename, filename)

 if __name__ == "__main__":
-    json_directory = './output'  # replace with the path to your JSON file directory
+    json_directory = './output-new'  # replace with the path to your JSON file directory
     converted_filename = './result/converted.txt'

     process_json_files(json_directory, converted_filename)
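For context, process_json_files reads each input file's segments list and pulls chinese/english fields from every segment. A hypothetical example of one input file's shape, inferred from the get calls above (the field names match the code; the sentences are made up, not from the repository):

import json

# hypothetical ./output-new/*.json structure inferred from the reads above
example = {
    "segments": [
        {"chinese": "你好,世界。", "english": "Hello, world."},
        {"chinese": "这是第二句。", "english": "This is the second sentence."},
    ]
}
print(json.dumps(example, ensure_ascii=False, indent=2))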
translate/split_source.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import os
import re


def split_content(content):
    # split on Chinese and ASCII sentence-ending punctuation
    sentences = re.split(r'[。!?;.!?;]', content)
    segments = []
    current_segment = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)
        if len(current_segment) >= 25 or current_length + sentence_length > 1200:
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0

        current_segment.append(sentence)
        current_length += sentence_length

    if current_segment:
        segments.append(''.join(current_segment))

    return segments


def process_files_in_directory(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # only process regular files, skip directories
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            segments = split_content(content)

            if len(segments) > 1:
                # delete the original file
                os.remove(file_path)

                # save the split pieces
                for i, segment in enumerate(segments):
                    new_filename = f"{filename}_{i+1}"
                    new_file_path = os.path.join(directory, new_filename)

                    with open(new_file_path, 'w', encoding='utf-8') as new_file:
                        new_file.write(segment)
            else:
                print(f"File {filename} does not need splitting")


# target directory
directory = './source-new'
process_files_in_directory(directory)
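One side effect of re.split(r'[。!?;.!?;]', content) is that the matched punctuation is discarded, so the rejoined segments lose their sentence endings. If the downstream translation step cares about that, a variant that keeps each delimiter attached to its sentence could look like the sketch below (same 25-sentence and 1200-character limits; this is not part of the commit):

import re

def split_content_keep_punct(content, max_sentences=25, max_chars=1200):
    # a capturing group makes re.split return the delimiters too,
    # so each sentence can be rejoined with its own punctuation
    parts = re.split(r'([。!?;.!?;])', content)
    sentences = [a + b for a, b in zip(parts[0::2], parts[1::2] + ['']) if a or b]

    segments, current, length = [], [], 0
    for sentence in sentences:
        if len(current) >= max_sentences or length + len(sentence) > max_chars:
            segments.append(''.join(current))
            current, length = [], 0
        current.append(sentence)
        length += len(sentence)
    if current:
        segments.append(''.join(current))
    return segments

print(split_content_keep_punct("第一句。第二句!第三句?"))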
translate/validation/bleu_full.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import json
import subprocess
import evaluate
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# note: despite the variable name, the metric loaded here is chrF
bleu_cal = evaluate.load("chrf")


def translate_text(text):
    command = f'argos-translate --from zh --to en "{text}"'
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    return result.stdout.strip()


def main():
    # read the dataset
    with open('./data/1.jsonl', 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    translations = []
    references = []

    # for entry in tqdm(data):
    #     chinese_sentence = entry['zh']
    #     translated_sentence = translate_text(chinese_sentence)
    #     with open("./data/1-inf.txt", "a") as f:
    #         f.write(translated_sentence + "\n")
    #     translations.append(translated_sentence)

    with open("./data/1-inf.txt", 'r') as f:
        translations = f.readlines()

    for entry in data:
        english_sentence = entry['en']
        references.append([english_sentence])

    # compute the score (chrF, as loaded above)
    bleu = bleu_cal.compute(predictions=translations, references=references)
    print(bleu)


if __name__ == "__main__":
    main()
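Two details worth flagging in this script: readlines() keeps the trailing newline on every prediction, and the metric actually loaded is chrF even though the variable is named bleu_cal. A sketch of computing both chrF and actual BLEU with the evaluate library, using toy data in place of the files above:

import evaluate

chrf = evaluate.load("chrf")
bleu = evaluate.load("bleu")

# toy stand-ins for ./data/1-inf.txt and ./data/1.jsonl; when reading
# predictions from the file, strip the newlines that readlines() keeps,
# e.g. [line.rstrip("\n") for line in f]
translations = ["Hello, world.", "This is the second sentence."]
references = [["Hello, world."], ["This is a second sentence."]]

print(chrf.compute(predictions=translations, references=references))
print(bleu.compute(predictions=translations, references=references))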