From 37d2507f101b158de802e55b7b411a9f9c23dcb6 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Mon, 7 Oct 2024 23:15:25 +0800
Subject: [PATCH] update: latest synthetic data script

---
 .gitignore                        |  4 +--
 translate/LLMtranslator.py        |  4 +--
 translate/postprocess.py          |  6 ++--
 translate/split_source.py         | 52 +++++++++++++++++++++++++++++++
 translate/validation/bleu_full.py | 42 +++++++++++++++++++++++++
 5 files changed, 101 insertions(+), 7 deletions(-)
 create mode 100644 translate/split_source.py
 create mode 100644 translate/validation/bleu_full.py

diff --git a/.gitignore b/.gitignore
index 2dcd07e..2922831 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,8 +9,8 @@ token_to_id.json
 __pycache__
 .env
 .env*
-translate/output
-translate/source
+translate/output*
+translate/source*
 translate/result
 *.db
 dataset/raw
diff --git a/translate/LLMtranslator.py b/translate/LLMtranslator.py
index eceff53..a066b24 100644
--- a/translate/LLMtranslator.py
+++ b/translate/LLMtranslator.py
@@ -105,8 +105,8 @@ def batch_process(input_dir, output_dir, num_threads=4):
 
 
 if __name__ == "__main__":
-    input_dir = "./source"
-    output_dir = "./output"
+    input_dir = "./source-new"
+    output_dir = "./output-new"
     batch_process(
         input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
     )
diff --git a/translate/postprocess.py b/translate/postprocess.py
index ee25fe0..cf996c6 100644
--- a/translate/postprocess.py
+++ b/translate/postprocess.py
@@ -27,8 +27,8 @@ def process_json_files(directory, converted_filename):
                 data = json.load(json_file)
                 segments = data.get('segments', [])
 
-            with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
-                 open('./result/target.txt', 'a', encoding='utf-8') as target_file:
+            with open('./result/source-new.txt', 'a', encoding='utf-8') as source_file, \
+                 open('./result/target-new.txt', 'a', encoding='utf-8') as target_file:
                 for segment in segments:
                     chinese_text = segment.get('chinese', '').replace('\n', ' ')
                     english_text = segment.get('english', '').replace('\n', ' ')
@@ -42,7 +42,7 @@ def process_json_files(directory, converted_filename):
         write_converted_file(converted_filename, filename)
 
 
 if __name__ == "__main__":
-    json_directory = './output'  # Replace with the path to your JSON directory
+    json_directory = './output-new'  # Replace with the path to your JSON directory
     converted_filename = './result/converted.txt'
     process_json_files(json_directory, converted_filename)
\ No newline at end of file
diff --git a/translate/split_source.py b/translate/split_source.py
new file mode 100644
index 0000000..b8016aa
--- /dev/null
+++ b/translate/split_source.py
@@ -0,0 +1,52 @@
+import os
+import re
+
+def split_content(content):
+    sentences = re.split(r'[。!?;.!?;]', content)
+    segments = []
+    current_segment = []
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_length = len(sentence)
+        if (len(current_segment) >= 25 or current_length + sentence_length > 1200):
+            segments.append(''.join(current_segment))
+            current_segment = []
+            current_length = 0
+
+        current_segment.append(sentence)
+        current_length += sentence_length
+
+    if current_segment:
+        segments.append(''.join(current_segment))
+
+    return segments
+
+def process_files_in_directory(directory):
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+
+        # Only process regular files; skip directories
+        if os.path.isfile(file_path):
+            with open(file_path, 'r', encoding='utf-8') as file:
+                content = file.read()
+
+            segments = split_content(content)
+
+            if len(segments) > 1:
+                # Remove the original file
+                os.remove(file_path)
+
+                # Save the split segments as new files
+                for i, segment in enumerate(segments):
+                    new_filename = f"{filename}_{i+1}"
= f"{filename}_{i+1}" + new_file_path = os.path.join(directory, new_filename) + + with open(new_file_path, 'w', encoding='utf-8') as new_file: + new_file.write(segment) + else: + print(f"文件 {filename} 不需要分割") + +# 指定目录 +directory = './source-new' +process_files_in_directory(directory) \ No newline at end of file diff --git a/translate/validation/bleu_full.py b/translate/validation/bleu_full.py new file mode 100644 index 0000000..cecdbc3 --- /dev/null +++ b/translate/validation/bleu_full.py @@ -0,0 +1,42 @@ +import json +import subprocess +import evaluate +from nltk.tokenize import word_tokenize +from tqdm import tqdm + +bleu_cal = evaluate.load("chrf") + +def translate_text(text): + command = f'argos-translate --from zh --to en "{text}"' + result = subprocess.run(command, shell=True, capture_output=True, text=True) + return result.stdout.strip() + +def main(): + # 读取数据集 + with open('./data/1.jsonl', 'r', encoding='utf-8') as f: + data = [json.loads(line) for line in f] + + translations = [] + references = [] + + # for entry in tqdm(data): + # chinese_sentence = entry['zh'] + # translated_sentence = translate_text(chinese_sentence) + # with open("./data/1-inf.txt", "a") as f: + # f.write(translated_sentence + "\n") + # translations.append(translated_sentence) + + with open("./data/1-inf.txt", 'r') as f: + translations = f.readlines() + + for entry in data: + english_sentence = entry['en'] + references.append([english_sentence]) + + + # 计算 BLEU 分数 + bleu = bleu_cal.compute(predictions=translations, references=references) + print(bleu) + +if __name__ == "__main__": + main() \ No newline at end of file