From 37d2507f101b158de802e55b7b411a9f9c23dcb6 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Mon, 7 Oct 2024 23:15:25 +0800
Subject: [PATCH] update: latest synthetic data script

---
 .gitignore                        |  4 +--
 translate/LLMtranslator.py        |  4 +--
 translate/postprocess.py          |  6 ++--
 translate/split_source.py         | 52 +++++++++++++++++++++++++++++++
 translate/validation/bleu_full.py | 42 +++++++++++++++++++++++++
 5 files changed, 101 insertions(+), 7 deletions(-)
 create mode 100644 translate/split_source.py
 create mode 100644 translate/validation/bleu_full.py

diff --git a/.gitignore b/.gitignore
index 2dcd07e..2922831 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,8 +9,8 @@ token_to_id.json
 __pycache__
 .env
 .env*
-translate/output
-translate/source
+translate/output*
+translate/source*
 translate/result
 *.db
 dataset/raw
diff --git a/translate/LLMtranslator.py b/translate/LLMtranslator.py
index eceff53..a066b24 100644
--- a/translate/LLMtranslator.py
+++ b/translate/LLMtranslator.py
@@ -105,8 +105,8 @@ def batch_process(input_dir, output_dir, num_threads=4):
 
 
 if __name__ == "__main__":
-    input_dir = "./source"
-    output_dir = "./output"
+    input_dir = "./source-new"
+    output_dir = "./output-new"
     batch_process(
         input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
     )
diff --git a/translate/postprocess.py b/translate/postprocess.py
index ee25fe0..cf996c6 100644
--- a/translate/postprocess.py
+++ b/translate/postprocess.py
@@ -27,8 +27,8 @@ def process_json_files(directory, converted_filename):
                 data = json.load(json_file)
                 segments = data.get('segments', [])
 
-            with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
-                 open('./result/target.txt', 'a', encoding='utf-8') as target_file:
+            with open('./result/source-new.txt', 'a', encoding='utf-8') as source_file, \
+                 open('./result/target-new.txt', 'a', encoding='utf-8') as target_file:
                 for segment in segments:
                     chinese_text = segment.get('chinese', '').replace('\n', ' ')
                     english_text = segment.get('english', '').replace('\n', ' ')
@@ -42,7 +42,7 @@ def process_json_files(directory, converted_filename):
         write_converted_file(converted_filename, filename)
 
 
 if __name__ == "__main__":
-    json_directory = './output'  # Replace with the path to your JSON directory
+    json_directory = './output-new'  # Replace with the path to your JSON directory
     converted_filename = './result/converted.txt'
     process_json_files(json_directory, converted_filename)
\ No newline at end of file
diff --git a/translate/split_source.py b/translate/split_source.py
new file mode 100644
index 0000000..b8016aa
--- /dev/null
+++ b/translate/split_source.py
@@ -0,0 +1,52 @@
+import os
+import re
+
+def split_content(content):
+    sentences = re.split(r'[。!?;.!?;]', content)
+    segments = []
+    current_segment = []
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_length = len(sentence)
+        if (len(current_segment) >= 25 or current_length + sentence_length > 1200):
+            segments.append(''.join(current_segment))
+            current_segment = []
+            current_length = 0
+
+        current_segment.append(sentence)
+        current_length += sentence_length
+
+    if current_segment:
+        segments.append(''.join(current_segment))
+
+    return segments
+
+def process_files_in_directory(directory):
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+
+        # Only process regular files; skip directories
+        if os.path.isfile(file_path):
+            with open(file_path, 'r', encoding='utf-8') as file:
+                content = file.read()
+
+            segments = split_content(content)
+
+            if len(segments) > 1:
+                # Remove the original file
+                os.remove(file_path)
+
+                # Save the split segments as new files
+                for i, segment in enumerate(segments):
+                    new_filename = f"{filename}_{i+1}"
= f"{filename}_{i+1}" + new_file_path = os.path.join(directory, new_filename) + + with open(new_file_path, 'w', encoding='utf-8') as new_file: + new_file.write(segment) + else: + print(f"文件 {filename} 不需要分割") + +# 指定目录 +directory = './source-new' +process_files_in_directory(directory) \ No newline at end of file diff --git a/translate/validation/bleu_full.py b/translate/validation/bleu_full.py new file mode 100644 index 0000000..cecdbc3 --- /dev/null +++ b/translate/validation/bleu_full.py @@ -0,0 +1,42 @@ +import json +import subprocess +import evaluate +from nltk.tokenize import word_tokenize +from tqdm import tqdm + +bleu_cal = evaluate.load("chrf") + +def translate_text(text): + command = f'argos-translate --from zh --to en "{text}"' + result = subprocess.run(command, shell=True, capture_output=True, text=True) + return result.stdout.strip() + +def main(): + # 读取数据集 + with open('./data/1.jsonl', 'r', encoding='utf-8') as f: + data = [json.loads(line) for line in f] + + translations = [] + references = [] + + # for entry in tqdm(data): + # chinese_sentence = entry['zh'] + # translated_sentence = translate_text(chinese_sentence) + # with open("./data/1-inf.txt", "a") as f: + # f.write(translated_sentence + "\n") + # translations.append(translated_sentence) + + with open("./data/1-inf.txt", 'r') as f: + translations = f.readlines() + + for entry in data: + english_sentence = entry['en'] + references.append([english_sentence]) + + + # 计算 BLEU 分数 + bleu = bleu_cal.compute(predictions=translations, references=references) + print(bleu) + +if __name__ == "__main__": + main() \ No newline at end of file