update: latest synthetic data script

alikia2x (寒寒) 2024-10-07 23:15:25 +08:00
parent 33754146c8
commit 37d2507f10
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
5 changed files with 101 additions and 7 deletions

.gitignore vendored

@@ -9,8 +9,8 @@ token_to_id.json
 __pycache__
 .env
 .env*
-translate/output
-translate/source
+translate/output*
+translate/source*
 translate/result
 *.db
 dataset/raw


@@ -105,8 +105,8 @@ def batch_process(input_dir, output_dir, num_threads=4):
 if __name__ == "__main__":
-    input_dir = "./source"
-    output_dir = "./output"
+    input_dir = "./source-new"
+    output_dir = "./output-new"
     batch_process(
         input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
     )
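Note that the entry point converts os.getenv("TRANSLATE_THREADS") directly with int(), which raises a TypeError when the variable is unset. A minimal defensive sketch, assuming the same variable name (the fallback of 4 threads is an assumption, not part of this commit):

import os

# Hypothetical variant: fall back to 4 worker threads when
# TRANSLATE_THREADS is not set (the default of 4 is an assumption).
num_threads = int(os.getenv("TRANSLATE_THREADS", "4"))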


@@ -27,8 +27,8 @@ def process_json_files(directory, converted_filename):
             data = json.load(json_file)
             segments = data.get('segments', [])
-            with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
-                 open('./result/target.txt', 'a', encoding='utf-8') as target_file:
+            with open('./result/source-new.txt', 'a', encoding='utf-8') as source_file, \
+                 open('./result/target-new.txt', 'a', encoding='utf-8') as target_file:
                 for segment in segments:
                     chinese_text = segment.get('chinese', '').replace('\n', ' ')
                     english_text = segment.get('english', '').replace('\n', ' ')
@@ -42,7 +42,7 @@ def process_json_files(directory, converted_filename):
     write_converted_file(converted_filename, filename)
 if __name__ == "__main__":
-    json_directory = './output'  # Replace with the path to your JSON file directory
+    json_directory = './output-new'  # Replace with the path to your JSON file directory
     converted_filename = './result/converted.txt'
     process_json_files(json_directory, converted_filename)
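Judging from the fields read in process_json_files, each translated JSON file carries a top-level "segments" list whose items hold parallel "chinese"/"english" strings. A hypothetical minimal example of that shape (field names come from the code; the sentences are made up):

import json

# Hypothetical example of the input shape expected by process_json_files;
# the field names are taken from the code, the sample text is invented.
example = {
    "segments": [
        {"chinese": "你好,世界。", "english": "Hello, world."},
        {"chinese": "这是一个例子。", "english": "This is an example."},
    ]
}
print(json.dumps(example, ensure_ascii=False, indent=2))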

translate/split_source.py Normal file

@@ -0,0 +1,52 @@
import os
import re


def split_content(content):
    # Split on Chinese and ASCII sentence-ending punctuation.
    # Note: re.split drops the delimiters, so the punctuation itself
    # is not preserved in the rejoined segments.
    sentences = re.split(r'[。!?;.!?;]', content)
    segments = []
    current_segment = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence)
        # Start a new segment after 25 sentences or roughly 1200 characters.
        if len(current_segment) >= 25 or current_length + sentence_length > 1200:
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0
        current_segment.append(sentence)
        current_length += sentence_length
    if current_segment:
        segments.append(''.join(current_segment))
    return segments


def process_files_in_directory(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # Only process regular files, skip directories
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
            segments = split_content(content)
            if len(segments) > 1:
                # Delete the original file
                os.remove(file_path)
                # Save the split segments as separate files
                for i, segment in enumerate(segments):
                    new_filename = f"{filename}_{i+1}"
                    new_file_path = os.path.join(directory, new_filename)
                    with open(new_file_path, 'w', encoding='utf-8') as new_file:
                        new_file.write(segment)
            else:
                print(f"File {filename} does not need splitting")


# Target directory
directory = './source-new'
process_files_in_directory(directory)
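For context, a standalone sketch of the sentence-splitting step above (the sample string is made up); it shows that re.split discards the punctuation and can yield a trailing empty string:

import re

# Made-up sample mixing Chinese and ASCII sentence delimiters.
sample = "第一句。第二句!Third sentence. Fourth sentence? 第五句;"
print(re.split(r'[。!?;.!?;]', sample))
# ['第一句', '第二句', 'Third sentence', ' Fourth sentence', ' 第五句', '']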


@@ -0,0 +1,42 @@
import json
import subprocess

import evaluate
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Despite the variable name, this loads the chrF metric from the
# evaluate library, not BLEU.
bleu_cal = evaluate.load("chrf")


def translate_text(text):
    command = f'argos-translate --from zh --to en "{text}"'
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    return result.stdout.strip()


def main():
    # Read the dataset
    with open('./data/1.jsonl', 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    translations = []
    references = []
    # for entry in tqdm(data):
    #     chinese_sentence = entry['zh']
    #     translated_sentence = translate_text(chinese_sentence)
    #     with open("./data/1-inf.txt", "a") as f:
    #         f.write(translated_sentence + "\n")
    #     translations.append(translated_sentence)
    with open("./data/1-inf.txt", 'r') as f:
        translations = f.readlines()
    for entry in data:
        english_sentence = entry['en']
        references.append([english_sentence])
    # Compute the chrF score
    bleu = bleu_cal.compute(predictions=translations, references=references)
    print(bleu)


if __name__ == "__main__":
    main()
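As a quick sanity check of the metric call above, a minimal self-contained example of computing chrF with the evaluate library on made-up sentences; the exact keys in the returned dict depend on the evaluate version, but the main value is reported under "score":

import evaluate

# Toy check of the metric used above: predictions are plain strings and
# references are lists of strings, one list per prediction.
chrf = evaluate.load("chrf")
result = chrf.compute(
    predictions=["The cat sat on the mat."],
    references=[["The cat is sitting on the mat."]],
)
print(result)  # chrF value is reported under the "score" key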