update: latest synthetic data script

parent 33754146c8, commit 37d2507f10

.gitignore (vendored, 4 changed lines)
@@ -9,8 +9,8 @@ token_to_id.json
 __pycache__
-.env
+.env*
-translate/output
-translate/source
+translate/output*
+translate/source*
 translate/result
 *.db
 dataset/raw
@@ -105,8 +105,8 @@ def batch_process(input_dir, output_dir, num_threads=4):


 if __name__ == "__main__":
-    input_dir = "./source"
-    output_dir = "./output"
+    input_dir = "./source-new"
+    output_dir = "./output-new"
     batch_process(
         input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
     )
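The call int(os.getenv("TRANSLATE_THREADS")) raises a TypeError whenever the variable is not exported, because os.getenv returns None. A minimal sketch of a safer lookup, assuming the function's own default of 4 threads is an acceptable fallback:

import os

# os.getenv returns None when TRANSLATE_THREADS is unset, and int(None)
# raises TypeError; a string default keeps the int() conversion safe
num_threads = int(os.getenv("TRANSLATE_THREADS", "4"))
print(num_threads)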
@@ -27,8 +27,8 @@ def process_json_files(directory, converted_filename):
         data = json.load(json_file)
         segments = data.get('segments', [])

-    with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
-         open('./result/target.txt', 'a', encoding='utf-8') as target_file:
+    with open('./result/source-new.txt', 'a', encoding='utf-8') as source_file, \
+         open('./result/target-new.txt', 'a', encoding='utf-8') as target_file:
         for segment in segments:
             chinese_text = segment.get('chinese', '').replace('\n', ' ')
             english_text = segment.get('english', '').replace('\n', ' ')
@@ -42,7 +42,7 @@ def process_json_files(directory, converted_filename):
     write_converted_file(converted_filename, filename)

 if __name__ == "__main__":
-    json_directory = './output'  # replace with the path to your JSON file directory
+    json_directory = './output-new'  # replace with the path to your JSON file directory
     converted_filename = './result/converted.txt'

     process_json_files(json_directory, converted_filename)
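For context, process_json_files reads each input file's segments list and pulls chinese/english fields from every segment. A hypothetical example of one input file's shape, inferred from the get calls above (the field names match the code; the sentences are made up, not from the repository):

import json

# hypothetical ./output-new/*.json structure inferred from the reads above
example = {
    "segments": [
        {"chinese": "你好,世界。", "english": "Hello, world."},
        {"chinese": "这是第二句。", "english": "This is the second sentence."},
    ]
}
print(json.dumps(example, ensure_ascii=False, indent=2))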
translate/split_source.py (new file, 52 lines)
@@ -0,0 +1,52 @@
import os
import re


def split_content(content):
    # split on Chinese and ASCII sentence-ending punctuation
    sentences = re.split(r'[。!?;.!?;]', content)
    segments = []
    current_segment = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence)
        if len(current_segment) >= 25 or current_length + sentence_length > 1200:
            segments.append(''.join(current_segment))
            current_segment = []
            current_length = 0

        current_segment.append(sentence)
        current_length += sentence_length

    if current_segment:
        segments.append(''.join(current_segment))

    return segments


def process_files_in_directory(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # only process regular files, skip directories
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            segments = split_content(content)

            if len(segments) > 1:
                # delete the original file
                os.remove(file_path)

                # save the split pieces
                for i, segment in enumerate(segments):
                    new_filename = f"{filename}_{i+1}"
                    new_file_path = os.path.join(directory, new_filename)

                    with open(new_file_path, 'w', encoding='utf-8') as new_file:
                        new_file.write(segment)
            else:
                print(f"File {filename} does not need splitting")


# target directory
directory = './source-new'
process_files_in_directory(directory)
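One side effect of re.split(r'[。!?;.!?;]', content) is that the matched punctuation is discarded, so the rejoined segments lose their sentence endings. If the downstream translation step cares about that, a variant that keeps each delimiter attached to its sentence could look like the sketch below (same 25-sentence and 1200-character limits; this is not part of the commit):

import re

def split_content_keep_punct(content, max_sentences=25, max_chars=1200):
    # a capturing group makes re.split return the delimiters too,
    # so each sentence can be rejoined with its own punctuation
    parts = re.split(r'([。!?;.!?;])', content)
    sentences = [a + b for a, b in zip(parts[0::2], parts[1::2] + ['']) if a or b]

    segments, current, length = [], [], 0
    for sentence in sentences:
        if len(current) >= max_sentences or length + len(sentence) > max_chars:
            segments.append(''.join(current))
            current, length = [], 0
        current.append(sentence)
        length += len(sentence)
    if current:
        segments.append(''.join(current))
    return segments

print(split_content_keep_punct("第一句。第二句!第三句?"))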
translate/validation/bleu_full.py (new file, 42 lines)
@@ -0,0 +1,42 @@
import json
import subprocess
import evaluate
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# note: despite the variable name, the metric loaded here is chrF
bleu_cal = evaluate.load("chrf")


def translate_text(text):
    command = f'argos-translate --from zh --to en "{text}"'
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    return result.stdout.strip()


def main():
    # read the dataset
    with open('./data/1.jsonl', 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]

    translations = []
    references = []

    # for entry in tqdm(data):
    #     chinese_sentence = entry['zh']
    #     translated_sentence = translate_text(chinese_sentence)
    #     with open("./data/1-inf.txt", "a") as f:
    #         f.write(translated_sentence + "\n")
    #     translations.append(translated_sentence)

    with open("./data/1-inf.txt", 'r') as f:
        translations = f.readlines()

    for entry in data:
        english_sentence = entry['en']
        references.append([english_sentence])

    # compute the score (chrF, as loaded above)
    bleu = bleu_cal.compute(predictions=translations, references=references)
    print(bleu)


if __name__ == "__main__":
    main()
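Two details worth flagging in this script: readlines() keeps the trailing newline on every prediction, and the metric actually loaded is chrF even though the variable is named bleu_cal. A sketch of computing both chrF and actual BLEU with the evaluate library, using toy data in place of the files above:

import evaluate

chrf = evaluate.load("chrf")
bleu = evaluate.load("bleu")

# toy stand-ins for ./data/1-inf.txt and ./data/1.jsonl; when reading
# predictions from the file, strip the newlines that readlines() keeps,
# e.g. [line.rstrip("\n") for line in f]
translations = ["Hello, world.", "This is the second sentence."]
references = [["Hello, world."], ["This is a second sentence."]]

print(chrf.compute(predictions=translations, references=references))
print(bleu.compute(predictions=translations, references=references))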