update: latest synthetic data script
parent 33754146c8
commit 37d2507f10

.gitignore (vendored, 4 changed lines)
@@ -9,8 +9,8 @@ token_to_id.json
 __pycache__
 .env
 .env*
-translate/output
-translate/source
+translate/output*
+translate/source*
 translate/result
 *.db
 dataset/raw

@@ -105,8 +105,8 @@ def batch_process(input_dir, output_dir, num_threads=4):


 if __name__ == "__main__":
-    input_dir = "./source"
-    output_dir = "./output"
+    input_dir = "./source-new"
+    output_dir = "./output-new"
     batch_process(
         input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
     )

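Note that int(os.getenv("TRANSLATE_THREADS")) raises a TypeError when the variable is unset. A minimal sketch of a guarded read, not part of this commit (the fallback of 4 mirrors the num_threads=4 default in batch_process):

import os

def get_thread_count(default=4):
    # Hypothetical helper: fall back to the default when TRANSLATE_THREADS
    # is missing or is not a valid integer.
    raw = os.getenv("TRANSLATE_THREADS")
    try:
        return int(raw) if raw is not None else default
    except ValueError:
        return default
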
@@ -27,8 +27,8 @@ def process_json_files(directory, converted_filename):
             data = json.load(json_file)
             segments = data.get('segments', [])

-            with open('./result/source.txt', 'a', encoding='utf-8') as source_file, \
-                 open('./result/target.txt', 'a', encoding='utf-8') as target_file:
+            with open('./result/source-new.txt', 'a', encoding='utf-8') as source_file, \
+                 open('./result/target-new.txt', 'a', encoding='utf-8') as target_file:
                 for segment in segments:
                     chinese_text = segment.get('chinese', '').replace('\n', ' ')
                     english_text = segment.get('english', '').replace('\n', ' ')
@@ -42,7 +42,7 @@ def process_json_files(directory, converted_filename):
         write_converted_file(converted_filename, filename)

 if __name__ == "__main__":
-    json_directory = './output'  # replace with the path to your JSON file directory
+    json_directory = './output-new'  # replace with the path to your JSON file directory
     converted_filename = './result/converted.txt'

     process_json_files(json_directory, converted_filename)

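For reference, process_json_files reads the segments / chinese / english keys, so each JSON file under ./output-new presumably looks roughly like the following (a hypothetical sample; the text values are invented):

sample = {
    "segments": [
        {"chinese": "你好,世界。", "english": "Hello, world."},
        {"chinese": "这是第二段。", "english": "This is the second segment."},
    ]
}
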
translate/split_source.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+import os
+import re
+
+def split_content(content):
+    sentences = re.split(r'[。!?;.!?;]', content)
+    segments = []
+    current_segment = []
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_length = len(sentence)
+        if (len(current_segment) >= 25 or current_length + sentence_length > 1200):
+            segments.append(''.join(current_segment))
+            current_segment = []
+            current_length = 0
+
+        current_segment.append(sentence)
+        current_length += sentence_length
+
+    if current_segment:
+        segments.append(''.join(current_segment))
+
+    return segments
+
+def process_files_in_directory(directory):
+    for filename in os.listdir(directory):
+        file_path = os.path.join(directory, filename)
+
+        # only process files, skip directories
+        if os.path.isfile(file_path):
+            with open(file_path, 'r', encoding='utf-8') as file:
+                content = file.read()
+
+            segments = split_content(content)
+
+            if len(segments) > 1:
+                # delete the original file
+                os.remove(file_path)
+
+                # save the split segments as new files
+                for i, segment in enumerate(segments):
+                    new_filename = f"{filename}_{i+1}"
+                    new_file_path = os.path.join(directory, new_filename)
+
+                    with open(new_file_path, 'w', encoding='utf-8') as new_file:
+                        new_file.write(segment)
+            else:
+                print(f"file {filename} does not need splitting")
+
+# target directory
+directory = './source-new'
+process_files_in_directory(directory)
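A rough usage sketch for split_content (the sample text is invented; note that split_source.py calls process_files_in_directory at import time, so in practice the function would be copied out or the module-level call guarded before importing it):

# Assumes split_content from split_source.py is available in the current module.
sample = "第一句。第二句!Third sentence? " * 40
chunks = split_content(sample)
# Each chunk holds at most 25 sentences and roughly 1200 characters.
print(len(chunks), max(len(c) for c in chunks))
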
translate/validation/bleu_full.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+import json
+import subprocess
+import evaluate
+from nltk.tokenize import word_tokenize
+from tqdm import tqdm
+
+bleu_cal = evaluate.load("chrf")
+
+def translate_text(text):
+    command = f'argos-translate --from zh --to en "{text}"'
+    result = subprocess.run(command, shell=True, capture_output=True, text=True)
+    return result.stdout.strip()
+
+def main():
+    # read the dataset
+    with open('./data/1.jsonl', 'r', encoding='utf-8') as f:
+        data = [json.loads(line) for line in f]
+
+    translations = []
+    references = []
+
+    # for entry in tqdm(data):
+    #     chinese_sentence = entry['zh']
+    #     translated_sentence = translate_text(chinese_sentence)
+    #     with open("./data/1-inf.txt", "a") as f:
+    #         f.write(translated_sentence + "\n")
+    #     translations.append(translated_sentence)
+
+    with open("./data/1-inf.txt", 'r') as f:
+        translations = f.readlines()
+
+    for entry in data:
+        english_sentence = entry['en']
+        references.append([english_sentence])
+
+
+    # compute the BLEU score
+    bleu = bleu_cal.compute(predictions=translations, references=references)
+    print(bleu)
+
+if __name__ == "__main__":
+    main()
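For context, evaluate.load("chrf") returns the chrF metric despite the bleu_cal name, so the printed score is chrF rather than BLEU. A tiny self-contained sketch of the same call (the example sentences are invented):

import evaluate

chrf = evaluate.load("chrf")
result = chrf.compute(
    predictions=["Hello, world."],      # hypothetical model output
    references=[["Hello world."]],      # one list of references per prediction
)
print(result["score"])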