# sparkastML/translate/validation/bleu_full.py
#
# 42 lines
# 1.2 KiB
# Python

import json
import subprocess
import evaluate
from nltk.tokenize import word_tokenize
from tqdm import tqdm
# Metric object used by main() to score translations.
# NOTE: despite the "bleu" in the name, this loads the chrF metric
# ("chrf") from the `evaluate` library, not BLEU.
bleu_cal = evaluate.load("chrf")
def translate_text(text):
    """Translate Chinese text to English with the ``argos-translate`` CLI.

    The command is passed as an argument list and executed without a shell,
    so quotes, backticks, ``$`` or ``;`` inside *text* reach the translator
    verbatim instead of being interpreted (or executed) by the shell.

    Args:
        text: The Chinese source text to translate.

    Returns:
        The command's standard output with surrounding whitespace stripped.
        The exit status is not checked, so a failed run yields whatever
        (possibly empty) output the command produced.

    Raises:
        FileNotFoundError: If the ``argos-translate`` executable cannot be
            found on ``PATH``.
    """
    # One list element per argument: no shell parsing, no quoting issues.
    command = ["argos-translate", "--from", "zh", "--to", "en", text]
    result = subprocess.run(command, capture_output=True, text=True)
    return result.stdout.strip()
def main():
    """Score pre-generated zh->en translations against dataset references.

    Reads the parallel corpus from ``./data/1.jsonl`` (one JSON object per
    line; the ``en`` field of each object is used as the reference) and the
    system outputs from ``./data/1-inf.txt`` (one translation per line, in
    the same order as the dataset), then prints the result returned by the
    module-level ``bleu_cal`` metric (loaded as chrF).

    ``./data/1-inf.txt`` must already exist. It can be produced by calling
    ``translate_text`` on every entry's ``zh`` field and appending one
    result per line.

    Raises:
        ValueError: If the number of translations differs from the number
            of references, which means the two files are out of sync.
    """
    # Load the dataset; blank lines are skipped so a stray empty line in the
    # JSONL file does not crash json.loads.
    with open('./data/1.jsonl', 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Read the cached translations with an explicit encoding (the platform
    # default is not reliable). Only the line terminator is removed; empty
    # lines are kept so predictions stay aligned with the references.
    with open("./data/1-inf.txt", 'r', encoding='utf-8') as f:
        translations = [line.rstrip("\n") for line in f]

    # The metric takes a list of reference lists, one list per prediction.
    references = [[entry['en']] for entry in data]

    if len(translations) != len(references):
        raise ValueError(
            f"Got {len(translations)} translations but {len(references)} "
            "references; ./data/1-inf.txt and ./data/1.jsonl are out of sync."
        )

    # Compute the score (chrF, despite the "bleu" naming) and print it.
    score = bleu_cal.compute(predictions=translations, references=references)
    print(score)


if __name__ == "__main__":
    main()