From 01597c298dab9179c6b9b602d0e7f890fff9e512 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Thu, 19 Sep 2024 22:03:54 +0800
Subject: [PATCH] update: evaluation

---
 translate/synthesis/extract.py      | 26 ++++++++++++++++++++++++++
 translate/validation/argoTrans.py   |  7 ++++---
 translate/validation/googleTrans.py |  4 ++--
 translate/validation/m2mTrans.py    | 19 +++++++++++++++++++
 translate/validation/preprocess.py  |  2 +-
 5 files changed, 52 insertions(+), 6 deletions(-)
 create mode 100644 translate/synthesis/extract.py
 create mode 100644 translate/validation/m2mTrans.py

diff --git a/translate/synthesis/extract.py b/translate/synthesis/extract.py
new file mode 100644
index 0000000..80fab9a
--- /dev/null
+++ b/translate/synthesis/extract.py
@@ -0,0 +1,26 @@
+from openai import OpenAI
+client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
+text='''互联
+虎脸
+互怜
+糊脸对猴
+互联工程
+互联互通
+湖莲潭
+互联网
+互联网安全
+互联网编程
+互联网产品
+互联网出版管理暂行规定
+互联网创业
+互联网大会
+互联网等信息网络传播视听节目管理办法
+互联网电脑
+互联网服务
+互联网公司'''
+messages = [
+    {"role": "system", "content": "用户会给出若干中文短语或词汇,每行一个。你需要从中抽取出**不重复**的中文**词汇**并输出,每行一个。**注意,你不应该输出其它任何内容**"},
+    {"role": "user", "content": text},
+]
+response = client.chat.completions.create(model='deepseek-v2',messages=messages,temperature=1.0)
+print(response.choices[0].message.content)
\ No newline at end of file
diff --git a/translate/validation/argoTrans.py b/translate/validation/argoTrans.py
index c70e7e1..f984b70 100644
--- a/translate/validation/argoTrans.py
+++ b/translate/validation/argoTrans.py
@@ -1,14 +1,15 @@
 import subprocess
+from tqdm import tqdm
 
 def translate_text(text):
     command = f'argos-translate --from zh --to en "{text}"'
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     return result.stdout.strip()
 
-with open("src.txt", "r") as f:
+with open("./data/src.txt", "r") as f:
     src_lines = f.readlines()
 
-for line in src_lines:
+for line in tqdm(src_lines):
     result = translate_text(line)
-    with open("hyp-ag.txt", 'a') as f:
+    with open("./data/hyp-sk-1.2.txt", 'a') as f:
         f.write(result + '\n')
\ No newline at end of file
diff --git a/translate/validation/googleTrans.py b/translate/validation/googleTrans.py
index be63a81..cb42e38 100644
--- a/translate/validation/googleTrans.py
+++ b/translate/validation/googleTrans.py
@@ -1,10 +1,10 @@
 from googletrans import Translator
 translator = Translator()
 
-with open("src.txt", "r") as f:
+with open("./data/src.txt", "r") as f:
     src_lines = f.readlines()
 
 for line in src_lines:
     result = translator.translate(line, dest='en')
-    with open("hyp-gg-py.txt", 'a') as f:
+    with open("./data/hyp-gg-py.txt", 'a') as f:
         f.write(result.text + '\n')
\ No newline at end of file
diff --git a/translate/validation/m2mTrans.py b/translate/validation/m2mTrans.py
new file mode 100644
index 0000000..0cfc07d
--- /dev/null
+++ b/translate/validation/m2mTrans.py
@@ -0,0 +1,19 @@
+from tqdm import tqdm
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+
+def translate_text(text):
+    tokenizer.src_lang = "zh"
+    encoded_zh = tokenizer(text, return_tensors="pt")
+    generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
+    result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+    return result[0]
+
+with open("./data/src.txt", "r") as f:
+    src_lines = f.readlines()
+
+for line in tqdm(src_lines):
+    result = translate_text(line)
+    with open("./data/hyp-m2m.txt", 'a') as f:
+        f.write(result + '\n')
\ No newline at end of file
diff --git a/translate/validation/preprocess.py b/translate/validation/preprocess.py
index 79aa725..fdc1b7b 100644
--- a/translate/validation/preprocess.py
+++ b/translate/validation/preprocess.py
@@ -28,7 +28,7 @@ def main(input_file, sample_size):
         chinese_text = item["chinese"]
         english_text = item["english"]
 
-        with open("src.txt", 'a') as srcf, open("ref.txt", 'a') as reff:
+        with open("./data/src.txt", 'a') as srcf, open("./data/ref.txt", 'a') as reff:
             srcf.write(chinese_text + '\n')
             reff.write(english_text + '\n')
 