From 01597c298dab9179c6b9b602d0e7f890fff9e512 Mon Sep 17 00:00:00 2001
From: alikia2x
Date: Thu, 19 Sep 2024 22:03:54 +0800
Subject: [PATCH] update: evaluation

---
 translate/synthesis/extract.py      | 26 ++++++++++++++++++++++++++
 translate/validation/argoTrans.py   |  7 ++++---
 translate/validation/googleTrans.py |  4 ++--
 translate/validation/m2mTrans.py    | 19 +++++++++++++++++++
 translate/validation/preprocess.py  |  2 +-
 5 files changed, 52 insertions(+), 6 deletions(-)
 create mode 100644 translate/synthesis/extract.py
 create mode 100644 translate/validation/m2mTrans.py

diff --git a/translate/synthesis/extract.py b/translate/synthesis/extract.py
new file mode 100644
index 0000000..80fab9a
--- /dev/null
+++ b/translate/synthesis/extract.py
@@ -0,0 +1,26 @@
+from openai import OpenAI
+client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
+text='''互联
+虎脸
+互怜
+糊脸对猴
+互联工程
+互联互通
+湖莲潭
+互联网
+互联网安全
+互联网编程
+互联网产品
+互联网出版管理暂行规定
+互联网创业
+互联网大会
+互联网等信息网络传播视听节目管理办法
+互联网电脑
+互联网服务
+互联网公司'''
+messages = [
+    {"role": "system", "content": "用户会给出若干中文短语或词汇,每行一个。你需要从中抽取出**不重复**的中文**词汇**并输出,每行一个。**注意,你不应该输出其它任何内容**"},
+    {"role": "user", "content": text},
+]
+response = client.chat.completions.create(model='deepseek-v2',messages=messages,temperature=1.0)
+print(response.choices[0].message.content)
\ No newline at end of file
diff --git a/translate/validation/argoTrans.py b/translate/validation/argoTrans.py
index c70e7e1..f984b70 100644
--- a/translate/validation/argoTrans.py
+++ b/translate/validation/argoTrans.py
@@ -1,14 +1,15 @@
 import subprocess
+from tqdm import tqdm
 
 def translate_text(text):
     command = f'argos-translate --from zh --to en "{text}"'
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     return result.stdout.strip()
 
-with open("src.txt", "r") as f:
+with open("./data/src.txt", "r") as f:
     src_lines = f.readlines()
 
-for line in src_lines:
+for line in tqdm(src_lines):
     result = translate_text(line)
-    with open("hyp-ag.txt", 'a') as f:
+    with open("./data/hyp-sk-1.2.txt", 'a') as f:
         f.write(result + '\n')
\ No newline at end of file
diff --git a/translate/validation/googleTrans.py b/translate/validation/googleTrans.py
index be63a81..cb42e38 100644
--- a/translate/validation/googleTrans.py
+++ b/translate/validation/googleTrans.py
@@ -1,10 +1,10 @@
 from googletrans import Translator
 translator = Translator()
 
-with open("src.txt", "r") as f:
+with open("./data/src.txt", "r") as f:
     src_lines = f.readlines()
 
 for line in src_lines:
     result = translator.translate(line, dest='en')
-    with open("hyp-gg-py.txt", 'a') as f:
+    with open("./data/hyp-gg-py.txt", 'a') as f:
         f.write(result.text + '\n')
\ No newline at end of file
diff --git a/translate/validation/m2mTrans.py b/translate/validation/m2mTrans.py
new file mode 100644
index 0000000..0cfc07d
--- /dev/null
+++ b/translate/validation/m2mTrans.py
@@ -0,0 +1,19 @@
+from tqdm import tqdm
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+
+def translate_text(text):
+    tokenizer.src_lang = "zh"
+    encoded_zh = tokenizer(text, return_tensors="pt")
+    generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
+    result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+    return result[0]
+
+with open("./data/src.txt", "r") as f:
+    src_lines = f.readlines()
+
+for line in tqdm(src_lines):
+    result = translate_text(line)
+    with open("./data/hyp-m2m.txt", 'a') as f:
+        f.write(result + '\n')
\ No newline at end of file
diff --git a/translate/validation/preprocess.py b/translate/validation/preprocess.py
index 79aa725..fdc1b7b 100644
--- a/translate/validation/preprocess.py
+++ b/translate/validation/preprocess.py
@@ -28,7 +28,7 @@ def main(input_file, sample_size):
         chinese_text = item["chinese"]
         english_text = item["english"]
 
-        with open("src.txt", 'a') as srcf, open("ref.txt", 'a') as reff:
+        with open("./data/src.txt", 'a') as srcf, open("./data/ref.txt", 'a') as reff:
             srcf.write(chinese_text + '\n')
             reff.write(english_text + '\n')
 