update: evaluation
This commit is contained in:
parent 435faa4b92
commit 01597c298d

translate/synthesis/extract.py (new file, 26 lines)
@@ -0,0 +1,26 @@
+from openai import OpenAI
+client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
+text='''互联
+虎脸
+互怜
+糊脸对猴
+互联工程
+互联互通
+湖莲潭
+互联网
+互联网安全
+互联网编程
+互联网产品
+互联网出版管理暂行规定
+互联网创业
+互联网大会
+互联网等信息网络传播视听节目管理办法
+互联网电脑
+互联网服务
+互联网公司'''
+messages = [
+    {"role": "system", "content": "用户会给出若干中文短语或词汇,每行一个。你需要从中抽取出**不重复**的中文**词汇**并输出,每行一个。**注意,你不应该输出其它任何内容**"},
+    {"role": "user", "content": text},
+]
+response = client.chat.completions.create(model='deepseek-v2',messages=messages,temperature=1.0)
+print(response.choices[0].message.content)
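
The system prompt above asks the model to return only unique terms, one per line, so de-duplicating the returned lines on the client side is a cheap safeguard against the model repeating itself. A minimal sketch, reusing the response object from this script (the loop itself is not part of the commit):

    seen = set()
    for line in response.choices[0].message.content.splitlines():
        term = line.strip()
        if term and term not in seen:
            seen.add(term)
            print(term)
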
@@ -1,14 +1,15 @@
 import subprocess
+from tqdm import tqdm
 
 def translate_text(text):
     command = f'argos-translate --from zh --to en "{text}"'
     result = subprocess.run(command, shell=True, capture_output=True, text=True)
     return result.stdout.strip()
 
-with open("src.txt", "r") as f:
+with open("./data/src.txt", "r") as f:
     src_lines = f.readlines()
 
-for line in src_lines:
+for line in tqdm(src_lines):
     result = translate_text(line)
-    with open("hyp-ag.txt", 'a') as f:
+    with open("./data/hyp-sk-1.2.txt", 'a') as f:
         f.write(result + '\n')
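
Note that the command here is built with an f-string and run with shell=True, so a source line containing a double quote would break the shell command. A sketch of the same call without a shell, assuming the argos-translate CLI accepts the identical flags when given an argument list:

    command = ["argos-translate", "--from", "zh", "--to", "en", text]
    result = subprocess.run(command, capture_output=True, text=True)
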
@@ -1,10 +1,10 @@
 from googletrans import Translator
 translator = Translator()
 
-with open("src.txt", "r") as f:
+with open("./data/src.txt", "r") as f:
     src_lines = f.readlines()
 
 for line in src_lines:
     result = translator.translate(line, dest='en')
-    with open("hyp-gg-py.txt", 'a') as f:
+    with open("./data/hyp-gg-py.txt", 'a') as f:
         f.write(result.text + '\n')
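
As in the argos script, the hypothesis file is re-opened in append mode on every iteration, so re-running the script appends duplicate lines. A sketch that opens the file once and overwrites it, using the same path and calls as above:

    with open("./data/hyp-gg-py.txt", "w") as out:
        for line in src_lines:
            out.write(translator.translate(line, dest='en').text + '\n')
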

translate/validation/m2mTrans.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+from tqdm import tqdm
+from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
+
+def translate_text(text):
+    tokenizer.src_lang = "zh"
+    encoded_zh = tokenizer(text, return_tensors="pt")
+    generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
+    result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+    return result[0]
+
+with open("./data/src.txt", "r") as f:
+    src_lines = f.readlines()
+
+for line in tqdm(src_lines):
+    result = translate_text(line)
+    with open("./data/hyp-m2m.txt", 'a') as f:
+        f.write(result + '\n')
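
translate_text encodes and generates one sentence at a time; M2M100 also accepts a padded batch, which is usually faster on a GPU. A rough sketch with an illustrative batch size of 8 (not part of the commit):

    tokenizer.src_lang = "zh"
    encoded = tokenizer(src_lines[:8], return_tensors="pt", padding=True)
    generated = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id("en"))
    print(tokenizer.batch_decode(generated, skip_special_tokens=True))
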
@@ -28,7 +28,7 @@ def main(input_file, sample_size):
         chinese_text = item["chinese"]
         english_text = item["english"]
 
-        with open("src.txt", 'a') as srcf, open("ref.txt", 'a') as reff:
+        with open("./data/src.txt", 'a') as srcf, open("./data/ref.txt", 'a') as reff:
             srcf.write(chinese_text + '\n')
             reff.write(english_text + '\n')
 
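Since src.txt and ref.txt are written line by line in append mode, a quick check that the two files stayed aligned before scoring can catch interrupted or repeated runs. A minimal sketch using the paths from this commit:

    with open("./data/src.txt") as s, open("./data/ref.txt") as r:
        assert len(s.readlines()) == len(r.readlines()), "src/ref line counts differ"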