add: translation evaluation

This commit is contained in:
alikia2x (寒寒) 2024-09-17 20:07:47 +08:00
parent 3bb222bda1
commit 6500e378be
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
7 changed files with 187 additions and 24 deletions

View File

@ -14,6 +14,16 @@ This model is optimized to be lightweight, ensuring it can run on a wide range o
For a detailed explanation of how it works, you can refer to [this blog post](https://blog.alikia2x.com/en/posts/sparkastml-intention/).
## Translation
Language barriers are one of the biggest obstacles to communication between civilizations. In modern times, with the development of computer science and artificial intelligence, machine translation is bridging this barrier and building a Tower of Babel.
Unfortunately, many machine translations are owned by commercial companies, which seriously hinders the development of freedom and innovation.
Therefore, sparkastML is on the road to challenge commercial machine translation. We decided to tackle the translation between Chinese and English first. These are two languages with a long history and a large number of users. Their writing methods and expression habits are very different, which brings challenges to the project.
For more details, you can view [this page](./translate/README.md).
## Dataset
To support the development of Libre Intelligence, we have made a series of datasets publicly available. You can access them [here](./dataset/public/README.md).

View File

@ -34,63 +34,79 @@ EXAMPLE JSON OUTPUT:
} }
""" """
def translate_text(text, client, model_name, temp):
    """Translate *text* with a chat model and return the parsed JSON reply.

    Args:
        text: Source text to translate.
        client: An OpenAI-compatible client instance.
        model_name: Model identifier passed to the completions API.
        temp: Sampling temperature.

    Returns:
        The model's reply parsed with ``json.loads`` (the module-level
        ``system_prompt`` instructs the model to answer in JSON, and
        ``response_format`` enforces a JSON object).

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        response_format={"type": "json_object"},
        temperature=temp,
    )
    return json.loads(response.choices[0].message.content)
def process_file(input_file, output_dir):
    """Translate one text file and save the JSON result into *output_dir*.

    The output filename reuses the input's stem with a ``.json`` suffix.
    Any exception is reported and swallowed so a concurrent batch run
    keeps going past individual failures.
    """
    try:
        with open(input_file, "r", encoding="utf-8") as f:
            text = f.read()
        # Model name / temperature are read per call from the environment;
        # `client` is the module-level OpenAI client.
        model = os.getenv("TRANSLATION_MODEL")
        temp = float(os.getenv("TRANSLATION_TEMP"))
        translation = translate_text(text, client, model, temp)
        output_path = os.path.join(output_dir, Path(input_file).stem + ".json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(translation, f, ensure_ascii=False, indent=4)
        print(f"Successfully translated and saved to {output_path}")
    except Exception as e:
        print(f"Error processing {input_file}: {e}")
def batch_process(input_dir, output_dir, num_threads=4):
    """Translate every file in *input_dir* that has no output yet.

    A file is skipped when *output_dir* already contains a file with the
    same stem, which makes interrupted runs resumable. At most
    *num_threads* files are processed concurrently.
    """
    # Local import keeps the module's import block untouched.
    from concurrent.futures import ThreadPoolExecutor

    os.makedirs(output_dir, exist_ok=True)

    input_files = [
        f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))
    ]
    done_stems = {
        Path(f).stem
        for f in os.listdir(output_dir)
        if os.path.isfile(os.path.join(output_dir, f))
    }
    pending = [
        os.path.join(input_dir, f)
        for f in input_files
        if Path(f).stem not in done_stems
    ]

    # A pool keeps num_threads workers continuously busy. The previous
    # start-a-batch-then-join-all pattern stalled every batch on its
    # slowest thread before starting the next one.
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        for file in pending:
            pool.submit(process_file, file, output_dir)
if __name__ == "__main__":
    input_dir = "./source"
    output_dir = "./output"
    # Fall back to 4 threads when TRANSLATE_THREADS is unset; int(None)
    # would raise a TypeError.
    batch_process(
        input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS", "4"))
    )

14
translate/README.md Normal file
View File

@ -0,0 +1,14 @@
# sparkastML NMT
## News
sparkastML's first translation model has been released!
### Details
- Training time: 5 hours, 20k steps
- Training device: RTX 3080 (20GB)
- Corpus size: over 10 million sentences
- Validation Score: BLEU
[Model]

View File

@ -0,0 +1,43 @@
from openai import OpenAI
import argparse
import os
from dotenv import load_dotenv
def translate_text(text, client, model_name, temp):
    """Translate *text* into English via a chat completion.

    The system prompt forbids any output besides the translation itself;
    the raw message content is returned as-is (no JSON parsing).
    """
    system_instruction = (
        "User will provide some text. You need to translate the text into "
        "English and output it WITHOUT ANY ADDITIONAL INFORMATION OR "
        "EXPLANATION."
    )
    chat = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model=model_name,
        messages=chat,
        temperature=temp,
    )
    return completion.choices[0].message.content
# Script entry: translate each line of the input file to English and
# append the results to the output file.
load_dotenv()

parser = argparse.ArgumentParser()
parser.add_argument("input", type=str, help="Path to the input file")
parser.add_argument("output", type=str, help="Path to the output file")
args = parser.parse_args()

input_file = args.input
output_file = args.output

client = OpenAI(
    api_key=os.getenv("API_KEY"),
    base_url=os.getenv("BASE_URL"),
)
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))

with open(input_file, "r") as f:
    src_lines = f.readlines()

# Open the output once in append mode instead of re-opening it for every
# line; each translated line is still written as soon as it is produced.
with open(output_file, "a") as out:
    for line in src_lines:
        result = translate_text(line, client, model, temp)
        out.write(result + "\n")

View File

@ -0,0 +1,14 @@
import subprocess
def translate_text(text):
    """Translate *text* from Chinese to English via the argos-translate CLI.

    Passes the arguments as a list with the default ``shell=False`` so the
    input text cannot be interpreted by a shell. The original
    f-string + ``shell=True`` form was vulnerable to shell injection via
    quotes or metacharacters in *text*.
    """
    result = subprocess.run(
        ["argos-translate", "--from", "zh", "--to", "en", text],
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()
# Translate every line of src.txt and append the hypotheses to hyp-ag.txt.
with open("src.txt", "r") as f:
    src_lines = f.readlines()

# Open the output once in append mode rather than once per translated line.
with open("hyp-ag.txt", "a") as out:
    for line in src_lines:
        result = translate_text(line)
        out.write(result + "\n")

View File

@ -0,0 +1,10 @@
from googletrans import Translator

# Translate every line of src.txt with Google Translate and append the
# hypotheses to hyp-gg-py.txt.
translator = Translator()

with open("src.txt", "r") as f:
    src_lines = f.readlines()

# Open the output once in append mode rather than once per translated line.
with open("hyp-gg-py.txt", "a") as out:
    for line in src_lines:
        result = translator.translate(line, dest='en')
        out.write(result.text + '\n')

View File

@ -0,0 +1,56 @@
import json
import random
import argparse
from tqdm import tqdm
def read_jsonl(file_path):
    """Lazily yield one parsed JSON object per line of *file_path*."""
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            yield json.loads(raw_line)
def sample_lines(data, sample_size):
    """Return *sample_size* items drawn uniformly, without replacement.

    *data* may be any iterable; it is materialized before sampling.
    Raises ValueError when *sample_size* exceeds the number of items.
    """
    pool = list(data)
    return random.sample(pool, sample_size)
def main(input_file, sample_size):
    """Sample *sample_size* records from a JSONL parallel corpus.

    Each record must have "chinese" and "english" keys; the Chinese side
    is appended to src.txt and the English side to ref.txt, line-aligned.
    """
    data = read_jsonl(input_file)
    sampled_data = sample_lines(data, sample_size)
    # Open both outputs once (append mode) instead of re-opening them for
    # every sampled item inside the loop.
    with open("src.txt", 'a') as srcf, open("ref.txt", 'a') as reff:
        for item in tqdm(sampled_data):
            srcf.write(item["chinese"] + '\n')
            reff.write(item["english"] + '\n')
# Example invocation: python sample.py corpus.jsonl --sample_size 200
if __name__ == "__main__":
    # Build the command-line interface.
    arg_parser = argparse.ArgumentParser(
        description="Process a JSONL file by sampling lines and translating text."
    )
    arg_parser.add_argument("input", type=str, help="Path to the input JSONL file")
    arg_parser.add_argument(
        "--sample_size",
        type=int,
        default=100,
        help="Number of lines to sample (default: 100)",
    )
    cli_args = arg_parser.parse_args()
    # Hand off to the sampling routine.
    main(cli_args.input, cli_args.sample_size)