add: translation evaluation
This commit is contained in:
parent
3bb222bda1
commit
6500e378be
10
README.md
10
README.md
@ -14,6 +14,16 @@ This model is optimized to be lightweight, ensuring it can run on a wide range o
|
||||
|
||||
For a detailed explanation of how it works, you can refer to [this blog post](https://blog.alikia2x.com/en/posts/sparkastml-intention/).
|
||||
|
||||
## Translation
|
||||
|
||||
Language barriers are one of the biggest obstacles to communication between civilizations. In modern times, with the development of computer science and artificial intelligence, machine translation is bridging this barrier and building a Tower of Babel.
|
||||
|
||||
Unfortunately, many machine translations are owned by commercial companies, which seriously hinders the development of freedom and innovation.
|
||||
|
||||
Therefore, sparkastML is on the road to challenge commercial machine translation. We decided to tackle the translation between Chinese and English first. These are two languages with a long history and a large number of users. Their writing methods and expression habits are very different, which brings challenges to the project.
|
||||
|
||||
For more details, you can view [this page](./translate/README.md).
|
||||
|
||||
## Dataset
|
||||
|
||||
To support the development of Libre Intelligence, we have made a series of datasets publicly available. You can access them [here](./dataset/public/README.md).
|
||||
|
@ -34,30 +34,34 @@ EXAMPLE JSON OUTPUT:
|
||||
}
|
||||
"""
|
||||
|
||||
def translate_text(text):
|
||||
|
||||
def translate_text(text, client, model_name, temp):
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": text}
|
||||
{"role": "user", "content": text},
|
||||
]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=os.getenv("TRANSLATION_MODEL"),
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
response_format={'type': 'json_object'},
|
||||
temperature=float(os.getenv("TRANSLATION_TEMP"))
|
||||
response_format={"type": "json_object"},
|
||||
temperature=temp,
|
||||
)
|
||||
|
||||
return json.loads(response.choices[0].message.content)
|
||||
|
||||
|
||||
def process_file(input_file, output_dir):
|
||||
try:
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
text = f.read()
|
||||
|
||||
translation = translate_text(text)
|
||||
model = os.getenv("TRANSLATION_MODEL")
|
||||
temp = float(os.getenv("TRANSLATION_TEMP"))
|
||||
translation = translate_text(text, client, model, temp)
|
||||
|
||||
output_path = os.path.join(output_dir, Path(input_file).stem + ".json")
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(translation, f, ensure_ascii=False, indent=4)
|
||||
|
||||
print(f"Successfully translated and saved to {output_path}")
|
||||
@ -65,16 +69,25 @@ def process_file(input_file, output_dir):
|
||||
except Exception as e:
|
||||
print(f"Error processing {input_file}: {e}")
|
||||
|
||||
|
||||
def batch_process(input_dir, output_dir, num_threads=4):
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
input_files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
|
||||
output_files = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))]
|
||||
input_files = [
|
||||
f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))
|
||||
]
|
||||
output_files = [
|
||||
f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))
|
||||
]
|
||||
|
||||
output_stems = {Path(f).stem for f in output_files}
|
||||
|
||||
files = [os.path.join(input_dir, f) for f in input_files if Path(f).stem not in output_stems]
|
||||
files = [
|
||||
os.path.join(input_dir, f)
|
||||
for f in input_files
|
||||
if Path(f).stem not in output_stems
|
||||
]
|
||||
|
||||
threads = []
|
||||
for file in files:
|
||||
@ -90,7 +103,10 @@ def batch_process(input_dir, output_dir, num_threads=4):
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
input_dir = "./source"
|
||||
output_dir = "./output"
|
||||
batch_process(input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS")))
|
||||
batch_process(
|
||||
input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
|
||||
)
|
14
translate/README.md
Normal file
14
translate/README.md
Normal file
@ -0,0 +1,14 @@
|
||||
# sparkastML NMT
|
||||
|
||||
## News
|
||||
|
||||
sparkastML's first translation model has been released!
|
||||
|
||||
### Details
|
||||
|
||||
- Training time: 5 hours, 20k steps
|
||||
- Training device: RTX 3080 (20GB)
|
||||
- Corpus size: over 10 million sentences
|
||||
- Validation Score: BLEU
|
||||
|
||||
[Model]
|
43
translate/validation/LLMtrans.py
Normal file
43
translate/validation/LLMtrans.py
Normal file
@ -0,0 +1,43 @@
|
||||
from openai import OpenAI
|
||||
import argparse
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
def translate_text(text, client, model_name, temp):
    """Translate *text* into English through an OpenAI-compatible chat API.

    Args:
        text: Source text to translate.
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model_name: Model identifier to request.
        temp: Sampling temperature for the completion.

    Returns:
        The model's reply content (the translation) as a plain string.
    """
    system_msg = {
        "role": "system",
        "content": "User will provide some text. You need to translate the text into English and output it WITHOUT ANY ADDITIONAL INFORMATION OR EXPLANATION.",
    }
    user_msg = {"role": "user", "content": text}

    completion = client.chat.completions.create(
        model=model_name,
        messages=[system_msg, user_msg],
        temperature=temp,
    )

    return completion.choices[0].message.content
|
||||
|
||||
# --- Script entry: LLM baseline for translation evaluation. ---
# Reads one source sentence per line from `input`, appends one English
# translation per line to `output`. Credentials and model settings come from
# the environment (.env via python-dotenv): API_KEY, BASE_URL,
# TRANSLATION_MODEL, TRANSLATION_TEMP.
load_dotenv()

parser = argparse.ArgumentParser()
parser.add_argument("input", type=str, help="Path to the input file")
parser.add_argument("output", type=str, help="Path to the output file")
args = parser.parse_args()

input_file = args.input
output_file = args.output
client = OpenAI(
    api_key=os.getenv("API_KEY"),
    base_url=os.getenv("BASE_URL"),
)
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))

with open(input_file, "r") as f:
    src_lines = f.readlines()

# Open the output ONCE in append mode (append keeps interrupted runs
# resumable) instead of re-opening the file for every line; flush after each
# translation so partial progress survives a crash mid-run.
with open(output_file, "a") as out:
    for line in src_lines:
        out.write(translate_text(line, client, model, temp) + "\n")
        out.flush()
|
14
translate/validation/argoTrans.py
Normal file
14
translate/validation/argoTrans.py
Normal file
@ -0,0 +1,14 @@
|
||||
import subprocess
|
||||
|
||||
def translate_text(text):
    """Translate *text* from Chinese to English via the argos-translate CLI.

    Returns:
        The translated text from stdout with surrounding whitespace stripped.
        A run that produces no stdout (e.g. CLI error) yields "".
    """
    # Pass argv as a list with shell=False. The original form interpolated
    # `text` into a shell string with shell=True, so quotes, backticks, `$`,
    # or `;` in the input were interpreted by the shell — a command-injection
    # hole and a source of silently broken translations.
    command = ["argos-translate", "--from", "zh", "--to", "en", text]
    result = subprocess.run(command, shell=False, capture_output=True, text=True)
    return result.stdout.strip()
|
||||
|
||||
# Batch-translate src.txt (one sentence per line) into hyp-ag.txt,
# one hypothesis per line, aligned with the source file.
with open("src.txt", "r") as f:
    src_lines = f.readlines()

# Open the output ONCE in append mode (append keeps interrupted runs
# resumable) instead of re-opening the file for every line.
with open("hyp-ag.txt", "a") as out:
    for line in src_lines:
        out.write(translate_text(line) + "\n")
|
10
translate/validation/googleTrans.py
Normal file
10
translate/validation/googleTrans.py
Normal file
@ -0,0 +1,10 @@
|
||||
"""Google Translate baseline for translation evaluation.

Reads src.txt (one source sentence per line) and appends the English
translation of each line to hyp-gg-py.txt, keeping the files aligned.
"""
from googletrans import Translator

translator = Translator()

with open("src.txt", "r") as f:
    src_lines = f.readlines()

# Open the output ONCE in append mode (append keeps interrupted runs
# resumable) instead of re-opening the file for every line.
with open("hyp-gg-py.txt", "a") as out:
    for line in src_lines:
        out.write(translator.translate(line, dest="en").text + "\n")
|
56
translate/validation/preprocess.py
Normal file
56
translate/validation/preprocess.py
Normal file
@ -0,0 +1,56 @@
|
||||
import json
|
||||
import random
|
||||
import argparse
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def read_jsonl(file_path):
    """Lazily yield one decoded JSON object per line of a JSONL file."""
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            yield json.loads(raw_line)
|
||||
|
||||
|
||||
# Randomly sample a fixed number of records.
def sample_lines(data, sample_size):
    """Return up to *sample_size* randomly chosen items from *data*.

    *data* may be any iterable (it is materialized first). Requesting more
    items than exist returns the whole population in random order instead of
    letting random.sample raise ValueError — small corpora no longer crash
    against the default sample size.
    """
    pool = list(data)
    return random.sample(pool, min(sample_size, len(pool)))
|
||||
|
||||
|
||||
# Main pipeline.
def main(input_file, sample_size):
    """Sample records from a JSONL corpus into parallel src/ref files.

    Each sampled record is expected to carry "chinese" and "english" keys;
    the Chinese side is appended to src.txt and the English side to ref.txt,
    one sentence per line, so the two files stay line-aligned for scoring.
    """
    # Stream the corpus lazily ...
    data = read_jsonl(input_file)

    # ... then draw the random sample (materializes the stream).
    sampled_data = sample_lines(data, sample_size)

    # Open both outputs ONCE (append mode — repeated runs accumulate, as
    # before) instead of re-opening the pair of files for every item.
    with open("src.txt", "a") as srcf, open("ref.txt", "a") as reff:
        for item in tqdm(sampled_data):
            srcf.write(item["chinese"] + "\n")
            reff.write(item["english"] + "\n")
|
||||
|
||||
|
||||
# Example invocation / CLI entry point.
if __name__ == "__main__":
    # Parse arguments, then hand off to the sampling pipeline.
    arg_parser = argparse.ArgumentParser(
        description="Process a JSONL file by sampling lines and translating text."
    )
    arg_parser.add_argument("input", type=str, help="Path to the input JSONL file")
    arg_parser.add_argument(
        "--sample_size",
        type=int,
        default=100,
        help="Number of lines to sample (default: 100)",
    )
    cli_args = arg_parser.parse_args()

    main(cli_args.input, cli_args.sample_size)
|
Loading…
Reference in New Issue
Block a user