add: translation evaluation
This commit is contained in:
parent
3bb222bda1
commit
6500e378be
10
README.md
10
README.md
@ -14,6 +14,16 @@ This model is optimized to be lightweight, ensuring it can run on a wide range o
|
||||
|
||||
For a detailed explanation of how it works, you can refer to [this blog post](https://blog.alikia2x.com/en/posts/sparkastml-intention/).
|
||||
|
||||
## Translation
|
||||
|
||||
Language barriers are one of the biggest obstacles to communication between civilizations. In modern times, with the development of computer science and artificial intelligence, machine translation is bridging this barrier and building a Tower of Babel.
|
||||
|
||||
Unfortunately, many machine translations are owned by commercial companies, which seriously hinders the development of freedom and innovation.
|
||||
|
||||
Therefore, sparkastML is on the road to challenge commercial machine translation. We decided to tackle the translation between Chinese and English first. These are two languages with a long history and a large number of users. Their writing methods and expression habits are very different, which brings challenges to the project.
|
||||
|
||||
For more details, you can view [this page](./translate/README.md).
|
||||
|
||||
## Dataset
|
||||
|
||||
To support the development of Libre Intelligence, we have made a series of datasets publicly available. You can access them [here](./dataset/public/README.md).
|
||||
|
@ -34,30 +34,34 @@ EXAMPLE JSON OUTPUT:
|
||||
}
|
||||
"""
|
||||
|
||||
def translate_text(text):
|
||||
|
||||
def translate_text(text, client, model_name, temp):
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": text}
|
||||
{"role": "user", "content": text},
|
||||
]
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model=os.getenv("TRANSLATION_MODEL"),
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
response_format={'type': 'json_object'},
|
||||
temperature=float(os.getenv("TRANSLATION_TEMP"))
|
||||
response_format={"type": "json_object"},
|
||||
temperature=temp,
|
||||
)
|
||||
|
||||
return json.loads(response.choices[0].message.content)
|
||||
|
||||
|
||||
def process_file(input_file, output_dir):
|
||||
try:
|
||||
with open(input_file, 'r', encoding='utf-8') as f:
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
text = f.read()
|
||||
|
||||
translation = translate_text(text)
|
||||
model = os.getenv("TRANSLATION_MODEL")
|
||||
temp = float(os.getenv("TRANSLATION_TEMP"))
|
||||
translation = translate_text(text, client, model, temp)
|
||||
|
||||
output_path = os.path.join(output_dir, Path(input_file).stem + ".json")
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(translation, f, ensure_ascii=False, indent=4)
|
||||
|
||||
print(f"Successfully translated and saved to {output_path}")
|
||||
@ -65,16 +69,25 @@ def process_file(input_file, output_dir):
|
||||
except Exception as e:
|
||||
print(f"Error processing {input_file}: {e}")
|
||||
|
||||
|
||||
def batch_process(input_dir, output_dir, num_threads=4):
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
input_files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
|
||||
output_files = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))]
|
||||
input_files = [
|
||||
f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))
|
||||
]
|
||||
output_files = [
|
||||
f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))
|
||||
]
|
||||
|
||||
output_stems = {Path(f).stem for f in output_files}
|
||||
|
||||
files = [os.path.join(input_dir, f) for f in input_files if Path(f).stem not in output_stems]
|
||||
files = [
|
||||
os.path.join(input_dir, f)
|
||||
for f in input_files
|
||||
if Path(f).stem not in output_stems
|
||||
]
|
||||
|
||||
threads = []
|
||||
for file in files:
|
||||
@ -90,7 +103,10 @@ def batch_process(input_dir, output_dir, num_threads=4):
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
input_dir = "./source"
|
||||
output_dir = "./output"
|
||||
batch_process(input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS")))
|
||||
batch_process(
|
||||
input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))
|
||||
)
|
14
translate/README.md
Normal file
14
translate/README.md
Normal file
@ -0,0 +1,14 @@
|
||||
# sparkastML NMT
|
||||
|
||||
## News
|
||||
|
||||
sparkastML's first translation model has been released!
|
||||
|
||||
### Details
|
||||
|
||||
- Training time: 5 hours, 20k steps
|
||||
- Training device: RTX 3080 (20GB)
|
||||
- Corpus size: over 10 million sentences
|
||||
- Validation Score: BLEU
|
||||
|
||||
[Model]
|
43
translate/validation/LLMtrans.py
Normal file
43
translate/validation/LLMtrans.py
Normal file
@ -0,0 +1,43 @@
|
||||
from openai import OpenAI
|
||||
import argparse
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
def translate_text(text, client, model_name, temp):
    """Translate *text* into English through an OpenAI-compatible chat API.

    Args:
        text: Source text to translate.
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        model_name: Model identifier to request.
        temp: Sampling temperature for the completion.

    Returns:
        The model's reply content (the translation) as a plain string.
    """
    system_msg = {
        "role": "system",
        "content": "User will provide some text. You need to translate the text into English and output it WITHOUT ANY ADDITIONAL INFORMATION OR EXPLANATION.",
    }
    user_msg = {"role": "user", "content": text}

    completion = client.chat.completions.create(
        model=model_name,
        messages=[system_msg, user_msg],
        temperature=temp,
    )

    return completion.choices[0].message.content
|
||||
|
||||
# --- Script entry: LLM baseline for translation evaluation. ---
# Reads one source sentence per line from `input`, appends one English
# translation per line to `output`. Credentials and model settings come from
# the environment (.env via python-dotenv): API_KEY, BASE_URL,
# TRANSLATION_MODEL, TRANSLATION_TEMP.
load_dotenv()

parser = argparse.ArgumentParser()
parser.add_argument("input", type=str, help="Path to the input file")
parser.add_argument("output", type=str, help="Path to the output file")
args = parser.parse_args()

input_file = args.input
output_file = args.output
client = OpenAI(
    api_key=os.getenv("API_KEY"),
    base_url=os.getenv("BASE_URL"),
)
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))

with open(input_file, "r") as f:
    src_lines = f.readlines()

# Open the output ONCE in append mode (append keeps interrupted runs
# resumable) instead of re-opening the file for every line; flush after each
# translation so partial progress survives a crash mid-run.
with open(output_file, "a") as out:
    for line in src_lines:
        out.write(translate_text(line, client, model, temp) + "\n")
        out.flush()
|
14
translate/validation/argoTrans.py
Normal file
14
translate/validation/argoTrans.py
Normal file
@ -0,0 +1,14 @@
|
||||
import subprocess
|
||||
|
||||
def translate_text(text):
    """Translate *text* from Chinese to English via the argos-translate CLI.

    Returns:
        The translated text from stdout with surrounding whitespace stripped.
        A run that produces no stdout (e.g. CLI error) yields "".
    """
    # Pass argv as a list with shell=False. The original form interpolated
    # `text` into a shell string with shell=True, so quotes, backticks, `$`,
    # or `;` in the input were interpreted by the shell — a command-injection
    # hole and a source of silently broken translations.
    command = ["argos-translate", "--from", "zh", "--to", "en", text]
    result = subprocess.run(command, shell=False, capture_output=True, text=True)
    return result.stdout.strip()
|
||||
|
||||
# Batch-translate src.txt (one sentence per line) into hyp-ag.txt,
# one hypothesis per line, aligned with the source file.
with open("src.txt", "r") as f:
    src_lines = f.readlines()

# Open the output ONCE in append mode (append keeps interrupted runs
# resumable) instead of re-opening the file for every line.
with open("hyp-ag.txt", "a") as out:
    for line in src_lines:
        out.write(translate_text(line) + "\n")
|
10
translate/validation/googleTrans.py
Normal file
10
translate/validation/googleTrans.py
Normal file
@ -0,0 +1,10 @@
|
||||
"""Google Translate baseline for translation evaluation.

Reads src.txt (one source sentence per line) and appends the English
translation of each line to hyp-gg-py.txt, keeping the files aligned.
"""
from googletrans import Translator

translator = Translator()

with open("src.txt", "r") as f:
    src_lines = f.readlines()

# Open the output ONCE in append mode (append keeps interrupted runs
# resumable) instead of re-opening the file for every line.
with open("hyp-gg-py.txt", "a") as out:
    for line in src_lines:
        out.write(translator.translate(line, dest="en").text + "\n")
|
56
translate/validation/preprocess.py
Normal file
56
translate/validation/preprocess.py
Normal file
@ -0,0 +1,56 @@
|
||||
import json
|
||||
import random
|
||||
import argparse
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def read_jsonl(file_path):
    """Lazily yield one decoded JSON object per line of a JSONL file."""
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            yield json.loads(raw_line)
|
||||
|
||||
|
||||
# Randomly sample a fixed number of records.
def sample_lines(data, sample_size):
    """Return up to *sample_size* randomly chosen items from *data*.

    *data* may be any iterable (it is materialized first). Requesting more
    items than exist returns the whole population in random order instead of
    letting random.sample raise ValueError — small corpora no longer crash
    against the default sample size.
    """
    pool = list(data)
    return random.sample(pool, min(sample_size, len(pool)))
|
||||
|
||||
|
||||
# Main pipeline.
def main(input_file, sample_size):
    """Sample records from a JSONL corpus into parallel src/ref files.

    Each sampled record is expected to carry "chinese" and "english" keys;
    the Chinese side is appended to src.txt and the English side to ref.txt,
    one sentence per line, so the two files stay line-aligned for scoring.
    """
    # Stream the corpus lazily ...
    data = read_jsonl(input_file)

    # ... then draw the random sample (materializes the stream).
    sampled_data = sample_lines(data, sample_size)

    # Open both outputs ONCE (append mode — repeated runs accumulate, as
    # before) instead of re-opening the pair of files for every item.
    with open("src.txt", "a") as srcf, open("ref.txt", "a") as reff:
        for item in tqdm(sampled_data):
            srcf.write(item["chinese"] + "\n")
            reff.write(item["english"] + "\n")
|
||||
|
||||
|
||||
# Example invocation / CLI entry point.
if __name__ == "__main__":
    # Parse arguments, then hand off to the sampling pipeline.
    arg_parser = argparse.ArgumentParser(
        description="Process a JSONL file by sampling lines and translating text."
    )
    arg_parser.add_argument("input", type=str, help="Path to the input JSONL file")
    arg_parser.add_argument(
        "--sample_size",
        type=int,
        default=100,
        help="Number of lines to sample (default: 100)",
    )
    cli_args = arg_parser.parse_args()

    main(cli_args.input, cli_args.sample_size)
|
Loading…
Reference in New Issue
Block a user