diff --git a/README.md b/README.md index 4dff30d..e752ee6 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,16 @@ This model is optimized to be lightweight, ensuring it can run on a wide range o For a detailed explanation of how it works, you can refer to [this blog post](https://blog.alikia2x.com/en/posts/sparkastml-intention/). +## Translation + +Language barriers are one of the biggest obstacles to communication between civilizations. In modern times, with the development of computer science and artificial intelligence, machine translation is breaking down this barrier and building a Tower of Babel. + +Unfortunately, many machine translation systems are owned by commercial companies, which seriously hinders the development of freedom and innovation. + +Therefore, sparkastML is setting out to challenge commercial machine translation. We decided to tackle translation between Chinese and English first. These are two languages with a long history and a large number of users. Their writing systems and habits of expression are very different, which makes the project challenging. + +For more details, you can view [this page](./translate/README.md). + ## Dataset To support the development of Libre Intelligence, we have made a series of datasets publicly available. You can access them [here](./dataset/public/README.md). 
diff --git a/translate/llm-translate.py b/translate/LLMtranslator.py similarity index 77% rename from translate/llm-translate.py rename to translate/LLMtranslator.py index 5f9e9cb..eceff53 100644 --- a/translate/llm-translate.py +++ b/translate/LLMtranslator.py @@ -34,63 +34,79 @@ EXAMPLE JSON OUTPUT: } """ -def translate_text(text): + +def translate_text(text, client, model_name, temp): messages = [ {"role": "system", "content": system_prompt}, - {"role": "user", "content": text} + {"role": "user", "content": text}, ] - + response = client.chat.completions.create( - model=os.getenv("TRANSLATION_MODEL"), + model=model_name, messages=messages, - response_format={'type': 'json_object'}, - temperature=float(os.getenv("TRANSLATION_TEMP")) + response_format={"type": "json_object"}, + temperature=temp, ) - + return json.loads(response.choices[0].message.content) + def process_file(input_file, output_dir): try: - with open(input_file, 'r', encoding='utf-8') as f: + with open(input_file, "r", encoding="utf-8") as f: text = f.read() - - translation = translate_text(text) - + + model = os.getenv("TRANSLATION_MODEL") + temp = float(os.getenv("TRANSLATION_TEMP")) + translation = translate_text(text, client, model, temp) + output_path = os.path.join(output_dir, Path(input_file).stem + ".json") - with open(output_path, 'w', encoding='utf-8') as f: + with open(output_path, "w", encoding="utf-8") as f: json.dump(translation, f, ensure_ascii=False, indent=4) - + print(f"Successfully translated and saved to {output_path}") - + except Exception as e: print(f"Error processing {input_file}: {e}") + def batch_process(input_dir, output_dir, num_threads=4): if not os.path.exists(output_dir): os.makedirs(output_dir) - - input_files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))] - output_files = [f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f))] - + + input_files = [ + f for f in os.listdir(input_dir) if 
os.path.isfile(os.path.join(input_dir, f)) + ] + output_files = [ + f for f in os.listdir(output_dir) if os.path.isfile(os.path.join(output_dir, f)) + ] + output_stems = {Path(f).stem for f in output_files} - - files = [os.path.join(input_dir, f) for f in input_files if Path(f).stem not in output_stems] - + + files = [ + os.path.join(input_dir, f) + for f in input_files + if Path(f).stem not in output_stems + ] + threads = [] for file in files: thread = threading.Thread(target=process_file, args=(file, output_dir)) threads.append(thread) thread.start() - + if len(threads) >= num_threads: for t in threads: t.join() threads = [] - + for t in threads: t.join() + if __name__ == "__main__": input_dir = "./source" output_dir = "./output" - batch_process(input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS"))) \ No newline at end of file + batch_process( + input_dir, output_dir, num_threads=int(os.getenv("TRANSLATE_THREADS")) + ) diff --git a/translate/README.md b/translate/README.md new file mode 100644 index 0000000..baa954e --- /dev/null +++ b/translate/README.md @@ -0,0 +1,14 @@ +# sparkastML NMT + +## News + +sparkastML's first translation model has been released! + +### Details + +- Training time: 5 hours, 20k steps +- Training device: RTX 3080 (20GB) +- Corpus size: over 10 million sentences +- Validation Score: BLEU + +[Model] diff --git a/translate/validation/LLMtrans.py b/translate/validation/LLMtrans.py new file mode 100644 index 0000000..9d06a1a --- /dev/null +++ b/translate/validation/LLMtrans.py @@ -0,0 +1,43 @@ +from openai import OpenAI +import argparse +import os +from dotenv import load_dotenv + +def translate_text(text, client, model_name, temp): + messages = [ + {"role": "system", "content": "User will provide some text. 
You need to translate the text into English and output it WITHOUT ANY ADDITIONAL INFORMATION OR EXPLANATION."}, + {"role": "user", "content": text}, + ] + + response = client.chat.completions.create( + model=model_name, + messages=messages, + temperature=temp, + ) + + return response.choices[0].message.content + +load_dotenv() + +parser = argparse.ArgumentParser() +parser.add_argument("input", type=str, help="Path to the input file") +parser.add_argument("output", type=str, help="Path to the output file") +args = parser.parse_args() + +input_file = args.input +output_file = args.output +client = OpenAI( + api_key=os.getenv("API_KEY"), + base_url=os.getenv("BASE_URL"), +) +model = os.getenv("TRANSLATION_MODEL") +temp = float(os.getenv("TRANSLATION_TEMP")) + +with open(input_file, "r") as f: + src_lines = f.readlines() + + +for line in src_lines: + result = translate_text(line, client, model, temp) + with open(output_file, 'a') as f: + f.write(result + '\n') diff --git a/translate/validation/argoTrans.py b/translate/validation/argoTrans.py new file mode 100644 index 0000000..c70e7e1 --- /dev/null +++ b/translate/validation/argoTrans.py @@ -0,0 +1,14 @@ +import subprocess + +def translate_text(text): + command = f'argos-translate --from zh --to en "{text}"' + result = subprocess.run(command, shell=True, capture_output=True, text=True) + return result.stdout.strip() + +with open("src.txt", "r") as f: + src_lines = f.readlines() + +for line in src_lines: + result = translate_text(line) + with open("hyp-ag.txt", 'a') as f: + f.write(result + '\n') \ No newline at end of file diff --git a/translate/validation/googleTrans.py b/translate/validation/googleTrans.py new file mode 100644 index 0000000..be63a81 --- /dev/null +++ b/translate/validation/googleTrans.py @@ -0,0 +1,10 @@ +from googletrans import Translator +translator = Translator() + +with open("src.txt", "r") as f: + src_lines = f.readlines() + +for line in src_lines: + result = translator.translate(line, 
dest='en') + with open("hyp-gg-py.txt", 'a') as f: + f.write(result.text + '\n') \ No newline at end of file diff --git a/translate/validation/preprocess.py b/translate/validation/preprocess.py new file mode 100644 index 0000000..79aa725 --- /dev/null +++ b/translate/validation/preprocess.py @@ -0,0 +1,56 @@ +import json +import random +import argparse +from tqdm import tqdm + + +# 读取jsonl文件 +def read_jsonl(file_path): + with open(file_path, "r", encoding="utf-8") as file: + for line in file: + yield json.loads(line) + + +# 随机抽取一定数量的行 +def sample_lines(data, sample_size): + return random.sample(list(data), sample_size) + + +# 主函数 +def main(input_file, sample_size): + # 读取jsonl文件 + data = read_jsonl(input_file) + + # 随机抽取一定数量的行 + sampled_data = sample_lines(data, sample_size) + + for item in tqdm(sampled_data): + chinese_text = item["chinese"] + english_text = item["english"] + + with open("src.txt", 'a') as srcf, open("ref.txt", 'a') as reff: + srcf.write(chinese_text + '\n') + reff.write(english_text + '\n') + + +# 示例调用 +if __name__ == "__main__": + # 创建命令行参数解析器 + parser = argparse.ArgumentParser( + description="Process a JSONL file by sampling lines and translating text." + ) + + # 添加命令行参数 + parser.add_argument("input", type=str, help="Path to the input JSONL file") + parser.add_argument( + "--sample_size", + type=int, + default=100, + help="Number of lines to sample (default: 100)", + ) + + # 解析命令行参数 + args = parser.parse_args() + + # 调用主函数 + main(args.input, args.sample_size)