add: translation

alikia2x (寒寒) 2024-09-07 15:53:21 +08:00
parent 86394c7f87
commit 12b9b910f4
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
7 changed files with 1061 additions and 1 deletion

.gitignore

@@ -4,4 +4,6 @@ runs
*.pt
*.bin
token_to_id.json
.ipynb_checkpoints
translate/**/data
__pycache__

@@ -0,0 +1,41 @@
import json, random
from torch.utils.data import Dataset

max_dataset_size = 220000

class TRANS19(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        with open(data_file, "rt", encoding="utf-8") as f:
            total_lines = sum(1 for _ in f)
        # Generate a list of unique random line numbers
        random_line_numbers = random.sample(
            range(total_lines), min(max_dataset_size, total_lines)
        )
        random_line_numbers.sort()  # Sort so the file can be read sequentially
        Data = []
        current_line_number = 0
        with open(data_file, "rt", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if current_line_number >= len(random_line_numbers):
                    break
                if idx == random_line_numbers[current_line_number]:
                    try:
                        sample = json.loads(line.strip())
                        Data.append(sample)
                    except json.JSONDecodeError:
                        print(f"Error decoding line {idx}")
                    current_line_number += 1
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
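
A minimal usage sketch with the class above in scope; the data path is the translation2019zh file referenced elsewhere in this commit, and the snippet itself is illustrative rather than part of the diff:

dataset = TRANS19("./data/translation2019zh/translation2019zh_train.json")
print(len(dataset))   # at most max_dataset_size (220,000) samples
print(dataset[0])     # one JSON record, e.g. {"chinese": "...", "english": "..."}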

@@ -0,0 +1,42 @@
import random
from torch.utils.data import Dataset

max_dataset_size = 100000

class Wikititle(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        with open(data_file, "rt", encoding="utf-8") as f:
            total_lines = sum(1 for _ in f)
        # Generate a list of unique random line numbers
        random_line_numbers = random.sample(
            range(total_lines), min(max_dataset_size, total_lines)
        )
        random_line_numbers.sort()  # Sort so the file can be read sequentially
        Data = []
        current_line_number = 0
        with open(data_file, "rt", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if current_line_number >= len(random_line_numbers):
                    break
                if idx == random_line_numbers[current_line_number]:
                    # Strip the trailing newline so the "english" field stays clean
                    zh, en = line.rstrip("\n").split("\t")
                    sample = {
                        "chinese": zh,
                        "english": en
                    }
                    Data.append(sample)
                    current_line_number += 1
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
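
The loader above reads the TSV twice: once to count lines, once to collect the sampled ones. An alternative sketch, not used in this commit, is a single-pass reservoir sample (Algorithm R), which yields a uniform random subset in one pass:

import random

def reservoir_sample_lines(path, k):
    # Maintain a uniform random sample of k lines while streaming the file once
    sample = []
    with open(path, "rt", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx < k:
                sample.append(line)
            else:
                j = random.randint(0, idx)  # each line survives with probability k/(idx+1)
                if j < k:
                    sample[j] = line
    return sample

The two-pass version keeps samples in file order, which a reservoir does not guarantee, so the trade-off is pass count versus ordering.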

@@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"id": "07b697c8-5cc2-4021-9ab8-e7e3c90065ee",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses.\n",
" warnings.warn(\"Recommended: pip install sacremoses.\")\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n",
"/var/folders/25/gdz0c30x3mg1dj9qkwz0ch4w0000gq/T/ipykernel_69064/1647496252.py:14: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))\n"
]
}
],
"source": [
"import time\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# 定义参数\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-zh-en\"\n",
"checkpoint_path = \"./saves/step_74500_valid_bleu_30.28_model_weights.bin\" # 假设使用训练中的checkpoint\n",
"\n",
"# 加载tokenizer和模型\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
"\n",
"# 加载checkpoint\n",
"model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))\n",
"model.eval()\n",
"\n",
"# 将模型转移到设备\n",
"device = \"cuda\" if torch.cuda.is_available() else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
"model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "ccfb5004-2bdd-4d64-88a3-2af96b87092c",
"metadata": {},
"outputs": [],
"source": [
"def infer_translation(input_text, model, tokenizer, max_length=128, num_beams=1, length_penalty=1.2):\n",
" # 记录推理开始时间\n",
" start_time = time.time()\n",
"\n",
" # 预处理输入文本\n",
" inputs = tokenizer(\n",
" input_text,\n",
" return_tensors=\"pt\",\n",
" padding=\"max_length\",\n",
" max_length=max_length,\n",
" ).to(device)\n",
"\n",
" # 模型生成翻译\n",
" with torch.no_grad():\n",
" output_tokens = model.generate(\n",
" inputs[\"input_ids\"],\n",
" max_length=max_length,\n",
" num_beams=num_beams,\n",
" length_penalty=length_penalty,\n",
" early_stopping=True,\n",
" no_repeat_ngram_size=2,\n",
" temperature = 0.3,\n",
" top_p = 0.85,\n",
" do_sample = False\n",
" )\n",
"\n",
" # 解码生成的tokens为文本\n",
" translation = tokenizer.decode(output_tokens[0], skip_special_tokens=True)\n",
"\n",
" # 记录推理结束时间\n",
" end_time = time.time()\n",
" inference_time = end_time - start_time\n",
"\n",
" return translation, inference_time\n",
"\n",
"def translate(input_text, model, tokenizer):\n",
" lines = input_text.splitlines()\n",
" \n",
" # 存储每一行的翻译结果\n",
" translations = []\n",
" total_time = 0 \n",
" \n",
" # 对每一行进行翻译\n",
" for line in lines:\n",
" if line.strip() == \"\":\n",
" translations.append(\"\")\n",
" continue\n",
" #对于长行按句翻译\n",
" if len(line) > 64 and '。' in line:\n",
" sentences = line.split('。')\n",
" translated_sentences=[]\n",
" for sentence in sentences:\n",
" if sentence.strip() == \"\":\n",
" continue\n",
" translation, time_cost = infer_translation(sentence, model, tokenizer)\n",
" translated_sentences.append(translation)\n",
" total_time += time_cost\n",
" #print(sentence,translation)\n",
" translations.append(\" \".join(translated_sentences))\n",
" else:\n",
" translation, time_cost = infer_translation(line, model, tokenizer)\n",
" #print(line,translation)\n",
" translations.append(translation)\n",
" total_time += time_cost\n",
" \n",
" final_translation = \"\\n\".join(translations)\n",
" \n",
" return final_translation, total_time\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d5d35c96-3c4a-487c-ac26-d3d97f1208a6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:567: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.3` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
" warnings.warn(\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:572: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.85` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
" warnings.warn(\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:615: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n",
" warnings.warn(\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:634: UserWarning: `num_beams` is set to 1. However, `length_penalty` is set to `1.2` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `length_penalty`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original Text: \n",
"自2000年左右台湾的珍珠奶茶传入中国大陆市场规模逐步扩大。当地不断推出新口味的奶茶、水果茶和奶盖茶等创新饮品并提供多样化的配料选择统称为新式茶饮。2018年起奶茶品牌开始采用网红营销策略使得部分城市门店顾客络绎不绝。尽管消费者有多达两千种的搭配选择但销量最高的依旧是珍珠、红豆和布丁这三种经典配料。\n",
"面对激烈的市场竞争,茶饮品牌开始区分不同的档次,从使用红茶粉和奶精的低成本产品,到采用新鲜牛奶和现场煮制的高级奶茶,甚至高端茶叶如大红袍、龙井茶也成为一些品牌的选用。\n",
"\n",
"\n",
"Translated Text: \n",
"Since about 2000, the Pearl Milk Tea of Taiwan has been spreading into the mainland, and the market has gradually expanded The new tea, fruit tea and milk tea are introduced in the local market, and the variety of ingredients is offered, collectively known as new-style tea. Since 2018, the milk tea brand has adopted a mesh marketing strategy, which has made some city stores more and more customers. Despite the fact that consumers have as many as 2, 000 combinations, the highest sales are still the three classic ingredients: pearls and red beans and pudding.\n",
"In the face of fierce market competition, tea and tea brands began to differentiate between low-cost products using red tea powder and cream, high-grade milk tea made from fresh milk and live, even high tea such as big red robes and dragon well tea have become a few brand selections.\n",
"\n",
"Inference Time: 3.8918 seconds\n"
]
}
],
"source": [
"# 用户输入\n",
"input_text = '''自2000年左右台湾的珍珠奶茶传入中国大陆市场规模逐步扩大。当地不断推出新口味的奶茶、水果茶和奶盖茶等创新饮品并提供多样化的配料选择统称为新式茶饮。2018年起奶茶品牌开始采用网红营销策略使得部分城市门店顾客络绎不绝。尽管消费者有多达两千种的搭配选择但销量最高的依旧是珍珠、红豆和布丁这三种经典配料。\n",
"面对激烈的市场竞争,茶饮品牌开始区分不同的档次,从使用红茶粉和奶精的低成本产品,到采用新鲜牛奶和现场煮制的高级奶茶,甚至高端茶叶如大红袍、龙井茶也成为一些品牌的选用。'''\n",
"\n",
"# 进行推理并测量时间\n",
"translated_text, time_taken = translate(input_text, model, tokenizer)\n",
"\n",
"# 输出结果\n",
"print(f\"Original Text: \\n{input_text}\\n\\n\")\n",
"print(f\"Translated Text: \\n{translated_text}\\n\")\n",
"print(f\"Inference Time: {time_taken:.4f} seconds\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
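
The UserWarnings in the output above are consistent with the generate() call: with num_beams=1 and do_sample=False, decoding is greedy, so temperature, top_p, early_stopping, and length_penalty are all ignored. A warning-free sketch with the same behavior would be:

with torch.no_grad():
    output_tokens = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        no_repeat_ngram_size=2,  # the only non-default flag greedy decoding actually uses here
    )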

translate/zh-en/train.ipynb

@@ -0,0 +1,338 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "e95d90ec-1f93-45d9-ab8a-ee3d0bae293d",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import os\n",
"import numpy as np\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader, random_split\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"from transformers import AdamW, get_scheduler\n",
"from sacrebleu.metrics import BLEU\n",
"from tqdm.auto import tqdm\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9b8e703a-a5b5-43bf-9b12-2220d869145a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using cpu device\n"
]
}
],
"source": [
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"print(f'Using {device} device')\n",
"\n",
"max_dataset_size = 22000\n",
"train_set_size = 20000\n",
"valid_set_size = 2000\n",
"\n",
"max_input_length = 128\n",
"max_target_length = 128\n",
"\n",
"batch_size = 16\n",
"learning_rate = 1e-5\n",
"epoch_num = 3"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3db1484a-e923-44b9-a2e6-52178a8c09ee",
"metadata": {},
"outputs": [],
"source": [
"class TRANS(Dataset):\n",
" def __init__(self, data_file):\n",
" self.data = self.load_data(data_file)\n",
" \n",
" def load_data(self, data_file):\n",
" Data = {}\n",
" with open(data_file, 'rt', encoding='utf-8') as f:\n",
" for idx, line in enumerate(f):\n",
" if idx >= max_dataset_size:\n",
" break\n",
" sample = json.loads(line.strip())\n",
" Data[idx] = sample\n",
" return Data\n",
" \n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.data[idx]\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0258cad4-f498-4952-ac29-e103ae8e9041",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses.\n",
" warnings.warn(\"Recommended: pip install sacremoses.\")\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
}
],
"source": [
"data = TRANS('./data/translation2019zh/translation2019zh_train.json')\n",
"train_data, valid_data = random_split(data, [train_set_size, valid_set_size])\n",
"test_data = TRANS('./data/translation2019zh/translation2019zh_valid.json')\n",
"\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-zh-en\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
"model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "437fb69c-59f6-48f0-9c74-330cf4862b22",
"metadata": {},
"outputs": [],
"source": [
"def collote_fn(batch_samples):\n",
" batch_inputs, batch_targets = [], []\n",
" for sample in batch_samples:\n",
" batch_inputs.append(sample['chinese'])\n",
" batch_targets.append(sample['english'])\n",
" batch_data = tokenizer(\n",
" batch_inputs, \n",
" padding=True, \n",
" max_length=max_input_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )\n",
" with tokenizer.as_target_tokenizer():\n",
" labels = tokenizer(\n",
" batch_targets, \n",
" padding=True, \n",
" max_length=max_target_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )[\"input_ids\"]\n",
" batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)\n",
" end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]\n",
" for idx, end_idx in enumerate(end_token_index):\n",
" labels[idx][end_idx+1:] = -100\n",
" batch_data['labels'] = labels\n",
" return batch_data\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b9f261d8-02ca-47fc-92d7-6d495ae9c6a1",
"metadata": {},
"outputs": [],
"source": [
"train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collote_fn)\n",
"valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn)\n",
"test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6fcfa14a-a81b-4a3f-a459-cc0c06f4fa70",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n"
]
}
],
"source": [
"def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):\n",
" progress_bar = tqdm(range(len(dataloader)))\n",
" progress_bar.set_description(f'loss: {0:>7f}')\n",
" finish_batch_num = (epoch-1) * len(dataloader)\n",
" \n",
" model.train()\n",
" for batch, batch_data in enumerate(dataloader, start=1):\n",
" batch_data = batch_data.to(device)\n",
" outputs = model(**batch_data)\n",
" loss = outputs.loss\n",
"\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
" lr_scheduler.step()\n",
"\n",
" total_loss += loss.item()\n",
" progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')\n",
" progress_bar.update(1)\n",
" return total_loss\n",
"\n",
"bleu = BLEU()\n",
"\n",
"def test_loop(dataloader, model):\n",
" preds, labels = [], []\n",
" \n",
" model.eval()\n",
" for batch_data in tqdm(dataloader):\n",
" batch_data = batch_data.to(device)\n",
" with torch.no_grad():\n",
" generated_tokens = model.generate(\n",
" batch_data[\"input_ids\"],\n",
" attention_mask=batch_data[\"attention_mask\"],\n",
" max_length=max_target_length,\n",
" ).cpu().numpy()\n",
" label_tokens = batch_data[\"labels\"].cpu().numpy()\n",
" \n",
" decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
" label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)\n",
" decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)\n",
"\n",
" preds += [pred.strip() for pred in decoded_preds]\n",
" labels += [[label.strip()] for label in decoded_labels]\n",
" bleu_score = bleu.corpus_score(preds, labels).score\n",
" print(f\"BLEU: {bleu_score:>0.2f}\\n\")\n",
" return bleu_score\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=learning_rate)\n",
"lr_scheduler = get_scheduler(\n",
" \"linear\",\n",
" optimizer=optimizer,\n",
" num_warmup_steps=0,\n",
" num_training_steps=epoch_num*len(train_dataloader),\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "12068522-df42-484f-97f1-13ce588bf47b",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "896ba74b-1a6a-402c-b94a-e9cf47bb0d65",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/3\n",
"-------------------------------\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0453b70899854c0191a93b53748ddaa0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12500 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:4126: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
" warnings.warn(\n"
]
},
{
"ename": "RuntimeError",
"evalue": "MPS backend out of memory (MPS allocated: 9.37 GB, other allocations: 8.66 GB, max allowed: 18.13 GB). Tried to allocate 222.17 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[12], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(epoch_num):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEpoch \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mt\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mepoch_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m-------------------------------\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 6\u001b[0m total_loss \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptimizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlr_scheduler\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_loss\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m valid_bleu \u001b[38;5;241m=\u001b[39m test_loop(valid_dataloader, model)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m valid_bleu \u001b[38;5;241m>\u001b[39m best_bleu:\n",
"Cell \u001b[0;32mIn[10], line 13\u001b[0m, in \u001b[0;36mtrain_loop\u001b[0;34m(dataloader, model, optimizer, lr_scheduler, epoch, total_loss)\u001b[0m\n\u001b[1;32m 10\u001b[0m loss \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mloss\n\u001b[1;32m 12\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[0;32m---> 13\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 15\u001b[0m lr_scheduler\u001b[38;5;241m.\u001b[39mstep()\n",
"File \u001b[0;32m/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/torch/autograd/__init__.py:347\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 342\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 344\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 346\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 347\u001b[0m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 349\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 350\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 353\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 354\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 355\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/torch/autograd/graph.py:818\u001b[0m, in \u001b[0;36m_engine_run_backward\u001b[0;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 816\u001b[0m unregister_hooks \u001b[38;5;241m=\u001b[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[1;32m 817\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 819\u001b[0m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 820\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 822\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
"\u001b[0;31mRuntimeError\u001b[0m: MPS backend out of memory (MPS allocated: 9.37 GB, other allocations: 8.66 GB, max allowed: 18.13 GB). Tried to allocate 222.17 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure)."
]
}
],
"source": [
"epoch_num = 3\n",
"total_loss = 0.\n",
"best_bleu = 0.\n",
"for t in range(epoch_num):\n",
" print(f\"Epoch {t+1}/{epoch_num}\\n-------------------------------\")\n",
" total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)\n",
" valid_bleu = test_loop(valid_dataloader, model)\n",
" if valid_bleu > best_bleu:\n",
" best_bleu = valid_bleu\n",
" print('saving new weights...\\n')\n",
" torch.save(\n",
" model.state_dict(), \n",
" f'epoch_{t+1}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin'\n",
" )\n",
"print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fd3439a-058a-4220-9b65-b355b52f74b5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
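
The run above aborts with an MPS out-of-memory error at batch_size 16, and the notebook responds by disabling the allocator's high-watermark limit. A gentler mitigation sketch, assuming the same model and dataloaders, is to shrink the per-step batch and accumulate gradients so the effective batch size stays at 16 (accumulation_steps is an illustrative name, not from this commit):

accumulation_steps = 2  # with batch_size = 8 in the DataLoader, effective batch = 16

model.train()
optimizer.zero_grad()
for batch, batch_data in enumerate(train_dataloader, start=1):
    batch_data = batch_data.to(device)
    loss = model(**batch_data).loss / accumulation_steps  # scale so gradients average correctly
    loss.backward()
    if batch % accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()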

translate/zh-en/train.py

@@ -0,0 +1,200 @@
import numpy as np
import torch
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from sacrebleu.metrics import BLEU
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from dataloader.wikititle import Wikititle

writer = SummaryWriter()

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

train_set_size = 95000
valid_set_size = 5000
test_data_size = 0

last_1k_loss = []
kmean_loss = 0.0
total_loss = 0.0
best_bleu = 0.0
step = 0

max_input_length = 128
max_target_length = 128

batch_size = 8
learning_rate = 1e-5
epoch_num = 1

# Checkpoint path; defaults to None
# checkpoint_path = None
checkpoint_path = "./saves/checkpoint_74000.bin"  # set this path to resume training from a checkpoint

data = Wikititle("./data/wikititles-v3.zh-en.tsv")
train_data, valid_data, test_data = random_split(data, [train_set_size, valid_set_size, test_data_size])
# data = TRANS("./data/translation2019zh/translation2019zh_train.json")
# train_data, valid_data = random_split(data, [train_set_size, valid_set_size])
# test_data = TRANS("./data/translation2019zh/translation2019zh_valid.json")

model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

# If a checkpoint path is given, load the model state from it
if checkpoint_path is not None:
    print(f"Loading checkpoint from {checkpoint_path}")
    checkpoint_data = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint_data["model_state_dict"])
    total_loss = checkpoint_data.get("total_loss", 0.0)
    step = checkpoint_data.get("step", 0)
    kmean_loss = total_loss / step
    last_1k_loss = [kmean_loss] * 1000

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample["chinese"])
        batch_targets.append(sample["english"])
    batch_data = tokenizer(
        batch_inputs,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch_targets,
            padding=True,
            max_length=max_target_length,
            truncation=True,
            return_tensors="pt",
        )["input_ids"]
    batch_data["decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
        labels
    )
    end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
    for idx, end_idx in enumerate(end_token_index):
        labels[idx][end_idx + 1 :] = -100
    batch_data["labels"] = labels
    batch_data = {k: v.to(device) for k, v in batch_data.items()}
    return batch_data

train_dataloader = DataLoader(
    train_data, batch_size=batch_size, shuffle=True, collate_fn=collote_fn
)
valid_dataloader = DataLoader(
    valid_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn
)

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss, step):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f"loss: {0:>7f}")

    model.train()
    for batch, batch_data in enumerate(dataloader, start=1):
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        # Keep a sliding window of the last 1k batch losses; guard against an
        # empty window when training starts without a checkpoint
        if len(last_1k_loss) >= 1000:
            del last_1k_loss[0]
        last_1k_loss.append(loss.item())
        kmean_loss = sum(last_1k_loss) / len(last_1k_loss)
        progress_bar.set_description(f"loss: {kmean_loss:>7f}")
        progress_bar.update(1)
        step += 1
        writer.add_scalar("Loss", kmean_loss, step)
        writer.add_scalar("Overall Loss", total_loss / step, step)
        if step % 250 == 0:
            checkpoint = {
                "model_state_dict": model.state_dict(),
                "total_loss": total_loss,
                "kmean_loss": kmean_loss,
                "step": step,
            }
            torch.save(checkpoint, f"checkpoint_{step}.bin")
    return total_loss, step

bleu = BLEU()

def test_loop(dataloader, model):
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(dataloader):
        with torch.no_grad():
            generated_tokens = (
                model.generate(
                    batch_data["input_ids"],
                    attention_mask=batch_data["attention_mask"],
                    max_length=max_target_length,
                    no_repeat_ngram_size=3,
                )
                .cpu()
                .numpy()
            )
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )
        label_tokens = np.where(
            label_tokens != -100, label_tokens, tokenizer.pad_token_id
        )
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    bleu_score = bleu.corpus_score(preds, labels).score
    print(f"BLEU: {bleu_score:>0.2f}\n")
    return bleu_score

optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * epoch_num * len(train_dataloader)),
    num_training_steps=epoch_num * len(train_dataloader),
)

for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n{'-'*20}")
    total_loss, step = train_loop(
        train_dataloader, model, optimizer, lr_scheduler, t + 1, total_loss, step
    )
    valid_bleu = test_loop(valid_dataloader, model)
    print("saving new weights...\n")
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "total_loss": total_loss,
        "kmean_loss": kmean_loss,
        "step": step,
    }
    torch.save(checkpoint, f"step_{step}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin")
print("Done!")

@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "2d0860b5-8e4e-4596-a81f-be259e188775",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"import json\n",
"import numpy as np\n",
"from sacrebleu.metrics import BLEU\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7c22baec-c987-46cd-af6f-5eb594a2b1e4",
"metadata": {},
"outputs": [],
"source": [
"# 定义参数\n",
"checkpoint_path = \"./step_137000_valid_bleu_25.55_model_weights.bin\" # 假设你要加载第2个epoch中的500步的checkpoint\n",
"data_file = \"./data/translation2019zh/translation2019zh_valid.json\" # 假设使用验证集来测试\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-zh-en\"\n",
"max_dataset_size = 100\n",
"max_input_length = 128\n",
"max_target_length = 128\n",
"batch_size = 8"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fff9be9b-57d8-4203-b644-155c76baa1ff",
"metadata": {},
"outputs": [],
"source": [
"class TRANS(Dataset):\n",
" def __init__(self, data_file):\n",
" self.data = self.load_data(data_file)\n",
" \n",
" def load_data(self, data_file):\n",
" Data = {}\n",
" with open(data_file, 'rt', encoding='utf-8') as f:\n",
" for idx, line in enumerate(f):\n",
" if idx >= max_dataset_size:\n",
" break\n",
" sample = json.loads(line.strip())\n",
" Data[idx] = sample\n",
" return Data\n",
" \n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.data[idx]\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3be38090-c1e5-4fb5-b90d-7f18fe8dc23f",
"metadata": {},
"outputs": [],
"source": [
"def collote_fn(batch_samples):\n",
" batch_inputs, batch_targets = [], []\n",
" for sample in batch_samples:\n",
" batch_inputs.append(sample['chinese'])\n",
" batch_targets.append(sample['english'])\n",
" batch_data = tokenizer(\n",
" batch_inputs, \n",
" padding=True, \n",
" max_length=max_input_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )\n",
" with tokenizer.as_target_tokenizer():\n",
" labels = tokenizer(\n",
" batch_targets, \n",
" padding=True, \n",
" max_length=max_target_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )[\"input_ids\"]\n",
" batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)\n",
" end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]\n",
" for idx, end_idx in enumerate(end_token_index):\n",
" labels[idx][end_idx+1:] = -100\n",
" batch_data['labels'] = labels\n",
" return batch_data\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3d5a697a-9b44-4ff3-96df-24a2cb608773",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/25/gdz0c30x3mg1dj9qkwz0ch4w0000gq/T/ipykernel_13528/1590730426.py:6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" model.load_state_dict(torch.load(checkpoint_path, map_location=\"cpu\"))\n"
]
}
],
"source": [
"# 加载模型和tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
"\n",
"# 加载checkpoint\n",
"model.load_state_dict(torch.load(checkpoint_path, map_location=\"cpu\"))\n",
"model.eval()\n",
"\n",
"# 将模型转移到设备\n",
"device = \"cuda\" if torch.cuda.is_available() else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
"model = model.to(device)\n",
"\n",
"# 加载测试数据\n",
"test_data = TRANS(data_file)\n",
"test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, collate_fn=collote_fn)\n",
"\n",
"# 定义BLEU评估函数\n",
"bleu = BLEU()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6446e3f4-b6e2-4f8a-abc4-6fee7224d517",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"def test_model(dataloader, model):\n",
" preds, labels = [], []\n",
"\n",
" model.eval()\n",
" for batch_data in tqdm(dataloader):\n",
" batch_data = batch_data.to(device)\n",
" with torch.no_grad():\n",
" generated_tokens = model.generate(\n",
" batch_data[\"input_ids\"],\n",
" attention_mask=batch_data[\"attention_mask\"],\n",
" max_length=max_target_length,\n",
" ).cpu().numpy()\n",
"\n",
" label_tokens = batch_data[\"labels\"].cpu().numpy()\n",
" \n",
"\n",
" decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
" label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)\n",
" decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)\n",
"\n",
" preds += [pred.strip() for pred in decoded_preds]\n",
" labels += [[label.strip()] for label in decoded_labels]\n",
" \n",
" bleu_score = bleu.corpus_score(preds, labels).score\n",
" print(f\"BLEU: {bleu_score:>0.2f}\")\n",
" return bleu_score"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f196f320-7d8b-44d5-9903-5fdb0532e318",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing model...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:33<00:00, 2.61s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"BLEU: 12.95\n",
"Test BLEU score: 12.95\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"Testing model...\")\n",
"bleu_score = test_model(test_dataloader, model)\n",
"print(f\"Test BLEU score: {bleu_score:0.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96e1d097-93d6-482d-8836-167974de98bc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
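
As a sanity check on the metric itself, sacrebleu's corpus_score can be exercised on a toy pair; in its canonical layout each inner list is one reference stream aligned with the hypotheses (this snippet is illustrative, not part of the commit):

from sacrebleu.metrics import BLEU

bleu = BLEU()
hyps = ["The cat sat on the mat.", "It is raining."]
refs = [["The cat is sitting on the mat.", "It rains."]]  # one stream, one reference per hypothesis
print(bleu.corpus_score(hyps, refs).score)  # corpus-level BLEU in [0, 100]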