sparkastML/translate-old/zh-en/inference.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "07b697c8-5cc2-4021-9ab8-e7e3c90065ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
    "\n",
    "# 定义参数\n",
    "model_checkpoint = \"Helsinki-NLP/opus-mt-zh-en\"\n",
    "checkpoint_path = \"./saves/step_86500_bleu_29.87.bin\"  # 假设使用训练中的checkpoint\n",
    "\n",
    "# 加载tokenizer和模型\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
    "\n",
    "# 加载checkpoint\n",
    "#model.load_state_dict(torch.load(checkpoint_path, map_location='cpu')[\"model_state_dict\"])\n",
    "model.eval()\n",
    "\n",
    "# 将模型转移到设备\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "model = model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ccfb5004-2bdd-4d64-88a3-2af96b87092c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def infer_translation_batch(input_texts, model, tokenizer, max_length=512, num_beams=1, length_penalty=1):\n",
    "    # 记录推理开始时间\n",
    "    start_time = time.time()\n",
    "\n",
    "    # 预处理输入文本（批量处理）\n",
    "    inputs = tokenizer(\n",
    "        input_texts,\n",
    "        return_tensors=\"pt\",\n",
    "        padding=True,  # 使用动态填充，对齐批量输入的长度\n",
    "        truncation=True,\n",
    "        max_length=max_length,\n",
    "    ).to(device)\n",
    "\n",
    "    # 模型生成翻译\n",
    "    with torch.no_grad():\n",
    "        output_tokens = model.generate(\n",
    "            inputs[\"input_ids\"],\n",
    "            num_beams=num_beams,\n",
    "            length_penalty=length_penalty,\n",
    "            early_stopping=False,\n",
    "            #temperature=0.5,\n",
    "            #top_p=0.90,\n",
    "            do_sample=False\n",
    "        )\n",
    "\n",
    "    # 解码生成的tokens为文本（批量处理）\n",
    "    translations = [\n",
    "        tokenizer.decode(output, skip_special_tokens=True) for output in output_tokens\n",
    "    ]\n",
    "\n",
    "    # 记录推理结束时间\n",
    "    end_time = time.time()\n",
    "    inference_time = end_time - start_time\n",
    "\n",
    "    return translations, inference_time\n",
    "\n",
    "def translate(input_text, model, tokenizer, batch_size=16):\n",
    "    lines = input_text.splitlines()\n",
    "    \n",
    "    # 存储每一行的翻译结果\n",
    "    translations = []\n",
    "    total_time = 0\n",
    "    \n",
    "    # 分批处理\n",
    "    for i in range(0, len(lines), batch_size):\n",
    "        batch_lines = [line for line in lines[i:i + batch_size] if line.strip()]\n",
    "        if not batch_lines:\n",
    "            translations.extend([\"\"] * len(batch_lines))\n",
    "            continue\n",
    "        batch_translations, time_cost = infer_translation_batch(batch_lines, model, tokenizer)\n",
    "        translations.extend(batch_translations)\n",
    "        total_time += time_cost\n",
    "    \n",
    "    final_translation = \"\\n\".join(translations)\n",
    "    \n",
    "    return final_translation, total_time\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "d5d35c96-3c4a-487c-ac26-d3d97f1208a6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original Text: \n",
      "\n",
      "为了降低Transformer翻译模型（如基于Helsinki-NLP的Opus模型）的推理时间并提高性能，以下是一些常见且有效的优化方法：\n",
      "\n",
      "1. 模型量化\n",
      "简介：量化是通过使用低精度数值表示模型权重（例如将32位浮点数转换为8位整数）来减少模型的计算量和内存占用，从而加快推理速度。\n",
      "方法：\n",
      "Post-training quantization (PTQ)：模型训练后对权重进行量化。\n",
      "Quantization-aware training (QAT)：在训练时引入量化，通常效果比PTQ更好。\n",
      "2. 模型剪枝\n",
      "简介：剪枝通过移除模型中对推理结果影响较小的权重和节点来减小模型规模，从而加速推理。\n",
      "方法：\n",
      "结构化剪枝：移除整个层、注意力头或神经元。\n",
      "非结构化剪枝：移除个别的低权重参数。\n",
      "3. 减少模型尺寸\n",
      "简介：通过使用更小的模型架构（例如减少层数、隐藏层维度或注意力头的数量），可以减少计算量和推理时间。\n",
      "方法：使用较小版本的模型，例如opus-mt-small，或手动调整Transformer的超参数。\n",
      "4. 启用混合精度推理\n",
      "简介：混合精度推理允许部分计算使用半精度浮点数（FP16），从而减少内存占用并提高推理速度。\n",
      "工具：\n",
      "NVIDIA的TensorRT和**AMP (Automatic Mixed Precision)**是常用的工具，可以自动处理FP16的计算。\n",
      "5. 使用高效的解码策略\n",
      "简介：解码策略的选择影响推理速度。常用的解码方式如Beam Search虽然精度较高，但速度较慢。\n",
      "方法：\n",
      "降低beam size：减小beam size可以显著加快解码速度，虽然可能会略微牺牲翻译质量。\n",
      "Top-k sampling和Nucleus Sampling (Top-p sampling)：这些方法通过限制词汇选择的范围来加快推理速度。\n",
      "\n",
      "\n",
      "\n",
      "Translated Text: \n",
      "To reduce the time of reasoning and improve performance of the Transformer translation model (e.g., the Opus model based on Helsinki-NLP), the following are common and effective methods of optimization:\n",
      "Model quantification\n",
      "Profile: Quantification reduces model computing and memory occupancy by using low precision values to indicate model weights (e.g., converting 32-digit float points to 8-digit integer values), thereby accelerating reasoning.\n",
      "Methodology:\n",
      "Post-training Quantisation (PTQ): Quantifying weights after model training.\n",
      "Quantification-aware trading (QAT): Quantification is introduced in training, usually with better results than PTQ.\n",
      "Model cutting\n",
      "Profile: Cuts reduce the size of the model by removing weights and nodes in the model that influence the reasoning results less.\n",
      "Methodology:\n",
      "Structured cut-off: removes the whole layer, attention head or neuron.\n",
      "Unstructured cut-off: removes individual low weight parameters.\n",
      "3. Reduction of model size\n",
      "Profile: The calculation and reasoning time can be reduced by using smaller model structures (e.g., reducing the number of layers, hidden layers or the number of attention points).\n",
      "Method: Use smaller versions of models, such as opus-mt-small, or manually adjust Transformer's hyperparameters.\n",
      "4. Enable mixed precision reasoning\n",
      "Introduction: The mixed precision reasoning allows for partial calculation of semi-precision floats (FP16), thereby reducing memory occupancy and increasing the speed of reasoning.\n",
      "Tools:\n",
      "The NVIDIA TensorRT and **AMP (Automatic Mixed Precision)** are commonly used tools that can automatically process FP16 calculations.\n",
      "Use of efficient decoding strategies\n",
      "Profile: The selection of the decoding strategy affects the speed of reasoning. Common decoding methods such as BeamSearch are more precise but slow.\n",
      "Methodology:\n",
      "Lower beam size: Reduction of beam size can significantly accelerate decoding, although it may be at the expense of translation quality.\n",
      "Top-k sampling and Nucleus Sampling (Top-p sampling): These methods accelerate reasoning by limiting the range of vocabulary selections.\n",
      "\n",
      "Inference Time: 2.8956 seconds\n"
     ]
    }
   ],
   "source": [
    "# 用户输入\n",
    "input_text = '''\n",
    "为了降低Transformer翻译模型（如基于Helsinki-NLP的Opus模型）的推理时间并提高性能，以下是一些常见且有效的优化方法：\n",
    "\n",
    "1. 模型量化\n",
    "简介：量化是通过使用低精度数值表示模型权重（例如将32位浮点数转换为8位整数）来减少模型的计算量和内存占用，从而加快推理速度。\n",
    "方法：\n",
    "Post-training quantization (PTQ)：模型训练后对权重进行量化。\n",
    "Quantization-aware training (QAT)：在训练时引入量化，通常效果比PTQ更好。\n",
    "2. 模型剪枝\n",
    "简介：剪枝通过移除模型中对推理结果影响较小的权重和节点来减小模型规模，从而加速推理。\n",
    "方法：\n",
    "结构化剪枝：移除整个层、注意力头或神经元。\n",
    "非结构化剪枝：移除个别的低权重参数。\n",
    "3. 减少模型尺寸\n",
    "简介：通过使用更小的模型架构（例如减少层数、隐藏层维度或注意力头的数量），可以减少计算量和推理时间。\n",
    "方法：使用较小版本的模型，例如opus-mt-small，或手动调整Transformer的超参数。\n",
    "4. 启用混合精度推理\n",
    "简介：混合精度推理允许部分计算使用半精度浮点数（FP16），从而减少内存占用并提高推理速度。\n",
    "工具：\n",
    "NVIDIA的TensorRT和**AMP (Automatic Mixed Precision)**是常用的工具，可以自动处理FP16的计算。\n",
    "5. 使用高效的解码策略\n",
    "简介：解码策略的选择影响推理速度。常用的解码方式如Beam Search虽然精度较高，但速度较慢。\n",
    "方法：\n",
    "降低beam size：减小beam size可以显著加快解码速度，虽然可能会略微牺牲翻译质量。\n",
    "Top-k sampling和Nucleus Sampling (Top-p sampling)：这些方法通过限制词汇选择的范围来加快推理速度。\n",
    "'''\n",
    "\n",
    "# 进行推理并测量时间\n",
    "translated_text, time_taken = translate(input_text, model, tokenizer)\n",
    "\n",
    "# 输出结果\n",
    "print(f\"Original Text: \\n{input_text}\\n\\n\")\n",
    "print(f\"Translated Text: \\n{translated_text}\\n\")\n",
    "print(f\"Inference Time: {time_taken:.4f} seconds\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4a44b25-a8bb-4a82-964a-0811c34c256c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}