add: translation

alikia2x (寒寒) 2024-09-07 15:53:21 +08:00
parent 86394c7f87
commit 12b9b910f4
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
7 changed files with 1061 additions and 1 deletion

.gitignore

@@ -4,4 +4,6 @@ runs
*.pt
*.bin
token_to_id.json
.ipynb_checkpoints
translate/**/data
__pycache__

@@ -0,0 +1,41 @@
import json, random
from torch.utils.data import Dataset

max_dataset_size = 220000

class TRANS19(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        with open(data_file, "rt", encoding="utf-8") as f:
            total_lines = sum(1 for _ in f)
        # Generate a list of unique random line numbers
        random_line_numbers = random.sample(
            range(total_lines), min(max_dataset_size, total_lines)
        )
        random_line_numbers.sort()  # Sort so the file can be read sequentially
        Data = []
        current_line_number = 0
        with open(data_file, "rt", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if current_line_number >= len(random_line_numbers):
                    break
                if idx == random_line_numbers[current_line_number]:
                    try:
                        sample = json.loads(line.strip())
                        Data.append(sample)
                    except json.JSONDecodeError:
                        print(f"Error decoding line {idx}")
                    current_line_number += 1
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
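
A minimal usage sketch with the class above in scope; the data path is the translation2019zh file referenced elsewhere in this commit, and the snippet itself is illustrative rather than part of the diff:

dataset = TRANS19("./data/translation2019zh/translation2019zh_train.json")
print(len(dataset))   # at most max_dataset_size (220,000) samples
print(dataset[0])     # one JSON record, e.g. {"chinese": "...", "english": "..."}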

@@ -0,0 +1,42 @@
import random
from torch.utils.data import Dataset

max_dataset_size = 100000

class Wikititle(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        with open(data_file, "rt", encoding="utf-8") as f:
            total_lines = sum(1 for _ in f)
        # Generate a list of unique random line numbers
        random_line_numbers = random.sample(
            range(total_lines), min(max_dataset_size, total_lines)
        )
        random_line_numbers.sort()  # Sort so the file can be read sequentially
        Data = []
        current_line_number = 0
        with open(data_file, "rt", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if current_line_number >= len(random_line_numbers):
                    break
                if idx == random_line_numbers[current_line_number]:
                    # Strip the trailing newline so the "english" field stays clean
                    zh, en = line.rstrip("\n").split("\t")
                    sample = {
                        "chinese": zh,
                        "english": en
                    }
                    Data.append(sample)
                    current_line_number += 1
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
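
The loader above reads the TSV twice: once to count lines, once to collect the sampled ones. An alternative sketch, not used in this commit, is a single-pass reservoir sample (Algorithm R), which yields a uniform random subset in one pass:

import random

def reservoir_sample_lines(path, k):
    # Maintain a uniform random sample of k lines while streaming the file once
    sample = []
    with open(path, "rt", encoding="utf-8") as f:
        for idx, line in enumerate(f):
            if idx < k:
                sample.append(line)
            else:
                j = random.randint(0, idx)  # each line survives with probability k/(idx+1)
                if j < k:
                    sample[j] = line
    return sample

The two-pass version keeps samples in file order, which a reservoir does not guarantee, so the trade-off is pass count versus ordering.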

@@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"id": "07b697c8-5cc2-4021-9ab8-e7e3c90065ee",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses.\n",
" warnings.warn(\"Recommended: pip install sacremoses.\")\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n",
"/var/folders/25/gdz0c30x3mg1dj9qkwz0ch4w0000gq/T/ipykernel_69064/1647496252.py:14: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))\n"
]
}
],
"source": [
"import time\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# 定义参数\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-zh-en\"\n",
"checkpoint_path = \"./saves/step_74500_valid_bleu_30.28_model_weights.bin\" # 假设使用训练中的checkpoint\n",
"\n",
"# 加载tokenizer和模型\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
"\n",
"# 加载checkpoint\n",
"model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))\n",
"model.eval()\n",
"\n",
"# 将模型转移到设备\n",
"device = \"cuda\" if torch.cuda.is_available() else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
"model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "ccfb5004-2bdd-4d64-88a3-2af96b87092c",
"metadata": {},
"outputs": [],
"source": [
"def infer_translation(input_text, model, tokenizer, max_length=128, num_beams=1, length_penalty=1.2):\n",
" # 记录推理开始时间\n",
" start_time = time.time()\n",
"\n",
" # 预处理输入文本\n",
" inputs = tokenizer(\n",
" input_text,\n",
" return_tensors=\"pt\",\n",
" padding=\"max_length\",\n",
" max_length=max_length,\n",
" ).to(device)\n",
"\n",
" # 模型生成翻译\n",
" with torch.no_grad():\n",
" output_tokens = model.generate(\n",
" inputs[\"input_ids\"],\n",
" max_length=max_length,\n",
" num_beams=num_beams,\n",
" length_penalty=length_penalty,\n",
" early_stopping=True,\n",
" no_repeat_ngram_size=2,\n",
" temperature = 0.3,\n",
" top_p = 0.85,\n",
" do_sample = False\n",
" )\n",
"\n",
" # 解码生成的tokens为文本\n",
" translation = tokenizer.decode(output_tokens[0], skip_special_tokens=True)\n",
"\n",
" # 记录推理结束时间\n",
" end_time = time.time()\n",
" inference_time = end_time - start_time\n",
"\n",
" return translation, inference_time\n",
"\n",
"def translate(input_text, model, tokenizer):\n",
" lines = input_text.splitlines()\n",
" \n",
" # 存储每一行的翻译结果\n",
" translations = []\n",
" total_time = 0 \n",
" \n",
" # 对每一行进行翻译\n",
" for line in lines:\n",
" if line.strip() == \"\":\n",
" translations.append(\"\")\n",
" continue\n",
" #对于长行按句翻译\n",
" if len(line) > 64 and '。' in line:\n",
" sentences = line.split('。')\n",
" translated_sentences=[]\n",
" for sentence in sentences:\n",
" if sentence.strip() == \"\":\n",
" continue\n",
" translation, time_cost = infer_translation(sentence, model, tokenizer)\n",
" translated_sentences.append(translation)\n",
" total_time += time_cost\n",
" #print(sentence,translation)\n",
" translations.append(\" \".join(translated_sentences))\n",
" else:\n",
" translation, time_cost = infer_translation(line, model, tokenizer)\n",
" #print(line,translation)\n",
" translations.append(translation)\n",
" total_time += time_cost\n",
" \n",
" final_translation = \"\\n\".join(translations)\n",
" \n",
" return final_translation, total_time\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "d5d35c96-3c4a-487c-ac26-d3d97f1208a6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:567: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.3` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n",
" warnings.warn(\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:572: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.85` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n",
" warnings.warn(\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:615: UserWarning: `num_beams` is set to 1. However, `early_stopping` is set to `True` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `early_stopping`.\n",
" warnings.warn(\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:634: UserWarning: `num_beams` is set to 1. However, `length_penalty` is set to `1.2` -- this flag is only used in beam-based generation modes. You should set `num_beams>1` or unset `length_penalty`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original Text: \n",
"自2000年左右台湾的珍珠奶茶传入中国大陆市场规模逐步扩大。当地不断推出新口味的奶茶、水果茶和奶盖茶等创新饮品并提供多样化的配料选择统称为新式茶饮。2018年起奶茶品牌开始采用网红营销策略使得部分城市门店顾客络绎不绝。尽管消费者有多达两千种的搭配选择但销量最高的依旧是珍珠、红豆和布丁这三种经典配料。\n",
"面对激烈的市场竞争,茶饮品牌开始区分不同的档次,从使用红茶粉和奶精的低成本产品,到采用新鲜牛奶和现场煮制的高级奶茶,甚至高端茶叶如大红袍、龙井茶也成为一些品牌的选用。\n",
"\n",
"\n",
"Translated Text: \n",
"Since about 2000, the Pearl Milk Tea of Taiwan has been spreading into the mainland, and the market has gradually expanded The new tea, fruit tea and milk tea are introduced in the local market, and the variety of ingredients is offered, collectively known as new-style tea. Since 2018, the milk tea brand has adopted a mesh marketing strategy, which has made some city stores more and more customers. Despite the fact that consumers have as many as 2, 000 combinations, the highest sales are still the three classic ingredients: pearls and red beans and pudding.\n",
"In the face of fierce market competition, tea and tea brands began to differentiate between low-cost products using red tea powder and cream, high-grade milk tea made from fresh milk and live, even high tea such as big red robes and dragon well tea have become a few brand selections.\n",
"\n",
"Inference Time: 3.8918 seconds\n"
]
}
],
"source": [
"# 用户输入\n",
"input_text = '''自2000年左右台湾的珍珠奶茶传入中国大陆市场规模逐步扩大。当地不断推出新口味的奶茶、水果茶和奶盖茶等创新饮品并提供多样化的配料选择统称为新式茶饮。2018年起奶茶品牌开始采用网红营销策略使得部分城市门店顾客络绎不绝。尽管消费者有多达两千种的搭配选择但销量最高的依旧是珍珠、红豆和布丁这三种经典配料。\n",
"面对激烈的市场竞争,茶饮品牌开始区分不同的档次,从使用红茶粉和奶精的低成本产品,到采用新鲜牛奶和现场煮制的高级奶茶,甚至高端茶叶如大红袍、龙井茶也成为一些品牌的选用。'''\n",
"\n",
"# 进行推理并测量时间\n",
"translated_text, time_taken = translate(input_text, model, tokenizer)\n",
"\n",
"# 输出结果\n",
"print(f\"Original Text: \\n{input_text}\\n\\n\")\n",
"print(f\"Translated Text: \\n{translated_text}\\n\")\n",
"print(f\"Inference Time: {time_taken:.4f} seconds\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
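
The UserWarnings in the output above are consistent with the generate() call: with num_beams=1 and do_sample=False, decoding is greedy, so temperature, top_p, early_stopping, and length_penalty are all ignored. A warning-free sketch with the same behavior would be:

with torch.no_grad():
    output_tokens = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        no_repeat_ngram_size=2,  # the only non-default flag greedy decoding actually uses here
    )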

translate/zh-en/train.ipynb

@@ -0,0 +1,338 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "e95d90ec-1f93-45d9-ab8a-ee3d0bae293d",
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import os\n",
"import numpy as np\n",
"import torch\n",
"from torch.utils.data import Dataset, DataLoader, random_split\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"from transformers import AdamW, get_scheduler\n",
"from sacrebleu.metrics import BLEU\n",
"from tqdm.auto import tqdm\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9b8e703a-a5b5-43bf-9b12-2220d869145a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using cpu device\n"
]
}
],
"source": [
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"print(f'Using {device} device')\n",
"\n",
"max_dataset_size = 22000\n",
"train_set_size = 20000\n",
"valid_set_size = 2000\n",
"\n",
"max_input_length = 128\n",
"max_target_length = 128\n",
"\n",
"batch_size = 16\n",
"learning_rate = 1e-5\n",
"epoch_num = 3"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3db1484a-e923-44b9-a2e6-52178a8c09ee",
"metadata": {},
"outputs": [],
"source": [
"class TRANS(Dataset):\n",
" def __init__(self, data_file):\n",
" self.data = self.load_data(data_file)\n",
" \n",
" def load_data(self, data_file):\n",
" Data = {}\n",
" with open(data_file, 'rt', encoding='utf-8') as f:\n",
" for idx, line in enumerate(f):\n",
" if idx >= max_dataset_size:\n",
" break\n",
" sample = json.loads(line.strip())\n",
" Data[idx] = sample\n",
" return Data\n",
" \n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.data[idx]\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0258cad4-f498-4952-ac29-e103ae8e9041",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py:175: UserWarning: Recommended: pip install sacremoses.\n",
" warnings.warn(\"Recommended: pip install sacremoses.\")\n",
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
" warnings.warn(\n"
]
}
],
"source": [
"data = TRANS('./data/translation2019zh/translation2019zh_train.json')\n",
"train_data, valid_data = random_split(data, [train_set_size, valid_set_size])\n",
"test_data = TRANS('./data/translation2019zh/translation2019zh_valid.json')\n",
"\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-zh-en\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
"model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "437fb69c-59f6-48f0-9c74-330cf4862b22",
"metadata": {},
"outputs": [],
"source": [
"def collote_fn(batch_samples):\n",
" batch_inputs, batch_targets = [], []\n",
" for sample in batch_samples:\n",
" batch_inputs.append(sample['chinese'])\n",
" batch_targets.append(sample['english'])\n",
" batch_data = tokenizer(\n",
" batch_inputs, \n",
" padding=True, \n",
" max_length=max_input_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )\n",
" with tokenizer.as_target_tokenizer():\n",
" labels = tokenizer(\n",
" batch_targets, \n",
" padding=True, \n",
" max_length=max_target_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )[\"input_ids\"]\n",
" batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)\n",
" end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]\n",
" for idx, end_idx in enumerate(end_token_index):\n",
" labels[idx][end_idx+1:] = -100\n",
" batch_data['labels'] = labels\n",
" return batch_data\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b9f261d8-02ca-47fc-92d7-6d495ae9c6a1",
"metadata": {},
"outputs": [],
"source": [
"train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collote_fn)\n",
"valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn)\n",
"test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6fcfa14a-a81b-4a3f-a459-cc0c06f4fa70",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n"
]
}
],
"source": [
"def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):\n",
" progress_bar = tqdm(range(len(dataloader)))\n",
" progress_bar.set_description(f'loss: {0:>7f}')\n",
" finish_batch_num = (epoch-1) * len(dataloader)\n",
" \n",
" model.train()\n",
" for batch, batch_data in enumerate(dataloader, start=1):\n",
" batch_data = batch_data.to(device)\n",
" outputs = model(**batch_data)\n",
" loss = outputs.loss\n",
"\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
" lr_scheduler.step()\n",
"\n",
" total_loss += loss.item()\n",
" progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')\n",
" progress_bar.update(1)\n",
" return total_loss\n",
"\n",
"bleu = BLEU()\n",
"\n",
"def test_loop(dataloader, model):\n",
" preds, labels = [], []\n",
" \n",
" model.eval()\n",
" for batch_data in tqdm(dataloader):\n",
" batch_data = batch_data.to(device)\n",
" with torch.no_grad():\n",
" generated_tokens = model.generate(\n",
" batch_data[\"input_ids\"],\n",
" attention_mask=batch_data[\"attention_mask\"],\n",
" max_length=max_target_length,\n",
" ).cpu().numpy()\n",
" label_tokens = batch_data[\"labels\"].cpu().numpy()\n",
" \n",
" decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
" label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)\n",
" decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)\n",
"\n",
" preds += [pred.strip() for pred in decoded_preds]\n",
" labels += [[label.strip()] for label in decoded_labels]\n",
" bleu_score = bleu.corpus_score(preds, labels).score\n",
" print(f\"BLEU: {bleu_score:>0.2f}\\n\")\n",
" return bleu_score\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=learning_rate)\n",
"lr_scheduler = get_scheduler(\n",
" \"linear\",\n",
" optimizer=optimizer,\n",
" num_warmup_steps=0,\n",
" num_training_steps=epoch_num*len(train_dataloader),\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "12068522-df42-484f-97f1-13ce588bf47b",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "896ba74b-1a6a-402c-b94a-e9cf47bb0d65",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/3\n",
"-------------------------------\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0453b70899854c0191a93b53748ddaa0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/12500 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:4126: UserWarning: `as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your labels by using the argument `text_target` of the regular `__call__` method (either in the same call as your input texts if you use the same keyword arguments, or in a separate call.\n",
" warnings.warn(\n"
]
},
{
"ename": "RuntimeError",
"evalue": "MPS backend out of memory (MPS allocated: 9.37 GB, other allocations: 8.66 GB, max allowed: 18.13 GB). Tried to allocate 222.17 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[12], line 6\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(epoch_num):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEpoch \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mt\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mepoch_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m-------------------------------\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 6\u001b[0m total_loss \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_loop\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moptimizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlr_scheduler\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_loss\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 7\u001b[0m valid_bleu \u001b[38;5;241m=\u001b[39m test_loop(valid_dataloader, model)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m valid_bleu \u001b[38;5;241m>\u001b[39m best_bleu:\n",
"Cell \u001b[0;32mIn[10], line 13\u001b[0m, in \u001b[0;36mtrain_loop\u001b[0;34m(dataloader, model, optimizer, lr_scheduler, epoch, total_loss)\u001b[0m\n\u001b[1;32m 10\u001b[0m loss \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mloss\n\u001b[1;32m 12\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[0;32m---> 13\u001b[0m \u001b[43mloss\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n\u001b[1;32m 15\u001b[0m lr_scheduler\u001b[38;5;241m.\u001b[39mstep()\n",
"File \u001b[0;32m/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/torch/_tensor.py:522\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 512\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 513\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 514\u001b[0m Tensor\u001b[38;5;241m.\u001b[39mbackward,\n\u001b[1;32m 515\u001b[0m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 520\u001b[0m inputs\u001b[38;5;241m=\u001b[39minputs,\n\u001b[1;32m 521\u001b[0m )\n\u001b[0;32m--> 522\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mautograd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/torch/autograd/__init__.py:347\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 342\u001b[0m retain_graph \u001b[38;5;241m=\u001b[39m create_graph\n\u001b[1;32m 344\u001b[0m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 346\u001b[0m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 347\u001b[0m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 349\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 350\u001b[0m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 353\u001b[0m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 354\u001b[0m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 355\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/anaconda3/envs/sparkastML/lib/python3.10/site-packages/torch/autograd/graph.py:818\u001b[0m, in \u001b[0;36m_engine_run_backward\u001b[0;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 816\u001b[0m unregister_hooks \u001b[38;5;241m=\u001b[39m _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[1;32m 817\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 818\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execution_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 819\u001b[0m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 820\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 822\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
"\u001b[0;31mRuntimeError\u001b[0m: MPS backend out of memory (MPS allocated: 9.37 GB, other allocations: 8.66 GB, max allowed: 18.13 GB). Tried to allocate 222.17 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure)."
]
}
],
"source": [
"epoch_num = 3\n",
"total_loss = 0.\n",
"best_bleu = 0.\n",
"for t in range(epoch_num):\n",
" print(f\"Epoch {t+1}/{epoch_num}\\n-------------------------------\")\n",
" total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, t+1, total_loss)\n",
" valid_bleu = test_loop(valid_dataloader, model)\n",
" if valid_bleu > best_bleu:\n",
" best_bleu = valid_bleu\n",
" print('saving new weights...\\n')\n",
" torch.save(\n",
" model.state_dict(), \n",
" f'epoch_{t+1}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin'\n",
" )\n",
"print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6fd3439a-058a-4220-9b65-b355b52f74b5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
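
The run above aborts with an MPS out-of-memory error at batch_size 16, and the notebook responds by disabling the allocator's high-watermark limit. A gentler mitigation sketch, assuming the same model and dataloaders, is to shrink the per-step batch and accumulate gradients so the effective batch size stays at 16 (accumulation_steps is an illustrative name, not from this commit):

accumulation_steps = 2  # with batch_size = 8 in the DataLoader, effective batch = 16

model.train()
optimizer.zero_grad()
for batch, batch_data in enumerate(train_dataloader, start=1):
    batch_data = batch_data.to(device)
    loss = model(**batch_data).loss / accumulation_steps  # scale so gradients average correctly
    loss.backward()
    if batch % accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()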

translate/zh-en/train.py

@@ -0,0 +1,200 @@
import numpy as np
import torch
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AdamW, get_scheduler
from sacrebleu.metrics import BLEU
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from dataloader.wikititle import Wikititle

writer = SummaryWriter()

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

train_set_size = 95000
valid_set_size = 5000
test_data_size = 0

last_1k_loss = []
kmean_loss = 0.0
total_loss = 0.0
best_bleu = 0.0
step = 0

max_input_length = 128
max_target_length = 128

batch_size = 8
learning_rate = 1e-5
epoch_num = 1

# Checkpoint path; defaults to None
# checkpoint_path = None
checkpoint_path = "./saves/checkpoint_74000.bin"  # set this path to resume training from a checkpoint

data = Wikititle("./data/wikititles-v3.zh-en.tsv")
train_data, valid_data, test_data = random_split(data, [train_set_size, valid_set_size, test_data_size])
# data = TRANS("./data/translation2019zh/translation2019zh_train.json")
# train_data, valid_data = random_split(data, [train_set_size, valid_set_size])
# test_data = TRANS("./data/translation2019zh/translation2019zh_valid.json")

model_checkpoint = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
model = model.to(device)

# If a checkpoint path is given, load the model state from it
if checkpoint_path is not None:
    print(f"Loading checkpoint from {checkpoint_path}")
    checkpoint_data = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint_data["model_state_dict"])
    total_loss = checkpoint_data.get("total_loss", 0.0)
    step = checkpoint_data.get("step", 0)
    kmean_loss = total_loss / step
    last_1k_loss = [kmean_loss] * 1000

def collote_fn(batch_samples):
    batch_inputs, batch_targets = [], []
    for sample in batch_samples:
        batch_inputs.append(sample["chinese"])
        batch_targets.append(sample["english"])
    batch_data = tokenizer(
        batch_inputs,
        padding=True,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch_targets,
            padding=True,
            max_length=max_target_length,
            truncation=True,
            return_tensors="pt",
        )["input_ids"]
    batch_data["decoder_input_ids"] = model.prepare_decoder_input_ids_from_labels(
        labels
    )
    end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]
    for idx, end_idx in enumerate(end_token_index):
        labels[idx][end_idx + 1 :] = -100
    batch_data["labels"] = labels
    batch_data = {k: v.to(device) for k, v in batch_data.items()}
    return batch_data

train_dataloader = DataLoader(
    train_data, batch_size=batch_size, shuffle=True, collate_fn=collote_fn
)
valid_dataloader = DataLoader(
    valid_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, shuffle=False, collate_fn=collote_fn
)

def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss, step):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f"loss: {0:>7f}")

    model.train()
    for batch, batch_data in enumerate(dataloader, start=1):
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        # Keep a sliding window of the last 1k batch losses; guard against an
        # empty window when training starts without a checkpoint
        if len(last_1k_loss) >= 1000:
            del last_1k_loss[0]
        last_1k_loss.append(loss.item())
        kmean_loss = sum(last_1k_loss) / len(last_1k_loss)
        progress_bar.set_description(f"loss: {kmean_loss:>7f}")
        progress_bar.update(1)
        step += 1
        writer.add_scalar("Loss", kmean_loss, step)
        writer.add_scalar("Overall Loss", total_loss / step, step)
        if step % 250 == 0:
            checkpoint = {
                "model_state_dict": model.state_dict(),
                "total_loss": total_loss,
                "kmean_loss": kmean_loss,
                "step": step,
            }
            torch.save(checkpoint, f"checkpoint_{step}.bin")
    return total_loss, step

bleu = BLEU()

def test_loop(dataloader, model):
    preds, labels = [], []

    model.eval()
    for batch_data in tqdm(dataloader):
        with torch.no_grad():
            generated_tokens = (
                model.generate(
                    batch_data["input_ids"],
                    attention_mask=batch_data["attention_mask"],
                    max_length=max_target_length,
                    no_repeat_ngram_size=3,
                )
                .cpu()
                .numpy()
            )
        label_tokens = batch_data["labels"].cpu().numpy()

        decoded_preds = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )
        label_tokens = np.where(
            label_tokens != -100, label_tokens, tokenizer.pad_token_id
        )
        decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)

        preds += [pred.strip() for pred in decoded_preds]
        labels += [[label.strip()] for label in decoded_labels]
    bleu_score = bleu.corpus_score(preds, labels).score
    print(f"BLEU: {bleu_score:>0.2f}\n")
    return bleu_score

optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * epoch_num * len(train_dataloader)),
    num_training_steps=epoch_num * len(train_dataloader),
)

for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n{'-'*20}")
    total_loss, step = train_loop(
        train_dataloader, model, optimizer, lr_scheduler, t + 1, total_loss, step
    )
    valid_bleu = test_loop(valid_dataloader, model)
    print("saving new weights...\n")
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "total_loss": total_loss,
        "kmean_loss": kmean_loss,
        "step": step,
    }
    torch.save(checkpoint, f"step_{step}_valid_bleu_{valid_bleu:0.2f}_model_weights.bin")
print("Done!")

@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "2d0860b5-8e4e-4596-a81f-be259e188775",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"import json\n",
"import numpy as np\n",
"from sacrebleu.metrics import BLEU\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7c22baec-c987-46cd-af6f-5eb594a2b1e4",
"metadata": {},
"outputs": [],
"source": [
"# 定义参数\n",
"checkpoint_path = \"./step_137000_valid_bleu_25.55_model_weights.bin\" # 假设你要加载第2个epoch中的500步的checkpoint\n",
"data_file = \"./data/translation2019zh/translation2019zh_valid.json\" # 假设使用验证集来测试\n",
"model_checkpoint = \"Helsinki-NLP/opus-mt-zh-en\"\n",
"max_dataset_size = 100\n",
"max_input_length = 128\n",
"max_target_length = 128\n",
"batch_size = 8"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "fff9be9b-57d8-4203-b644-155c76baa1ff",
"metadata": {},
"outputs": [],
"source": [
"class TRANS(Dataset):\n",
" def __init__(self, data_file):\n",
" self.data = self.load_data(data_file)\n",
" \n",
" def load_data(self, data_file):\n",
" Data = {}\n",
" with open(data_file, 'rt', encoding='utf-8') as f:\n",
" for idx, line in enumerate(f):\n",
" if idx >= max_dataset_size:\n",
" break\n",
" sample = json.loads(line.strip())\n",
" Data[idx] = sample\n",
" return Data\n",
" \n",
" def __len__(self):\n",
" return len(self.data)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.data[idx]\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3be38090-c1e5-4fb5-b90d-7f18fe8dc23f",
"metadata": {},
"outputs": [],
"source": [
"def collote_fn(batch_samples):\n",
" batch_inputs, batch_targets = [], []\n",
" for sample in batch_samples:\n",
" batch_inputs.append(sample['chinese'])\n",
" batch_targets.append(sample['english'])\n",
" batch_data = tokenizer(\n",
" batch_inputs, \n",
" padding=True, \n",
" max_length=max_input_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )\n",
" with tokenizer.as_target_tokenizer():\n",
" labels = tokenizer(\n",
" batch_targets, \n",
" padding=True, \n",
" max_length=max_target_length,\n",
" truncation=True, \n",
" return_tensors=\"pt\"\n",
" )[\"input_ids\"]\n",
" batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels)\n",
" end_token_index = torch.where(labels == tokenizer.eos_token_id)[1]\n",
" for idx, end_idx in enumerate(end_token_index):\n",
" labels[idx][end_idx+1:] = -100\n",
" batch_data['labels'] = labels\n",
" return batch_data\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3d5a697a-9b44-4ff3-96df-24a2cb608773",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/25/gdz0c30x3mg1dj9qkwz0ch4w0000gq/T/ipykernel_13528/1590730426.py:6: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
" model.load_state_dict(torch.load(checkpoint_path, map_location=\"cpu\"))\n"
]
}
],
"source": [
"# 加载模型和tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
"model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)\n",
"\n",
"# 加载checkpoint\n",
"model.load_state_dict(torch.load(checkpoint_path, map_location=\"cpu\"))\n",
"model.eval()\n",
"\n",
"# 将模型转移到设备\n",
"device = \"cuda\" if torch.cuda.is_available() else \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
"model = model.to(device)\n",
"\n",
"# 加载测试数据\n",
"test_data = TRANS(data_file)\n",
"test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True, collate_fn=collote_fn)\n",
"\n",
"# 定义BLEU评估函数\n",
"bleu = BLEU()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6446e3f4-b6e2-4f8a-abc4-6fee7224d517",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"def test_model(dataloader, model):\n",
" preds, labels = [], []\n",
"\n",
" model.eval()\n",
" for batch_data in tqdm(dataloader):\n",
" batch_data = batch_data.to(device)\n",
" with torch.no_grad():\n",
" generated_tokens = model.generate(\n",
" batch_data[\"input_ids\"],\n",
" attention_mask=batch_data[\"attention_mask\"],\n",
" max_length=max_target_length,\n",
" ).cpu().numpy()\n",
"\n",
" label_tokens = batch_data[\"labels\"].cpu().numpy()\n",
" \n",
"\n",
" decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)\n",
" label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)\n",
" decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True)\n",
"\n",
" preds += [pred.strip() for pred in decoded_preds]\n",
" labels += [[label.strip()] for label in decoded_labels]\n",
" \n",
" bleu_score = bleu.corpus_score(preds, labels).score\n",
" print(f\"BLEU: {bleu_score:>0.2f}\")\n",
" return bleu_score"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f196f320-7d8b-44d5-9903-5fdb0532e318",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing model...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|███████████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:33<00:00, 2.61s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"BLEU: 12.95\n",
"Test BLEU score: 12.95\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(\"Testing model...\")\n",
"bleu_score = test_model(test_dataloader, model)\n",
"print(f\"Test BLEU score: {bleu_score:0.2f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96e1d097-93d6-482d-8836-167974de98bc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
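
As a sanity check on the metric itself, sacrebleu's corpus_score can be exercised on a toy pair; in its canonical layout each inner list is one reference stream aligned with the hypotheses (this snippet is illustrative, not part of the commit):

from sacrebleu.metrics import BLEU

bleu = BLEU()
hyps = ["The cat sat on the mat.", "It is raining."]
refs = [["The cat is sitting on the mat.", "It rains."]]  # one stream, one reference per hypothesis
print(bleu.corpus_score(hyps, refs).score)  # corpus-level BLEU in [0, 100]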