sparkastML/intention-classify/extract.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "0d107178",
"metadata": {},
"source": [
"# Extract the Dictionary & Embedding\n",
"\n",
"Our model uses the dictionary and embedding layers from `Phi-3-mini-4k-instruct`, a pre-trained transformer model developed by Microsoft. Although our model architecture is not a transformer, we can still benefit from its tokenizer and embedding layers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "222bdd54-c115-4845-b4e6-c63e4f9ac6b4",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "13f1ed38-ad39-4e6a-8af7-65ace2d14f8d",
"metadata": {},
"outputs": [],
"source": [
"model_name=\"microsoft/Phi-3-mini-4k-instruct\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c1de25fc-e90a-425b-8520-3a57fa534b94",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1aeb02c7c8084b1eb1b8e3178882fd60",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load models\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModel.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b0ba45f-5de9-465d-aa45-57ae45c5fb36",
"metadata": {},
"outputs": [],
"source": [
"embedding_layer = model.get_input_embeddings()\n",
"vocab = tokenizer.get_vocab()"
]
},
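{
"cell_type": "markdown",
"id": "3f2a9c1e",
"metadata": {},
"source": [
"As a quick optional sanity check, we can compare the shape of the extracted embedding matrix with the vocabulary size. The number of rows may exceed `len(vocab)`, since some checkpoints pad the embedding matrix."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d8b7e4a",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: compare the embedding matrix shape with the vocabulary size\n",
"print(\"Embedding matrix shape:\", tuple(embedding_layer.weight.shape))\n",
"print(\"Vocabulary size:\", len(vocab))"
]
},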
{
"cell_type": "code",
"execution_count": 5,
"id": "b2f7e08d-c578-4b0c-ad75-cdb545b5433f",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "686e1109-e210-45a3-8c58-b8e92bbe85ff",
"metadata": {},
"outputs": [],
"source": [
"DIMENSIONS = 128"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9c5d7235-f690-4b34-80a4-9b1db1c19100",
"metadata": {},
"outputs": [],
"source": [
"embeddings = []\n",
"for token_id in range(len(vocab)):\n",
" embedding_vector = embedding_layer(torch.tensor([token_id])).detach().numpy()\n",
" embeddings.append(embedding_vector)\n",
"\n",
"# Convert vectors to np arrays\n",
"embeddings = np.vstack(embeddings)\n",
"\n",
"# Use PCA to decrease dimension\n",
"pca = PCA(n_components=DIMENSIONS)\n",
"reduced_embeddings = pca.fit_transform(embeddings)"
]
},
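{
"cell_type": "markdown",
"id": "9e4c2b7d",
"metadata": {},
"source": [
"Optionally, we can check how much of the embeddings' variance the 128 retained components preserve, using the `explained_variance_ratio_` attribute of the fitted PCA."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c1f8a3b",
"metadata": {},
"outputs": [],
"source": [
"# Optional: total variance retained by the DIMENSIONS principal components\n",
"retained = pca.explained_variance_ratio_.sum()\n",
"print(f\"Variance retained by {DIMENSIONS} components: {retained:.4f}\")"
]
},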
{
"cell_type": "markdown",
"id": "6b834f53-7f39-41ab-9d18-71978e988b30",
"metadata": {},
"source": [
"## Save Model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f231dfdd-5f4f-4d0f-ab5d-7a1365535713",
"metadata": {},
"outputs": [],
"source": [
"# Create dict of tokenID -> dimension-reduced embedding\n",
"token_id_to_reduced_embedding = {token_id: reduced_embeddings[token_id] for token_id in range(len(vocab))}\n",
"\n",
"torch.save(token_id_to_reduced_embedding, \"token_id_to_reduced_embedding.pt\")\n"
]
},
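{
"cell_type": "markdown",
"id": "2b6d9f4e",
"metadata": {},
"source": [
"To verify the saved mapping, it can be loaded back with `torch.load`. (A minimal sketch; on recent PyTorch versions you may need to pass `weights_only=False`, since the file stores a plain Python dict of NumPy arrays.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a5e3c7f",
"metadata": {},
"outputs": [],
"source": [
"# Optional: reload the mapping and spot-check one entry\n",
"restored = torch.load(\"token_id_to_reduced_embedding.pt\")\n",
"assert restored[0].shape == (DIMENSIONS,)\n",
"print(f\"Restored {len(restored)} entries\")"
]
},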
{
"cell_type": "code",
"execution_count": 9,
"id": "e28a612c-7960-42e6-aa36-abd0732f404e",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Create dict of token to {token_id, reduced_embedding}\n",
"token_to_id = {}\n",
"for token, token_id in vocab.items():\n",
" token_to_id[token] = token_id\n",
"\n",
"# Save as JSON\n",
"with open(\"token_to_id.json\", \"w\") as f:\n",
" json.dump(token_to_id, f)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7a466c54-7c55-4e84-957e-454ae35896ac",
"metadata": {},
"outputs": [],
"source": [
"import struct\n",
"with open(\"token_embeddings.bin\", \"wb\") as f:\n",
" for token_id in range(len(vocab)):\n",
" # Write token id (2 bytes)\n",
" f.write(struct.pack('H', token_id))\n",
" # Write embedding vector (128 float numbers)\n",
" f.write(struct.pack('128f', *reduced_embeddings[token_id]))"
]
}
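,
{
"cell_type": "markdown",
"id": "4d7b2e9c",
"metadata": {},
"source": [
"For reference, below is a minimal sketch of reading the binary file back, assuming the record layout written above: a little-endian `uint16` token id followed by 128 `float32` values, i.e. 2 + 4 × 128 = 514 bytes per record. The reader is illustrative and not part of the pipeline."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6f3a8d1b",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative reader for token_embeddings.bin (assumes the layout written above)\n",
"import struct\n",
"\n",
"record = struct.Struct(f'<H{DIMENSIONS}f')  # 2 + 4 * DIMENSIONS bytes per record\n",
"with open(\"token_embeddings.bin\", \"rb\") as f:\n",
"    data = f.read()\n",
"\n",
"parsed = {fields[0]: fields[1:] for fields in record.iter_unpack(data)}\n",
"print(f\"Parsed {len(parsed)} embeddings\")"
]
}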
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}