sparkastML/intention-classify/extract.ipynb

{
"cells": [
{
"cell_type": "markdown",
"id": "0d107178",
"metadata": {},
"source": [
"# Extract the Dictionary & Embedding\n",
"\n",
"Our model uses the dictionary and embedding layers from `Phi-3-mini-4k-instruct`, a pre-trained transformer model developed by Microsoft. Although our model architecture is not a transformer, we can still benefit from its tokenizer and embedding layers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "222bdd54-c115-4845-b4e6-c63e4f9ac6b4",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "13f1ed38-ad39-4e6a-8af7-65ace2d14f8d",
"metadata": {},
"outputs": [],
"source": [
"model_name=\"Qwen/Qwen2.5-3B\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c1de25fc-e90a-425b-8520-3a57fa534b94",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "38137fc55ad24a9785ecbe1978bbc605",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load models\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModel.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b0ba45f-5de9-465d-aa45-57ae45c5fb36",
"metadata": {},
"outputs": [],
"source": [
"embedding_layer = model.get_input_embeddings()\n",
"vocab = tokenizer.get_vocab()"
]
},
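{
"cell_type": "markdown",
"id": "3c0a1f5e",
"metadata": {},
"source": [
"A quick diagnostic, added here as a sketch rather than part of the original run: the tokenizer vocab and the embedding matrix need not have the same size, because some models pad the embedding rows for hardware efficiency. Comparing the two clarifies what iterating over `len(vocab)` below actually covers."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d4b2e6a",
"metadata": {},
"outputs": [],
"source": [
"# Rows beyond len(vocab), if any, are padding and carry no token.\n",
"print(len(vocab), tuple(embedding_layer.weight.shape))"
]
},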
{
"cell_type": "code",
"execution_count": 8,
"id": "21214ff4-018d-4230-81b9-331ebb42773b",
"metadata": {},
"outputs": [],
"source": [
"def bytes_to_unicode():\n",
" \"\"\"\n",
" Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control\n",
" characters the bpe code barfs on.\n",
"\n",
" The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab\n",
" if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for\n",
" decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup\n",
" tables between utf-8 bytes and unicode strings.\n",
" \"\"\"\n",
" bs = (\n",
" list(range(ord(\"!\"), ord(\"~\") + 1)) + list(range(ord(\"¡\"), ord(\"¬\") + 1)) + list(range(ord(\"®\"), ord(\"ÿ\") + 1))\n",
" )\n",
" cs = bs[:]\n",
" n = 0\n",
" for b in range(2**8):\n",
" if b not in bs:\n",
" bs.append(b)\n",
" cs.append(2**8 + n)\n",
" n += 1\n",
" cs = [chr(n) for n in cs]\n",
" return dict(zip(bs, cs))"
]
},
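{
"cell_type": "markdown",
"id": "5e7c9d21",
"metadata": {},
"source": [
"A small sanity check (a sketch, not from the original run): the table should cover all 256 byte values exactly once, which is what makes the reverse lookup built below lossless."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a3f6c84",
"metadata": {},
"outputs": [],
"source": [
"# Every byte value maps to a distinct unicode character, so the\n",
"# mapping can be inverted without collisions.\n",
"_m = bytes_to_unicode()\n",
"assert len(_m) == 256 and len(set(_m.values())) == 256"
]
},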
{
"cell_type": "code",
"execution_count": 9,
"id": "cbc23d2d-985b-443a-83ee-c2286046ad5e",
"metadata": {},
"outputs": [],
"source": [
"btu=bytes_to_unicode()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4a99fa07-4922-4d8d-9c28-2275bf9cb8df",
"metadata": {},
"outputs": [],
"source": [
"utb = reversed_dict = {value: key for key, value in btu.items()}"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "cb218ea7-50c7-4bb8-aa7f-0ee85da76147",
"metadata": {},
"outputs": [],
"source": [
"result = tokenizer.convert_ids_to_tokens([104307])[0]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2dcb332a-cba9-4a14-9486-4e1ff6bd3dba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"å\n",
"229\n",
"¤\n",
"164\n",
"©\n",
"169\n",
"æ\n",
"230\n",
"°\n",
"176\n",
"Ķ\n",
"148\n"
]
}
],
"source": [
"decoded=b\"\"\n",
"for chr in result:\n",
" print(chr)\n",
" if chr in utb:\n",
" print(utb[chr])\n",
" decoded+=bytes([utb[chr]])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "b1bf1289-2cab-4a97-ad21-b2d24de6d688",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'天气'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"decoded.decode(\"utf-8\", errors='replace')"
]
},
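{
"cell_type": "markdown",
"id": "b4e8f0a2",
"metadata": {},
"source": [
"The manual decoding loop above can be folded into a reusable helper. A minimal sketch assuming the `utb` table and `tokenizer` defined earlier; `decode_token` is a name introduced here for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6d1a3e7",
"metadata": {},
"outputs": [],
"source": [
"def decode_token(token: str) -> str:\n",
"    \"\"\"Map a byte-level BPE token string back to readable text.\"\"\"\n",
"    # Each character of the token stands for one raw byte; translate it\n",
"    # through the reversed table and decode the byte string as UTF-8.\n",
"    raw = bytes(utb[ch] for ch in token if ch in utb)\n",
"    return raw.decode(\"utf-8\", errors=\"replace\")\n",
"\n",
"decode_token(tokenizer.convert_ids_to_tokens([104307])[0])"
]
},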
{
"cell_type": "code",
"execution_count": 5,
"id": "b2f7e08d-c578-4b0c-ad75-cdb545b5433f",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "686e1109-e210-45a3-8c58-b8e92bbe85ff",
"metadata": {},
"outputs": [],
"source": [
"DIMENSIONS = 96"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9c5d7235-f690-4b34-80a4-9b1db1c19100",
"metadata": {},
"outputs": [],
"source": [
"embeddings = []\n",
"for token_id in range(len(vocab)):\n",
" embedding_vector = embedding_layer(torch.tensor([token_id])).detach().numpy()\n",
" embeddings.append(embedding_vector)\n",
"\n",
"# Convert vectors to np arrays\n",
"embeddings = np.vstack(embeddings)\n",
"\n",
"# Use PCA to decrease dimension\n",
"pca = PCA(n_components=DIMENSIONS)\n",
"reduced_embeddings = pca.fit_transform(embeddings)"
]
},
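{
"cell_type": "markdown",
"id": "d2f5b8c0",
"metadata": {},
"source": [
"A diagnostic worth running after the fit (a sketch; the exact figure depends on the model): how much of the original embedding variance the 96 principal components retain."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9a0c4d6",
"metadata": {},
"outputs": [],
"source": [
"# Fraction of the original variance kept by the reduced representation.\n",
"print(f\"Explained variance retained: {pca.explained_variance_ratio_.sum():.2%}\")"
]
},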
{
"cell_type": "markdown",
"id": "6b834f53-7f39-41ab-9d18-71978e988b30",
"metadata": {},
"source": [
"## Save Model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f231dfdd-5f4f-4d0f-ab5d-7a1365535713",
"metadata": {},
"outputs": [],
"source": [
"# Create dict of tokenID -> dimension-reduced embedding\n",
"token_id_to_reduced_embedding = {token_id: reduced_embeddings[token_id] for token_id in range(len(vocab))}\n",
"\n",
"torch.save(token_id_to_reduced_embedding, \"token_id_to_reduced_embedding.pt\")\n"
]
},
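{
"cell_type": "markdown",
"id": "f1b3d5e9",
"metadata": {},
"source": [
"A round-trip check (a sketch, not in the original run): reload the saved mapping and compare one entry. Recent PyTorch releases may require `weights_only=False` here, since the dict holds pickled NumPy arrays."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a2c4e6f",
"metadata": {},
"outputs": [],
"source": [
"# Reload and spot-check one entry against the in-memory array.\n",
"restored = torch.load(\"token_id_to_reduced_embedding.pt\", weights_only=False)\n",
"assert np.allclose(restored[0], reduced_embeddings[0])"
]
},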
{
"cell_type": "code",
"execution_count": 9,
"id": "e28a612c-7960-42e6-aa36-abd0732f404e",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Create dict of token to {token_id, reduced_embedding}\n",
"token_to_id = {}\n",
"for token, token_id in vocab.items():\n",
" token_to_id[token] = token_id\n",
"\n",
"# Save as JSON\n",
"with open(\"token_to_id.json\", \"w\") as f:\n",
" json.dump(token_to_id, f)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7a466c54-7c55-4e84-957e-454ae35896ac",
"metadata": {},
"outputs": [],
"source": [
"import struct\n",
"with open(\"token_embeddings.bin\", \"wb\") as f:\n",
" for token_id in range(len(vocab)):\n",
" # 将向量转换为半精度浮点数并保存\n",
" f.write(struct.pack('96e', *reduced_embeddings[token_id].astype(np.float16)))\n"
]
},
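{
"cell_type": "markdown",
"id": "1b3d5f70",
"metadata": {},
"source": [
"To confirm the fixed-size record layout (96 half-precision floats, 2 bytes each, per token), a read-back sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c4e6a81",
"metadata": {},
"outputs": [],
"source": [
"# Read the first record back and compare it with the float16-rounded\n",
"# original vector; each record is DIMENSIONS * 2 bytes long.\n",
"with open(\"token_embeddings.bin\", \"rb\") as f:\n",
"    first = np.array(struct.unpack(f\"{DIMENSIONS}e\", f.read(DIMENSIONS * 2)), dtype=np.float16)\n",
"assert np.array_equal(first, reduced_embeddings[0].astype(np.float16))"
]
}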
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}