{
"cells": [
{
"cell_type": "markdown",
"id": "0d107178",
"metadata": {},
"source": [
"# Extract the Dictionary & Embedding\n",
"\n",
"Our model reuses the dictionary and embedding layer from `Qwen/Qwen2.5-3B`, a pre-trained transformer model developed by Alibaba Cloud. Although our model architecture is not a transformer, it can still benefit from the pre-trained tokenizer and embedding layer."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "222bdd54-c115-4845-b4e6-c63e4f9ac6b4",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "13f1ed38-ad39-4e6a-8af7-65ace2d14f8d",
"metadata": {},
"outputs": [],
"source": [
"model_name = \"Qwen/Qwen2.5-3B\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c1de25fc-e90a-425b-8520-3a57fa534b94",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "38137fc55ad24a9785ecbe1978bbc605",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load models\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModel.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b0ba45f-5de9-465d-aa45-57ae45c5fb36",
"metadata": {},
"outputs": [],
"source": [
"# Grab the input embedding matrix and the token -> id vocabulary\n",
"embedding_layer = model.get_input_embeddings()\n",
"vocab = tokenizer.get_vocab()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "21214ff4-018d-4230-81b9-331ebb42773b",
"metadata": {},
"outputs": [],
"source": [
"def bytes_to_unicode():\n",
"    \"\"\"\n",
"    Returns a mapping between utf-8 bytes and unicode strings. We specifically avoid mapping to the whitespace/control\n",
"    characters that the bpe code barfs on.\n",
"\n",
"    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab\n",
"    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for\n",
"    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup\n",
"    tables between utf-8 bytes and unicode strings.\n",
"    \"\"\"\n",
"    bs = (\n",
"        list(range(ord(\"!\"), ord(\"~\") + 1)) + list(range(ord(\"¡\"), ord(\"¬\") + 1)) + list(range(ord(\"®\"), ord(\"ÿ\") + 1))\n",
"    )\n",
"    cs = bs[:]\n",
"    n = 0\n",
"    for b in range(2**8):\n",
"        if b not in bs:\n",
"            bs.append(b)\n",
"            cs.append(2**8 + n)\n",
"            n += 1\n",
"    cs = [chr(n) for n in cs]\n",
"    return dict(zip(bs, cs))"
]
},
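{
"cell_type": "markdown",
"id": "3fa1b2c4",
"metadata": {},
"source": [
"A quick sanity check of the mapping (an illustrative sketch, not part of the original run): printable bytes map to themselves, while bytes such as the space (0x20) are shifted into a printable Unicode range, which is why byte-level BPE tokens render spaces as `Ġ`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e8d7c6b",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check (assumed behavior, not from the original run):\n",
"# printable ASCII maps to itself; space (0x20) is shifted by 256 to 'Ġ' (U+0120).\n",
"mapping = bytes_to_unicode()\n",
"print(mapping[ord(\"A\")])  # 'A'\n",
"print(mapping[ord(\" \")])  # 'Ġ'"
]
},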
{
"cell_type": "code",
"execution_count": 9,
"id": "cbc23d2d-985b-443a-83ee-c2286046ad5e",
"metadata": {},
"outputs": [],
"source": [
"# Mapping: byte value -> printable unicode character\n",
"btu = bytes_to_unicode()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4a99fa07-4922-4d8d-9c28-2275bf9cb8df",
"metadata": {},
"outputs": [],
"source": [
"# Inverse mapping: printable unicode character -> byte value\n",
"utb = {value: key for key, value in btu.items()}"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "cb218ea7-50c7-4bb8-aa7f-0ee85da76147",
"metadata": {},
"outputs": [],
"source": [
"# Look up the byte-level token string for an example token id\n",
"result = tokenizer.convert_ids_to_tokens([104307])[0]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2dcb332a-cba9-4a14-9486-4e1ff6bd3dba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"å\n",
"229\n",
"¤\n",
"164\n",
"©\n",
"169\n",
"æ\n",
"230\n",
"°\n",
"176\n",
"Ķ\n",
"148\n"
]
}
],
"source": [
"# Map each character of the token string back to its original byte\n",
"decoded = b\"\"\n",
"for ch in result:\n",
"    print(ch)\n",
"    if ch in utb:\n",
"        print(utb[ch])\n",
"        decoded += bytes([utb[ch]])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "b1bf1289-2cab-4a97-ad21-b2d24de6d688",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'天气'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"decoded.decode(\"utf-8\", errors='replace')"
]
},
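{
"cell_type": "markdown",
"id": "5a4b3c2d",
"metadata": {},
"source": [
"As a cross-check (a minimal sketch, not part of the original run), the manual byte-level round trip above should agree with the tokenizer's built-in decoding of the same token id."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c6d5e4f",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative cross-check: the tokenizer's own decoder should yield the same text\n",
"print(tokenizer.decode([104307]))"
]
},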
{
"cell_type": "code",
"execution_count": 5,
"id": "b2f7e08d-c578-4b0c-ad75-cdb545b5433f",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "686e1109-e210-45a3-8c58-b8e92bbe85ff",
"metadata": {},
"outputs": [],
"source": [
"# Target dimensionality after PCA reduction\n",
"DIMENSIONS = 96"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9c5d7235-f690-4b34-80a4-9b1db1c19100",
"metadata": {},
"outputs": [],
"source": [
"# Extract every token's embedding vector from the embedding layer\n",
"embeddings = []\n",
"for token_id in range(len(vocab)):\n",
"    embedding_vector = embedding_layer(torch.tensor([token_id])).detach().numpy()\n",
"    embeddings.append(embedding_vector)\n",
"\n",
"# Stack the vectors into a single numpy array\n",
"embeddings = np.vstack(embeddings)\n",
"\n",
"# Reduce dimensionality with PCA\n",
"pca = PCA(n_components=DIMENSIONS)\n",
"reduced_embeddings = pca.fit_transform(embeddings)"
]
},
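{
"cell_type": "markdown",
"id": "1f2e3d4c",
"metadata": {},
"source": [
"A sanity check worth running here (an illustrative sketch, not part of the original run): `explained_variance_ratio_` reports how much of the embeddings' variance the 96 PCA components retain."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b9a0c1d",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check: fraction of total variance kept by the 96 components\n",
"print(pca.explained_variance_ratio_.sum())"
]
},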
{
"cell_type": "markdown",
"id": "6b834f53-7f39-41ab-9d18-71978e988b30",
"metadata": {},
"source": [
"## Save Model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f231dfdd-5f4f-4d0f-ab5d-7a1365535713",
"metadata": {},
"outputs": [],
"source": [
"# Create dict of tokenID -> dimension-reduced embedding\n",
"token_id_to_reduced_embedding = {token_id: reduced_embeddings[token_id] for token_id in range(len(vocab))}\n",
"\n",
"torch.save(token_id_to_reduced_embedding, \"token_id_to_reduced_embedding.pt\")\n"
]
},
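{
"cell_type": "markdown",
"id": "2d3c4b5a",
"metadata": {},
"source": [
"A quick round-trip check (an illustrative sketch, not part of the original run): reload the saved dictionary and compare one vector. `weights_only=False` is needed on newer PyTorch versions, where `torch.load` refuses pickled numpy objects by default."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e7f8a9b",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative round-trip check of the saved dictionary\n",
"loaded = torch.load(\"token_id_to_reduced_embedding.pt\", weights_only=False)\n",
"print(np.allclose(loaded[0], reduced_embeddings[0]))"
]
},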
{
"cell_type": "code",
"execution_count": 9,
"id": "e28a612c-7960-42e6-aa36-abd0732f404e",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Create dict of token -> token_id\n",
"token_to_id = {}\n",
"for token, token_id in vocab.items():\n",
"    token_to_id[token] = token_id\n",
"\n",
"# Save as JSON\n",
"with open(\"token_to_id.json\", \"w\") as f:\n",
"    json.dump(token_to_id, f)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7a466c54-7c55-4e84-957e-454ae35896ac",
"metadata": {},
"outputs": [],
"source": [
"import struct\n",
"with open(\"token_embeddings.bin\", \"wb\") as f:\n",
"    for token_id in range(len(vocab)):\n",
"        # Convert each vector to half-precision floats and write it out\n",
"        f.write(struct.pack('96e', *reduced_embeddings[token_id].astype(np.float16)))\n"
]
},
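{
"cell_type": "markdown",
"id": "4c5d6e7f",
"metadata": {},
"source": [
"To read the binary file back (an illustrative sketch, assuming the layout written above), note that each token occupies 96 consecutive float16 values, i.e. 192 bytes, so token `i` starts at byte offset `i * 96 * 2`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a1b2c3d",
"metadata": {},
"outputs": [],
"source": [
"# Illustrative read-back: fetch the embedding of one token id from the binary file\n",
"import struct\n",
"\n",
"token_id = 0\n",
"with open(\"token_embeddings.bin\", \"rb\") as f:\n",
"    f.seek(token_id * DIMENSIONS * 2)  # 2 bytes per float16\n",
"    vec = struct.unpack('96e', f.read(DIMENSIONS * 2))\n",
"print(vec[:4])"
]
},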
{
"cell_type": "code",
"execution_count": null,
"id": "511a7cc4-1b8c-468c-b2a0-16dc6d74ab44",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|