sparkastML/intention-classify/extract.ipynb

{
"cells": [
{
"cell_type": "markdown",
"id": "0d107178",
"metadata": {},
"source": [
"# Extract the Dictionary & Embedding\n",
"\n",
"Our model uses the dictionary and embedding layers from `Phi-3-mini-4k-instruct`, a pre-trained transformer model developed by Microsoft. Although our model architecture is not a transformer, we can still benefit from its tokenizer and embedding layers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "222bdd54-c115-4845-b4e6-c63e4f9ac6b4",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"import torch"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "13f1ed38-ad39-4e6a-8af7-65ace2d14f8d",
"metadata": {},
"outputs": [],
"source": [
"model_name=\"Qwen/Qwen2.5-3B\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c1de25fc-e90a-425b-8520-3a57fa534b94",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "38137fc55ad24a9785ecbe1978bbc605",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load models\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"model = AutoModel.from_pretrained(model_name)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b0ba45f-5de9-465d-aa45-57ae45c5fb36",
"metadata": {},
"outputs": [],
"source": [
"embedding_layer = model.get_input_embeddings()\n",
"vocab = tokenizer.get_vocab()"
]
},
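{
"cell_type": "markdown",
"id": "3c0a1f5e",
"metadata": {},
"source": [
"A quick diagnostic, added here as a sketch rather than part of the original run: the tokenizer vocab and the embedding matrix need not have the same size, because some models pad the embedding rows for hardware efficiency. Comparing the two clarifies what iterating over `len(vocab)` below actually covers."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d4b2e6a",
"metadata": {},
"outputs": [],
"source": [
"# Rows beyond len(vocab), if any, are padding and carry no token.\n",
"print(len(vocab), tuple(embedding_layer.weight.shape))"
]
},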
{
"cell_type": "code",
"execution_count": 8,
"id": "21214ff4-018d-4230-81b9-331ebb42773b",
"metadata": {},
"outputs": [],
"source": [
"def bytes_to_unicode():\n",
" \"\"\"\n",
" Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control\n",
" characters the bpe code barfs on.\n",
"\n",
" The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab\n",
" if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for\n",
" decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup\n",
" tables between utf-8 bytes and unicode strings.\n",
" \"\"\"\n",
" bs = (\n",
" list(range(ord(\"!\"), ord(\"~\") + 1)) + list(range(ord(\"¡\"), ord(\"¬\") + 1)) + list(range(ord(\"®\"), ord(\"ÿ\") + 1))\n",
" )\n",
" cs = bs[:]\n",
" n = 0\n",
" for b in range(2**8):\n",
" if b not in bs:\n",
" bs.append(b)\n",
" cs.append(2**8 + n)\n",
" n += 1\n",
" cs = [chr(n) for n in cs]\n",
" return dict(zip(bs, cs))"
]
},
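{
"cell_type": "markdown",
"id": "5e7c9d21",
"metadata": {},
"source": [
"A small sanity check (a sketch, not from the original run): the table should cover all 256 byte values exactly once, which is what makes the reverse lookup built below lossless."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a3f6c84",
"metadata": {},
"outputs": [],
"source": [
"# Every byte value maps to a distinct unicode character, so the\n",
"# mapping can be inverted without collisions.\n",
"_m = bytes_to_unicode()\n",
"assert len(_m) == 256 and len(set(_m.values())) == 256"
]
},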
{
"cell_type": "code",
"execution_count": 9,
"id": "cbc23d2d-985b-443a-83ee-c2286046ad5e",
"metadata": {},
"outputs": [],
"source": [
"btu=bytes_to_unicode()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4a99fa07-4922-4d8d-9c28-2275bf9cb8df",
"metadata": {},
"outputs": [],
"source": [
"utb = reversed_dict = {value: key for key, value in btu.items()}"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "cb218ea7-50c7-4bb8-aa7f-0ee85da76147",
"metadata": {},
"outputs": [],
"source": [
"result = tokenizer.convert_ids_to_tokens([104307])[0]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2dcb332a-cba9-4a14-9486-4e1ff6bd3dba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"å\n",
"229\n",
"¤\n",
"164\n",
"©\n",
"169\n",
"æ\n",
"230\n",
"°\n",
"176\n",
"Ķ\n",
"148\n"
]
}
],
"source": [
"decoded=b\"\"\n",
"for chr in result:\n",
" print(chr)\n",
" if chr in utb:\n",
" print(utb[chr])\n",
" decoded+=bytes([utb[chr]])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "b1bf1289-2cab-4a97-ad21-b2d24de6d688",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'天气'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"decoded.decode(\"utf-8\", errors='replace')"
]
},
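{
"cell_type": "markdown",
"id": "b4e8f0a2",
"metadata": {},
"source": [
"The manual decoding loop above can be folded into a reusable helper. A minimal sketch assuming the `utb` table and `tokenizer` defined earlier; `decode_token` is a name introduced here for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6d1a3e7",
"metadata": {},
"outputs": [],
"source": [
"def decode_token(token: str) -> str:\n",
"    \"\"\"Map a byte-level BPE token string back to readable text.\"\"\"\n",
"    # Each character of the token stands for one raw byte; translate it\n",
"    # through the reversed table and decode the byte string as UTF-8.\n",
"    raw = bytes(utb[ch] for ch in token if ch in utb)\n",
"    return raw.decode(\"utf-8\", errors=\"replace\")\n",
"\n",
"decode_token(tokenizer.convert_ids_to_tokens([104307])[0])"
]
},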
{
"cell_type": "code",
"execution_count": 5,
"id": "b2f7e08d-c578-4b0c-ad75-cdb545b5433f",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from transformers import AutoTokenizer, AutoModel\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "686e1109-e210-45a3-8c58-b8e92bbe85ff",
"metadata": {},
"outputs": [],
"source": [
"DIMENSIONS = 96"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9c5d7235-f690-4b34-80a4-9b1db1c19100",
"metadata": {},
"outputs": [],
"source": [
"embeddings = []\n",
"for token_id in range(len(vocab)):\n",
" embedding_vector = embedding_layer(torch.tensor([token_id])).detach().numpy()\n",
" embeddings.append(embedding_vector)\n",
"\n",
"# Convert vectors to np arrays\n",
"embeddings = np.vstack(embeddings)\n",
"\n",
"# Use PCA to decrease dimension\n",
"pca = PCA(n_components=DIMENSIONS)\n",
"reduced_embeddings = pca.fit_transform(embeddings)"
]
},
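{
"cell_type": "markdown",
"id": "d2f5b8c0",
"metadata": {},
"source": [
"A diagnostic worth running after the fit (a sketch; the exact figure depends on the model): how much of the original embedding variance the 96 principal components retain."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9a0c4d6",
"metadata": {},
"outputs": [],
"source": [
"# Fraction of the original variance kept by the reduced representation.\n",
"print(f\"Explained variance retained: {pca.explained_variance_ratio_.sum():.2%}\")"
]
},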
{
"cell_type": "markdown",
"id": "6b834f53-7f39-41ab-9d18-71978e988b30",
"metadata": {},
"source": [
"## Save Model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f231dfdd-5f4f-4d0f-ab5d-7a1365535713",
"metadata": {},
"outputs": [],
"source": [
"# Create dict of tokenID -> dimension-reduced embedding\n",
"token_id_to_reduced_embedding = {token_id: reduced_embeddings[token_id] for token_id in range(len(vocab))}\n",
"\n",
"torch.save(token_id_to_reduced_embedding, \"token_id_to_reduced_embedding.pt\")\n"
]
},
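{
"cell_type": "markdown",
"id": "f1b3d5e9",
"metadata": {},
"source": [
"A round-trip check (a sketch, not in the original run): reload the saved mapping and compare one entry. Recent PyTorch releases may require `weights_only=False` here, since the dict holds pickled NumPy arrays."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a2c4e6f",
"metadata": {},
"outputs": [],
"source": [
"# Reload and spot-check one entry against the in-memory array.\n",
"restored = torch.load(\"token_id_to_reduced_embedding.pt\", weights_only=False)\n",
"assert np.allclose(restored[0], reduced_embeddings[0])"
]
},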
{
"cell_type": "code",
"execution_count": 9,
"id": "e28a612c-7960-42e6-aa36-abd0732f404e",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Create dict of token to {token_id, reduced_embedding}\n",
"token_to_id = {}\n",
"for token, token_id in vocab.items():\n",
" token_to_id[token] = token_id\n",
"\n",
"# Save as JSON\n",
"with open(\"token_to_id.json\", \"w\") as f:\n",
" json.dump(token_to_id, f)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7a466c54-7c55-4e84-957e-454ae35896ac",
"metadata": {},
"outputs": [],
"source": [
"import struct\n",
"with open(\"token_embeddings.bin\", \"wb\") as f:\n",
" for token_id in range(len(vocab)):\n",
" # 将向量转换为半精度浮点数并保存\n",
" f.write(struct.pack('96e', *reduced_embeddings[token_id].astype(np.float16)))\n"
]
},
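{
"cell_type": "markdown",
"id": "1b3d5f70",
"metadata": {},
"source": [
"To confirm the fixed-size record layout (96 half-precision floats, 2 bytes each, per token), a read-back sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c4e6a81",
"metadata": {},
"outputs": [],
"source": [
"# Read the first record back and compare it with the float16-rounded\n",
"# original vector; each record is DIMENSIONS * 2 bytes long.\n",
"with open(\"token_embeddings.bin\", \"rb\") as f:\n",
"    first = np.array(struct.unpack(f\"{DIMENSIONS}e\", f.read(DIMENSIONS * 2)), dtype=np.float16)\n",
"assert np.array_equal(first, reduced_embeddings[0].astype(np.float16))"
]
}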
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}