{ "cells": [ { "cell_type": "markdown", "id": "0d107178", "metadata": {}, "source": [ "# Extract the Dictionary & Embedding\n", "\n", "Our model uses the dictionary and embedding layers from `Phi-3-mini-4k-instruct`, a pre-trained transformer model developed by Microsoft. Although our model architecture is not a transformer, we can still benefit from its tokenizer and embedding layers." ] }, { "cell_type": "code", "execution_count": 1, "id": "222bdd54-c115-4845-b4e6-c63e4f9ac6b4", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModel\n", "import torch" ] }, { "cell_type": "code", "execution_count": 2, "id": "13f1ed38-ad39-4e6a-8af7-65ace2d14f8d", "metadata": {}, "outputs": [], "source": [ "model_name=\"microsoft/Phi-3-mini-4k-instruct\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "c1de25fc-e90a-425b-8520-3a57fa534b94", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1aeb02c7c8084b1eb1b8e3178882fd60", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00 dimension-reduced embedding\n", "token_id_to_reduced_embedding = {token_id: reduced_embeddings[token_id] for token_id in range(len(vocab))}\n", "\n", "torch.save(token_id_to_reduced_embedding, \"token_id_to_reduced_embedding.pt\")\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "e28a612c-7960-42e6-aa36-abd0732f404e", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "# Create dict of token to {token_id, reduced_embedding}\n", "token_to_id = {}\n", "for token, token_id in vocab.items():\n", " token_to_id[token] = token_id\n", "\n", "# Save as JSON\n", "with open(\"token_to_id.json\", \"w\") as f:\n", " json.dump(token_to_id, f)" ] }, { "cell_type": "code", 
"execution_count": 10, "id": "7a466c54-7c55-4e84-957e-454ae35896ac", "metadata": {}, "outputs": [], "source": [ "import struct\n", "\n", "# Serialize every token's reduced embedding to a flat binary file.\n", "# Record layout per token: uint16 token id, then EMBEDDING_DIM float32 values.\n", "# '<' forces little-endian with standard sizes/no padding, so the file is\n", "# byte-identical across platforms (native 'H'/'f' depend on the host ABI).\n", "EMBEDDING_DIM = 128  # must match the dimensionality of reduced_embeddings\n", "with open(\"token_embeddings.bin\", \"wb\") as f:\n", "    for token_id in range(len(vocab)):\n", "        # Write token id (2 bytes, unsigned little-endian; struct raises if id > 65535)\n", "        f.write(struct.pack('<H', token_id))\n", "        # Write embedding vector (EMBEDDING_DIM little-endian float32 numbers)\n", "        f.write(struct.pack(f'<{EMBEDDING_DIM}f', *reduced_embeddings[token_id]))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 5 }