{ "cells": [ { "cell_type": "markdown", "id": "0d107178", "metadata": {}, "source": [ "# Extract the Dictionary & Embedding\n", "\n", "Our model uses the dictionary and embedding layers from `Phi-3-mini-4k-instruct`, a pre-trained transformer model developed by Microsoft. Although our model architecture is not a transformer, we can still benefit from its tokenizer and embedding layers." ] }, { "cell_type": "code", "execution_count": 1, "id": "222bdd54-c115-4845-b4e6-c63e4f9ac6b4", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, AutoModel\n", "import torch" ] }, { "cell_type": "code", "execution_count": 2, "id": "13f1ed38-ad39-4e6a-8af7-65ace2d14f8d", "metadata": {}, "outputs": [], "source": [ "model_name=\"Qwen/Qwen2.5-3B\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "c1de25fc-e90a-425b-8520-3a57fa534b94", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "38137fc55ad24a9785ecbe1978bbc605", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00 dimension-reduced embedding\n", "token_id_to_reduced_embedding = {token_id: reduced_embeddings[token_id] for token_id in range(len(vocab))}\n", "\n", "torch.save(token_id_to_reduced_embedding, \"token_id_to_reduced_embedding.pt\")\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "e28a612c-7960-42e6-aa36-abd0732f404e", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "# Create dict of token to {token_id, reduced_embedding}\n", "token_to_id = {}\n", "for token, token_id in vocab.items():\n", " token_to_id[token] = token_id\n", "\n", "# Save as JSON\n", "with open(\"token_to_id.json\", \"w\") as f:\n", " json.dump(token_to_id, f)" ] }, { "cell_type": "code", "execution_count": 10, "id": "7a466c54-7c55-4e84-957e-454ae35896ac", "metadata": {}, "outputs": [], "source": [ "import struct\n", "with open(\"token_embeddings.bin\", \"wb\") as f:\n", " for token_id in range(len(vocab)):\n", " # 将向量转换为半精度浮点数并保存\n", " f.write(struct.pack('96e', *reduced_embeddings[token_id].astype(np.float16)))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "511a7cc4-1b8c-468c-b2a0-16dc6d74ab44", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 }