# Extract the Dictionary & Embedding

Our model uses the dictionary (tokenizer vocabulary) and the embedding layer from `Qwen/Qwen2.5-3B`, a pre-trained transformer model developed by the Qwen team at Alibaba Cloud. Although our model architecture is not a transformer, we can still benefit from its tokenizer and embedding layer.

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and the model
model_name = "Qwen/Qwen2.5-3B"

In [3]:
# Fetch the pre-trained network and its matching tokenizer from the same
# checkpoint; the two loads are independent of each other.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Token string -> token id mapping exposed by the tokenizer
vocab = tokenizer.get_vocab()
# Input embedding module of the loaded model (maps token ids to vectors)
embedding_layer = model.get_input_embeddings()

In [5]:
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
import numpy as np

In [6]:
# Number of principal components kept when the embeddings are reduced with PCA
DIMENSIONS = 96

In [7]:
# Look up the embedding of every vocabulary token in one batched call rather
# than one forward pass per token id. Row i of the result is the embedding of
# token id i, which is the same layout the per-token loop + vstack produced.
token_ids = torch.arange(len(vocab))
with torch.no_grad():  # pure lookup; no autograd graph is needed
    # .float() keeps the NumPy conversion working even if the checkpoint was
    # loaded in a reduced-precision dtype that NumPy cannot represent.
    embeddings = embedding_layer(token_ids).float().numpy()

# Use PCA to decrease dimension. random_state is pinned because PCA may pick
# its randomized SVD solver for an input of this size, which would otherwise
# make the reduced embeddings differ from run to run.
pca = PCA(n_components=DIMENSIONS, random_state=0)
reduced_embeddings = pca.fit_transform(embeddings)

## Save the Dictionary & Reduced Embeddings

In [8]:
# Build the token id -> reduced embedding lookup, one entry per vocabulary token
token_id_to_reduced_embedding = {}
for token_id in range(len(vocab)):
    token_id_to_reduced_embedding[token_id] = reduced_embeddings[token_id]

# Persist the lookup with torch's serializer
torch.save(token_id_to_reduced_embedding, "token_id_to_reduced_embedding.pt")


In [9]:
import json

# Plain token -> token id mapping copied from the tokenizer vocabulary
# (the embeddings themselves are not part of this file)
token_to_id = {token: token_id for token, token_id in vocab.items()}

# Write the mapping out as JSON
with open("token_to_id.json", "w") as out_file:
    json.dump(token_to_id, out_file)

In [10]:
import struct
# Convert the reduced embeddings to half-precision floats and save them as raw
# binary: rows are written back to back in token-id order, in the machine's
# native byte order. The row width is taken from the array itself, so the
# export keeps working when the number of PCA components is changed.
with open("token_embeddings.bin", "wb") as f:
    reduced_embeddings[:len(vocab)].astype(np.float16).tofile(f)
