add: embedding

This commit is contained in:
alikia2x (寒寒) 2024-09-22 21:09:19 +08:00
parent 668a88e917
commit 973b926544
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
11 changed files with 108 additions and 33 deletions

BIN
bun.lockb

Binary file not shown.

View File

@ -17,6 +17,8 @@ import i18next from "i18next";
import { useTranslation } from "react-i18next";
import { keywordSuggestion } from "lib/onesearch/keywordSuggestion";
import { NLUType } from "lib/nlp/load";
import tokenize from "lib/nlp/tokenizer";
import { getEmbedding, getEmbeddingLayer } from "lib/nlp/getEmbedding";
export default function OneSearch() {
const [suggestion, setFinalSuggetsion] = useAtom(suggestionAtom);
@ -104,6 +106,24 @@ export default function OneSearch() {
});
}, [NLUModelLoaded]);
// Real test for tokenizing & embedding
// It works.
// useEffect(() => {
// (async function () {
// const result = await tokenize("你好", "Qwen/Qwen2.5-3B", true, false);
// })();
// }, []);
// useEffect(() => {
// const embedding_file = "/models/token_embeddings.bin";
// (async function () {
// const result = await fetch(embedding_file);
// const arrBuf = await result.arrayBuffer();
// const embeddingDict = getEmbeddingLayer(arrBuf);
// const e = getEmbedding([108386], embeddingDict, 12);
// })();
// }, []);
useEffect(() => {
cleanSuggestion("default-link", "default", "text", "link");
if (validLink(query)) {

View File

@ -0,0 +1,68 @@
/**
 * Parse a raw binary embedding table into a token-id -> vector lookup.
 *
 * The buffer is expected to hold consecutive fixed-size entries, each a
 * vector of little-endian IEEE 754 half-precision (16-bit) floats, which
 * are widened to Float32 here. Entry i becomes the vector for token id i.
 *
 * @param buffer Raw bytes of the embedding table.
 * @param dim    Vector dimensionality; defaults to 96 (192 bytes per entry).
 * @returns Mapping from token id to its Float32Array embedding vector.
 */
function getEmbeddingLayer(buffer: ArrayBuffer, dim: number = 96): { [key: number]: Float32Array } {
    const dict: { [key: number]: Float32Array } = {};
    const entrySize = dim * 2; // 2 bytes per half-float component
    // Floor so a trailing partial entry cannot produce a fractional loop
    // bound / out-of-range reads.
    const numEntries = Math.floor(buffer.byteLength / entrySize);
    const dataView = new DataView(buffer);

    for (let i = 0; i < numEntries; i++) {
        const offset = i * entrySize;
        const floatArray = new Float32Array(dim);
        for (let j = 0; j < dim; j++) {
            // Embedding files are written little-endian.
            const halfFloat = dataView.getUint16(offset + j * 2, true);
            floatArray[j] = halfFloatToFloat32(halfFloat);
        }
        dict[i] = floatArray;
    }
    return dict;
}
/**
 * Convert a 16-bit IEEE 754 half-precision bit pattern to a JS number.
 *
 * Handles all four encoding classes: signed zero, subnormals, normalized
 * values, and the all-ones exponent (infinities / NaN).
 *
 * @param halfFloat Raw 16-bit value (e.g. from DataView.getUint16).
 * @returns The numeric value the bit pattern encodes.
 */
function halfFloatToFloat32(halfFloat: number): number {
    const negative = (halfFloat & 0x8000) !== 0;
    const exponent = (halfFloat & 0x7c00) >> 10;
    const mantissa = halfFloat & 0x03ff;

    let magnitude: number;
    if (exponent === 0x1f) {
        // All-ones exponent: NaN when the mantissa is non-zero, else infinity.
        if (mantissa !== 0) return NaN;
        magnitude = Infinity;
    } else if (exponent === 0) {
        // Zero exponent: signed zero (mantissa 0) or a subnormal value;
        // both reduce to mantissa/1024 scaled by 2^-14.
        magnitude = (mantissa / 1024.0) * Math.pow(2, -14);
    } else {
        // Normalized value: implicit leading 1, exponent bias of 15.
        magnitude = (mantissa / 1024.0 + 1.0) * Math.pow(2, exponent - 15);
    }
    return negative ? -magnitude : magnitude;
}
/**
 * Build a fixed-size context embedding by concatenating per-token vectors.
 *
 * The first `contextSize` token ids are looked up in `embeddingDict` and
 * their vectors written back-to-back. Positions past the end of `tokenIds`
 * — and ids missing from the dictionary — are zero-filled, so the result
 * always has length `contextSize * dim`.
 *
 * @param tokenIds      Token ids to embed, in order.
 * @param embeddingDict Token id -> embedding vector (all of length `dim`).
 * @param contextSize   Number of token slots in the output.
 * @param dim           Per-token vector length; defaults to 96.
 * @returns Flat Float32Array of length `contextSize * dim`.
 */
function getEmbedding(
    tokenIds: number[],
    embeddingDict: { [key: number]: Float32Array },
    contextSize: number,
    dim: number = 96
): Float32Array {
    // Preallocate once (zero-initialized) instead of repeatedly concatenating
    // plain arrays, which was O(n^2) in total output size.
    const result = new Float32Array(contextSize * dim);
    for (let i = 0; i < contextSize && i < tokenIds.length; i++) {
        const vector = embeddingDict[tokenIds[i]];
        // Unknown token ids stay zero-padded instead of crashing on
        // Array.from(undefined).
        if (vector !== undefined) {
            result.set(vector, i * dim);
        }
    }
    return result;
}
export { getEmbeddingLayer, getEmbedding };

View File

@ -1,9 +1,12 @@
import { AutoTokenizer, env } from "@xenova/transformers";
env.allowRemoteModels = false;
env.localModelPath = "/transformers/";
async function tokenize(text: string, model: string) {
async function tokenize(text: string, model: string, mirror: boolean = false, remote: boolean = true) {
if (mirror) {
env.remoteHost = "https://hf-mirror.com";
}
if (!remote) {
env.allowRemoteModels = false;
}
const tokenizer = await AutoTokenizer.from_pretrained(model);
const { input_ids } = await tokenizer(text);
const tokenIds = [];

Binary file not shown.

View File

@ -1,28 +0,0 @@
{
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151643,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 32768,
"max_window_layers": 36,
"model_type": "qwen2",
"num_attention_heads": 16,
"num_hidden_layers": 36,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"sliding_window": 32768,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.40.1",
"use_cache": true,
"use_mrope": false,
"use_sliding_window": false,
"vocab_size": 151936
}

File diff suppressed because one or more lines are too long

View File

13
test/tokenize.test.ts Normal file
View File

@ -0,0 +1,13 @@
import { describe, expect, test } from "bun:test";
import tokenize from "../lib/nlp/tokenizer";
// Integration tests: tokenizing the same text must yield identical token ids
// whether or not the Hugging Face mirror host is used.
describe("Test if tokenizer works", () => {
    test("Using without a mirror", async () => {
        // `result` is never reassigned — use const.
        const result = await tokenize("你好,世界!", "Qwen/Qwen2.5-3B", false);
        expect(result).toEqual([108386, 3837, 99489, 6313]);
    });
    test("Using with a mirror", async () => {
        const result = await tokenize("你好,世界!", "Qwen/Qwen2.5-3B", true);
        expect(result).toEqual([108386, 3837, 99489, 6313]);
    });
});