add: embedding

2024-09-22 21:09:19 +08:00 · 2024-09-22 21:09:19 +08:00 · 973b926544
commit 973b926544
parent 668a88e917
11 changed files with 108 additions and 33 deletions
--- a/bun.lockb
+++ b/bun.lockb
--- a/components/onesearch/onesearch.tsx
+++ b/components/onesearch/onesearch.tsx
@ -17,6 +17,8 @@ import i18next from "i18next";
 import { useTranslation } from "react-i18next";
 import { keywordSuggestion } from "lib/onesearch/keywordSuggestion";
 import { NLUType } from "lib/nlp/load";
 import tokenize from "lib/nlp/tokenizer";
 import { getEmbedding, getEmbeddingLayer } from "lib/nlp/getEmbedding";
 export default function OneSearch() {
 	const [suggestion, setFinalSuggetsion] = useAtom(suggestionAtom);
@ -104,6 +106,24 @@ export default function OneSearch() {
 		});
 	}, [NLUModelLoaded]);
 	// Real test for tokenizing & embedding
 	// It works.
 	// useEffect(() => {
 	// 	(async function () {
 	// 		const result = await tokenize("你好", "Qwen/Qwen2.5-3B", true, false);
 	// 	})();
 	// }, []);
 	// useEffect(() => {
 	// 	const embedding_file = "/models/token_embeddings.bin";
 	// 	(async function () {
 	// 		const result = await fetch(embedding_file);
 	// 		const arrBuf = await result.arrayBuffer();
 	// 		const embeddingDict = getEmbeddingLayer(arrBuf);
 	// 		const e = getEmbedding([108386], embeddingDict, 12);
 	// 	})();
 	// }, []);
 	useEffect(() => {
 		cleanSuggestion("default-link", "default", "text", "link");
 		if (validLink(query)) {
--- a/lib/nlp/getEmbedding.ts
+++ b/lib/nlp/getEmbedding.ts
@ -0,0 +1,68 @@
 /**
  * Decodes a packed table of half-precision (float16) embeddings into a lookup object.
  *
  * The buffer is read as consecutive entries of `embeddingDim` little-endian 16-bit
  * values. Entry `i` is stored under key `i`, with every value widened to a regular
  * float via `halfFloatToFloat32`.
  *
  * @param buffer Raw bytes of the embedding table.
  * @param embeddingDim Number of values per entry (defaults to 96, i.e. 192 bytes per entry).
  * @returns An object mapping each entry index to its decoded `Float32Array`.
  * @throws RangeError If `embeddingDim` is not a positive integer, or if the buffer
  *   length is not a whole number of entries (e.g. a truncated download).
  */
 function getEmbeddingLayer(buffer: ArrayBuffer, embeddingDim: number = 96): { [key: number]: Float32Array } {
    if (!Number.isInteger(embeddingDim) || embeddingDim <= 0) {
        throw new RangeError(`embeddingDim must be a positive integer, got ${embeddingDim}`);
    }
    const bytesPerValue = 2; // one float16 per value
    const entrySize = embeddingDim * bytesPerValue;
    // Fail fast with a clear message instead of hitting an out-of-bounds DataView read
    // on a trailing partial entry after all full entries were already decoded.
    if (buffer.byteLength % entrySize !== 0) {
        throw new RangeError(
            `Embedding buffer length ${buffer.byteLength} is not a multiple of the entry size ${entrySize}`
        );
    }
    const numEntries = buffer.byteLength / entrySize;
    const dataView = new DataView(buffer);
    const dict: { [key: number]: Float32Array } = {};
    for (let i = 0; i < numEntries; i++) {
        const offset = i * entrySize;
        const floatArray = new Float32Array(embeddingDim);
        for (let j = 0; j < embeddingDim; j++) {
            // `true` selects little-endian byte order.
            const halfFloat = dataView.getUint16(offset + j * bytesPerValue, true);
            floatArray[j] = halfFloatToFloat32(halfFloat);
        }
        dict[i] = floatArray;
    }
    return dict;
 }
 /**
  * Converts a raw IEEE 754 half-precision (binary16) bit pattern to a JavaScript number.
  *
  * Bit layout of the low 16 bits: 1 sign bit, 5 exponent bits, 10 fraction bits.
  *
  * @param halfFloat The 16-bit pattern, given as an unsigned integer.
  * @returns The decoded value, including signed zero, subnormals, ±Infinity and NaN.
  */
 function halfFloatToFloat32(halfFloat: number): number {
    const signBit = (halfFloat >> 15) & 0x1;
    const exponentBits = (halfFloat >> 10) & 0x1f;
    const fractionBits = halfFloat & 0x03ff;
    const signFactor = signBit === 1 ? -1 : 1;

    // All exponent bits set: infinity when the fraction is empty, NaN otherwise.
    if (exponentBits === 0x1f) {
        return fractionBits === 0 ? signFactor * Infinity : NaN;
    }

    // Zero exponent: signed zero (fraction 0) or a subnormal with no implicit leading 1.
    if (exponentBits === 0) {
        const magnitude = (fractionBits / 1024.0) * 2 ** -14;
        return signFactor * magnitude;
    }

    // Normalized number: implicit leading 1 and an exponent bias of 15.
    const significand = 1.0 + fractionBits / 1024.0;
    return signFactor * (significand * 2 ** (exponentBits - 15));
 }
 /**
  * Builds a fixed-length, flattened embedding for a token sequence.
  *
  * The result consists of `contextSize` consecutive slots of `embeddingDim` floats.
  * Slot `i` holds the vector stored for `tokenIds[i]`; slots past the end of
  * `tokenIds` stay zero-filled, and tokens beyond `contextSize` are ignored.
  *
  * @param tokenIds Token IDs to look up, in sequence order.
  * @param embeddingDict Lookup from token ID to its embedding vector.
  * @param contextSize Number of slots in the output (a fractional value rounds up).
  * @param embeddingDim Length of every embedding vector (defaults to 96).
  * @returns A `Float32Array` of length `contextSize * embeddingDim`.
  * @throws TypeError If a token ID within the context has no entry in `embeddingDict`.
  * @throws RangeError If a looked-up vector does not have `embeddingDim` elements.
  */
 function getEmbedding(
    tokenIds: number[],
    embeddingDict: { [key: number]: Float32Array },
    contextSize: number,
    embeddingDim: number = 96
 ): Float32Array {
    const slotCount = Math.max(0, Math.ceil(contextSize));
    // A typed array starts zero-filled, so padding slots need no explicit writes.
    const result = new Float32Array(slotCount * embeddingDim);
    const filledSlots = Math.min(tokenIds.length, slotCount);
    for (let i = 0; i < filledSlots; i++) {
        const tokenId = tokenIds[i];
        const vector = embeddingDict[tokenId];
        if (vector === undefined) {
            throw new TypeError(`No embedding found for token id ${tokenId} (position ${i})`);
        }
        if (vector.length !== embeddingDim) {
            throw new RangeError(
                `Embedding for token id ${tokenId} has length ${vector.length}, expected ${embeddingDim}`
            );
        }
        // Copy straight into the preallocated buffer instead of re-concatenating arrays.
        result.set(vector, i * embeddingDim);
    }
    return result;
 }
 export { getEmbeddingLayer, getEmbedding };
--- a/lib/nlp/tokenizer.ts
+++ b/lib/nlp/tokenizer.ts
@ -1,9 +1,12 @@
 import { AutoTokenizer, env } from "@xenova/transformers";
 async function tokenize(text: string, model: string, mirror: boolean = false, remote: boolean = true) {
 	if (mirror) {
 		env.remoteHost = "https://hf-mirror.com";
 	}
 	if (!remote) {
 		env.allowRemoteModels = false;
-env.localModelPath = "/transformers/";
+	}
 async function tokenize(text: string, model: string) {
 	const tokenizer = await AutoTokenizer.from_pretrained(model);
 	const { input_ids } = await tokenizer(text);
 	const tokenIds = [];
--- a/public/transformers/Qwen/Qwen2.5-3B/tokenizer.json
+++ b/public/transformers/Qwen/Qwen2.5-3B/tokenizer.json
--- a/public/transformers/Qwen/Qwen2.5-3B/tokenizer_config.json
+++ b/public/transformers/Qwen/Qwen2.5-3B/tokenizer_config.json
--- a/public/models/token_embeddings.bin
+++ b/public/models/token_embeddings.bin
--- a/public/transformers/Qwen/Qwen2.5-3B/config.json
+++ b/public/transformers/Qwen/Qwen2.5-3B/config.json
@ -1,28 +0,0 @@
 {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 32768,
  "max_window_layers": 36,
  "model_type": "qwen2",
  "num_attention_heads": 16,
  "num_hidden_layers": 36,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.1",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
 }
--- a/public/transformers/Qwen/Qwen2.5-3B/vocab.json
+++ b/public/transformers/Qwen/Qwen2.5-3B/vocab.json
--- a/test/getEmbeddings.test.ts
+++ b/test/getEmbeddings.test.ts
--- a/test/tokenize.test.ts
+++ b/test/tokenize.test.ts
@ -0,0 +1,13 @@
 import { describe, expect, test } from "bun:test";
 import tokenize from "../lib/nlp/tokenizer";
 // Checks that tokenize() yields the same token IDs with the mirror flag off and on.
 describe("Test if tokenizer works", () => {
 	const sampleText = "你好，世界！";
 	const modelName = "Qwen/Qwen2.5-3B";
 	const expectedIds = [108386, 3837, 99489, 6313];

 	test("Using without a mirror", async () => {
 		const tokenIds = await tokenize(sampleText, modelName, false);
 		expect(tokenIds).toEqual(expectedIds);
 	});

 	test("Using with a mirror", async () => {
 		const tokenIds = await tokenize(sampleText, modelName, true);
 		expect(tokenIds).toEqual(expectedIds);
 	});
 });