update: faster tokenizer

This commit is contained in:
alikia2x (寒寒) 2024-10-07 02:02:20 +08:00
parent ea8b503653
commit a0644eb470
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
15 changed files with 148 additions and 303521 deletions
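
In short: the previous tokenizer, which loaded the Qwen/Qwen2.5-3B tokenizer via @xenova/transformers on the query path, is replaced by a local trie-based greedy longest-match tokenizer (BPETokenizer) built from public/model/vocab.json. A minimal sketch of the new flow, assuming the module aliases used in OneSearch and an illustrative input string:

import { loadVocab } from "lib/nlp/tokenize/loadVocab";
import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";
import tokenize from "lib/nlp/tokenize/tokenizer";

// Build the tokenizer once from the bundled vocabulary (fetched from /model/vocab.json),
// then reuse it for every query.
const vocab = await loadVocab();
const tokenizer = new BPETokenizer(vocab);

// "hello world" is only an illustrative input; the real caller passes the search query.
const tokenIds = await tokenize("hello world", tokenizer);
console.log(tokenIds); // array of vocabulary token ids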


@@ -17,8 +17,10 @@ import { useAtom, useAtomValue } from "jotai";
import i18next from "i18next";
import { useTranslation } from "react-i18next";
import { keywordSuggestion } from "lib/onesearch/keywordSuggestion";
import tokenize from "lib/nlp/tokenizer";
import tokenize from "lib/nlp/tokenize/tokenizer";
import { getEmbedding, getEmbeddingLayer } from "lib/nlp/getEmbedding";
import { loadVocab } from "lib/nlp/tokenize/loadVocab";
import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";
interface EmbeddingLayer {
[key: number]: Float32Array<ArrayBufferLike>;
@@ -28,6 +30,7 @@ export default function OneSearch() {
const [suggestion, setFinalSuggetsion] = useAtom(suggestionAtom);
const [embeddingLayer, setEmbeddingLayer] = useState<EmbeddingLayer | null>(null);
const [NLUsession, setNLUsession] = useState<ort.InferenceSession | null>(null);
const [tokenizer, setTokenizer] = useState<BPETokenizer | null>(null);
const lastRequestTimeRef = useRef(0);
const selected = useAtomValue(selectedSuggestionAtom);
const settings = useAtomValue(settingsAtom);
@@ -93,30 +96,44 @@ export default function OneSearch() {
useEffect(() => {
if (embeddingLayer !== null) return;
const embedding_file = "/models/token_embeddings.bin";
const embedding_file = "/model/token_embeddings.bin";
(async function () {
const result = await fetch(embedding_file);
const arrBuf = await result.arrayBuffer();
const embeddingDict = getEmbeddingLayer(arrBuf);
setEmbeddingLayer(embeddingDict);
await loadModel("/models/NLU.onnx");
await loadModel("/model/NLU.onnx");
// if (!modelLoaded) {
// console.error("NLU model was not correctly loaded.")
// }
})();
}, []);
useEffect(() => {
if (tokenizer !== null) return;
(async function () {
await loadTokenizer();
})();
},[]);
async function loadModel(modelPath: string) {
ort.env.wasm.wasmPaths = "/onnx/";
const session = await ort.InferenceSession.create(modelPath);
setNLUsession(session);
}
async function loadTokenizer() {
const vocab = await loadVocab();
const tokenizer = new BPETokenizer(vocab);
setTokenizer(tokenizer);
}
async function getNLUResult(query: string) {
const start = new Date().getTime();
if (embeddingLayer === null || NLUsession === null) return;
const tokenIds = await tokenize(query, "Qwen/Qwen2.5-3B");
if (embeddingLayer === null || NLUsession === null || tokenizer == null) return;
const tokenIds = await tokenize(query, tokenizer);
console.log(new Date().getTime() - start, "ms");
const embeddings = getEmbedding(tokenIds, embeddingLayer, 64);
const inputTensor = new ort.Tensor("float32", embeddings, [1, 64, 96]);


@@ -0,0 +1,94 @@
class TrieNode {
children: Map<string, TrieNode>;
tokenId: number | null;
constructor() {
this.children = new Map();
this.tokenId = null;
}
}
class Trie {
root: TrieNode;
constructor() {
this.root = new TrieNode();
}
insert(token: string, tokenId: number) {
let node = this.root;
for (const char of token) {
if (!node.children.has(char)) {
node.children.set(char, new TrieNode());
}
node = node.children.get(char)!;
}
node.tokenId = tokenId;
}
searchLongestToken(text: string): [number | null, number] {
let node = this.root;
let longestTokenId: number | null = null;
let longestTokenLength = 0;
let currentLength = 0;
for (const char of text) {
if (!node.children.has(char)) {
break;
}
node = node.children.get(char)!;
currentLength += 1;
if (node.tokenId !== null) {
longestTokenId = node.tokenId;
// Record the length of the longest complete token seen so far,
// not how far the trie walk got: the walk can overshoot past the
// last complete match while following a longer prefix.
longestTokenLength = currentLength;
}
}
return [longestTokenId, longestTokenLength];
}
}
export default class BPETokenizer {
private trie: Trie;
constructor(vocabulary: { [key: string]: number }) {
this.trie = new Trie();
for (const token in vocabulary) {
if (vocabulary.hasOwnProperty(token)) {
this.trie.insert(token, vocabulary[token]);
}
}
}
tokenize(text: string): number[] {
const tokenIds: number[] = [];
let i = 0;
while (i < text.length) {
const [longestTokenId, length] = this.trie.searchLongestToken(text.slice(i));
if (longestTokenId !== null) {
tokenIds.push(longestTokenId);
i += length;
} else {
// If no token is found, treat the character as a single token
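// Note: raw char codes can collide with ids already used in the vocabulary,
// so this is only a best-effort fallback for characters outside the vocab.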
tokenIds.push(text.charCodeAt(i));
i += 1;
}
}
return tokenIds;
}
}
// Example usage:
// const vocabulary = {
// 'aa': 1,
// 'bb': 2,
// 'ab': 3,
// 'ba': 4,
// 'a': 5,
// 'b': 6
// };
// const tokenizer = new BPETokenizer(vocabulary);
// const text = 'ababbaa';
// const tokenIds = tokenizer.tokenize(text);
// console.log(tokenIds); // Output: [ 3, 3, 4, 5 ]  ('ab' + 'ab' + 'ba' + 'a' under greedy longest-match)


@@ -0,0 +1,21 @@
export async function loadVocab(): Promise<{ [token: string]: number }> {
try {
// Fetch the vocab.json file
const response = await fetch("/model/vocab.json");
// Check if the request was successful
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
// Parse the JSON from the response
const data = await response.json();
// Return the parsed JSON
return data;
} catch (error) {
// Handle any errors that occurred during the fetch or parsing process
console.error("Error loading vocab.json:", error);
throw error;
}
}


@@ -0,0 +1,7 @@
import BPETokenizer from "./BPEtokenizer";
async function tokenize(text: string, tokenizer: BPETokenizer) {
return tokenizer.tokenize(text);
}
export default tokenize;


@ -1,24 +0,0 @@
import { AutoTokenizer, env } from "@xenova/transformers";
async function tokenize(
text: string,
model: string,
mirror: boolean = false,
remote: boolean = true
) {
if (mirror) {
env.remoteHost = "https://hf-mirror.com";
}
if (!remote) {
env.allowRemoteModels = false;
}
const tokenizer = await AutoTokenizer.from_pretrained(model);
const { input_ids } = await tokenizer(text);
const tokenIds = [];
for (let id of input_ids.data) {
tokenIds.push(parseInt(id));
}
return tokenIds;
}
export default tokenize;


@@ -48,8 +48,8 @@ export default function AboutPage() {
</p>
<p className="relative font-bold text-2xl mt-12">Presented By</p>
{!darkMode && <img src="/LuminaraStudio.png" className="relative md:h-64 mt-6" />}
{darkMode && <img src="/LuminaraStudioDark.png" className="relative md:h-56 mt-6" />}
{!darkMode && <img src="/assets/img/LuminaraStudio.png" className="relative md:h-64 mt-6" />}
{darkMode && <img src="/assets/img/LuminaraStudioDark.png" className="relative md:h-56 mt-6" />}
</AboutLayout>
);
}

Binary image file (84 KiB before and after; preview omitted)

Binary image file (81 KiB before and after; preview omitted)

public/model/vocab.json (new file): diff suppressed because one or more lines are too long.

File diff suppressed because it is too large.


@@ -1,207 +0,0 @@
{
"add_bos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"bos_token": null,
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"clean_up_tokenization_spaces": false,
"eos_token": "<|endoftext|>",
"errors": "replace",
"model_max_length": 131072,
"pad_token": "<|endoftext|>",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}


@@ -1,5 +1,5 @@
import { describe, expect, test } from "bun:test";
import tokenize from "../lib/nlp/tokenizer";
import tokenize from "../lib/nlp/tokenize/tokenizer";
describe("Test if tokenizer works", () => {
test("Using without a mirror", async () => {