update: faster tokenizer

This commit is contained in:
alikia2x (寒寒) 2024-10-07 02:02:20 +08:00
parent ea8b503653
commit a0644eb470
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
15 changed files with 148 additions and 303521 deletions
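
In short: the previous tokenizer, which loaded the Qwen/Qwen2.5-3B tokenizer via @xenova/transformers on the query path, is replaced by a local trie-based greedy longest-match tokenizer (BPETokenizer) built from public/model/vocab.json. A minimal sketch of the new flow, assuming the module aliases used in OneSearch and an illustrative input string:

import { loadVocab } from "lib/nlp/tokenize/loadVocab";
import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";
import tokenize from "lib/nlp/tokenize/tokenizer";

// Build the tokenizer once from the bundled vocabulary (fetched from /model/vocab.json),
// then reuse it for every query.
const vocab = await loadVocab();
const tokenizer = new BPETokenizer(vocab);

// "hello world" is only an illustrative input; the real caller passes the search query.
const tokenIds = await tokenize("hello world", tokenizer);
console.log(tokenIds); // array of vocabulary token ids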


@@ -17,8 +17,10 @@ import { useAtom, useAtomValue } from "jotai";
import i18next from "i18next";
import { useTranslation } from "react-i18next";
import { keywordSuggestion } from "lib/onesearch/keywordSuggestion";
import tokenize from "lib/nlp/tokenizer";
import tokenize from "lib/nlp/tokenize/tokenizer";
import { getEmbedding, getEmbeddingLayer } from "lib/nlp/getEmbedding";
import { loadVocab } from "lib/nlp/tokenize/loadVocab";
import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";
interface EmbeddingLayer {
[key: number]: Float32Array<ArrayBufferLike>;
@@ -28,6 +30,7 @@ export default function OneSearch() {
const [suggestion, setFinalSuggetsion] = useAtom(suggestionAtom);
const [embeddingLayer, setEmbeddingLayer] = useState<EmbeddingLayer | null>(null);
const [NLUsession, setNLUsession] = useState<ort.InferenceSession | null>(null);
const [tokenizer, setTokenizer] = useState<BPETokenizer | null>(null);
const lastRequestTimeRef = useRef(0);
const selected = useAtomValue(selectedSuggestionAtom);
const settings = useAtomValue(settingsAtom);
@@ -93,30 +96,44 @@ export default function OneSearch() {
useEffect(() => {
if (embeddingLayer !== null) return;
const embedding_file = "/models/token_embeddings.bin";
const embedding_file = "/model/token_embeddings.bin";
(async function () {
const result = await fetch(embedding_file);
const arrBuf = await result.arrayBuffer();
const embeddingDict = getEmbeddingLayer(arrBuf);
setEmbeddingLayer(embeddingDict);
await loadModel("/models/NLU.onnx");
await loadModel("/model/NLU.onnx");
// if (!modelLoaded) {
// console.error("NLU model was not correctly loaded.")
// }
})();
}, []);
useEffect(() => {
if (tokenizer !== null) return;
(async function () {
await loadTokenizer();
})();
},[]);
async function loadModel(modelPath: string) {
ort.env.wasm.wasmPaths = "/onnx/";
const session = await ort.InferenceSession.create(modelPath);
setNLUsession(session);
}
async function loadTokenizer() {
const vocab = await loadVocab();
const tokenizer = new BPETokenizer(vocab);
setTokenizer(tokenizer);
}
async function getNLUResult(query: string) {
const start = new Date().getTime();
if (embeddingLayer === null || NLUsession === null) return;
const tokenIds = await tokenize(query, "Qwen/Qwen2.5-3B");
if (embeddingLayer === null || NLUsession === null || tokenizer == null) return;
const tokenIds = await tokenize(query, tokenizer);
console.log(new Date().getTime() - start, "ms");
const embeddings = getEmbedding(tokenIds, embeddingLayer, 64);
const inputTensor = new ort.Tensor("float32", embeddings, [1, 64, 96]);


@@ -0,0 +1,94 @@
class TrieNode {
children: Map<string, TrieNode>;
tokenId: number | null;
constructor() {
this.children = new Map();
this.tokenId = null;
}
}
class Trie {
root: TrieNode;
constructor() {
this.root = new TrieNode();
}
insert(token: string, tokenId: number) {
let node = this.root;
for (const char of token) {
if (!node.children.has(char)) {
node.children.set(char, new TrieNode());
}
node = node.children.get(char)!;
}
node.tokenId = tokenId;
}
searchLongestToken(text: string): [number | null, number] {
let node = this.root;
let longestTokenId: number | null = null;
let longestTokenLength = 0;
let currentLength = 0;
for (const char of text) {
if (!node.children.has(char)) {
break;
}
node = node.children.get(char)!;
currentLength += 1;
if (node.tokenId !== null) {
longestTokenId = node.tokenId;
// Record the length of the longest complete token seen so far,
// not how far the trie walk got: the walk can overshoot past the
// last complete match while following a longer prefix.
longestTokenLength = currentLength;
}
}
return [longestTokenId, longestTokenLength];
}
}
export default class BPETokenizer {
private trie: Trie;
constructor(vocabulary: { [key: string]: number }) {
this.trie = new Trie();
for (const token in vocabulary) {
if (vocabulary.hasOwnProperty(token)) {
this.trie.insert(token, vocabulary[token]);
}
}
}
tokenize(text: string): number[] {
const tokenIds: number[] = [];
let i = 0;
while (i < text.length) {
const [longestTokenId, length] = this.trie.searchLongestToken(text.slice(i));
if (longestTokenId !== null) {
tokenIds.push(longestTokenId);
i += length;
} else {
// If no token is found, treat the character as a single token
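// Note: raw char codes can collide with ids already used in the vocabulary,
// so this is only a best-effort fallback for characters outside the vocab.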
tokenIds.push(text.charCodeAt(i));
i += 1;
}
}
return tokenIds;
}
}
// Example usage:
// const vocabulary = {
// 'aa': 1,
// 'bb': 2,
// 'ab': 3,
// 'ba': 4,
// 'a': 5,
// 'b': 6
// };
// const tokenizer = new BPETokenizer(vocabulary);
// const text = 'ababbaa';
// const tokenIds = tokenizer.tokenize(text);
// console.log(tokenIds); // Output: [ 3, 3, 4, 5 ]  ('ab' + 'ab' + 'ba' + 'a' under greedy longest-match)


@@ -0,0 +1,21 @@
export async function loadVocab(): Promise<{ [token: string]: number }> {
try {
// Fetch the vocab.json file
const response = await fetch("/model/vocab.json");
// Check if the request was successful
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
// Parse the JSON from the response
const data = await response.json();
// Return the parsed JSON
return data;
} catch (error) {
// Handle any errors that occurred during the fetch or parsing process
console.error("Error loading vocab.json:", error);
throw error;
}
}


@@ -0,0 +1,7 @@
import BPETokenizer from "./BPEtokenizer";
async function tokenize(text: string, tokenizer: BPETokenizer) {
return tokenizer.tokenize(text);
}
export default tokenize;


@ -1,24 +0,0 @@
import { AutoTokenizer, env } from "@xenova/transformers";
async function tokenize(
text: string,
model: string,
mirror: boolean = false,
remote: boolean = true
) {
if (mirror) {
env.remoteHost = "https://hf-mirror.com";
}
if (!remote) {
env.allowRemoteModels = false;
}
const tokenizer = await AutoTokenizer.from_pretrained(model);
const { input_ids } = await tokenizer(text);
const tokenIds = [];
for (let id of input_ids.data) {
tokenIds.push(parseInt(id));
}
return tokenIds;
}
export default tokenize;


@@ -48,8 +48,8 @@ export default function AboutPage() {
</p>
<p className="relative font-bold text-2xl mt-12">Presented By</p>
{!darkMode && <img src="/LuminaraStudio.png" className="relative md:h-64 mt-6" />}
{darkMode && <img src="/LuminaraStudioDark.png" className="relative md:h-56 mt-6" />}
{!darkMode && <img src="/assets/img/LuminaraStudio.png" className="relative md:h-64 mt-6" />}
{darkMode && <img src="/assets/img/LuminaraStudioDark.png" className="relative md:h-56 mt-6" />}
</AboutLayout>
);
}

Binary image file (84 KiB before and after; preview omitted)

Binary image file (81 KiB before and after; preview omitted)

public/model/vocab.json (new file): diff suppressed because one or more lines are too long.

File diff suppressed because it is too large.


@@ -1,207 +0,0 @@
{
"add_bos_token": false,
"add_prefix_space": false,
"added_tokens_decoder": {
"151643": {
"content": "<|endoftext|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151644": {
"content": "<|im_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151645": {
"content": "<|im_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151646": {
"content": "<|object_ref_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151647": {
"content": "<|object_ref_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151648": {
"content": "<|box_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151649": {
"content": "<|box_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151650": {
"content": "<|quad_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151651": {
"content": "<|quad_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151652": {
"content": "<|vision_start|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151653": {
"content": "<|vision_end|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151654": {
"content": "<|vision_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151655": {
"content": "<|image_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151656": {
"content": "<|video_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"151657": {
"content": "<tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151658": {
"content": "</tool_call>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151659": {
"content": "<|fim_prefix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151660": {
"content": "<|fim_middle|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151661": {
"content": "<|fim_suffix|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151662": {
"content": "<|fim_pad|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151663": {
"content": "<|repo_name|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
},
"151664": {
"content": "<|file_sep|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": false
}
},
"additional_special_tokens": [
"<|im_start|>",
"<|im_end|>",
"<|object_ref_start|>",
"<|object_ref_end|>",
"<|box_start|>",
"<|box_end|>",
"<|quad_start|>",
"<|quad_end|>",
"<|vision_start|>",
"<|vision_end|>",
"<|vision_pad|>",
"<|image_pad|>",
"<|video_pad|>"
],
"bos_token": null,
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
"clean_up_tokenization_spaces": false,
"eos_token": "<|endoftext|>",
"errors": "replace",
"model_max_length": 131072,
"pad_token": "<|endoftext|>",
"split_special_tokens": false,
"tokenizer_class": "Qwen2Tokenizer",
"unk_token": null
}


@@ -1,5 +1,5 @@
import { describe, expect, test } from "bun:test";
import tokenize from "../lib/nlp/tokenizer";
import tokenize from "../lib/nlp/tokenize/tokenizer";
describe("Test if tokenizer works", () => {
test("Using without a mirror", async () => {