update: faster tokenizer

parent ea8b503653
commit a0644eb470
@@ -17,8 +17,10 @@ import { useAtom, useAtomValue } from "jotai";
 import i18next from "i18next";
 import { useTranslation } from "react-i18next";
 import { keywordSuggestion } from "lib/onesearch/keywordSuggestion";
-import tokenize from "lib/nlp/tokenizer";
+import tokenize from "lib/nlp/tokenize/tokenizer";
 import { getEmbedding, getEmbeddingLayer } from "lib/nlp/getEmbedding";
+import { loadVocab } from "lib/nlp/tokenize/loadVocab";
+import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";
 
 interface EmbeddingLayer {
     [key: number]: Float32Array<ArrayBufferLike>;
@@ -28,6 +30,7 @@ export default function OneSearch() {
     const [suggestion, setFinalSuggetsion] = useAtom(suggestionAtom);
     const [embeddingLayer, setEmbeddingLayer] = useState<EmbeddingLayer | null>(null);
     const [NLUsession, setNLUsession] = useState<ort.InferenceSession | null>(null);
+    const [tokenizer, setTokenizer] = useState<BPETokenizer | null>(null);
     const lastRequestTimeRef = useRef(0);
     const selected = useAtomValue(selectedSuggestionAtom);
     const settings = useAtomValue(settingsAtom);
@@ -93,30 +96,44 @@ export default function OneSearch() {
 
     useEffect(() => {
         if (embeddingLayer !== null) return;
-        const embedding_file = "/models/token_embeddings.bin";
+        const embedding_file = "/model/token_embeddings.bin";
         (async function () {
             const result = await fetch(embedding_file);
             const arrBuf = await result.arrayBuffer();
             const embeddingDict = getEmbeddingLayer(arrBuf);
             setEmbeddingLayer(embeddingDict);
 
-            await loadModel("/models/NLU.onnx");
+            await loadModel("/model/NLU.onnx");
             // if (!modelLoaded) {
             //     console.error("NLU model was not correctly loaded.")
             // }
         })();
     }, []);
 
+
+    useEffect(() => {
+        if (tokenizer !== null) return;
+        (async function () {
+            await loadTokenizer();
+        })();
+    },[]);
+
     async function loadModel(modelPath: string) {
         ort.env.wasm.wasmPaths = "/onnx/";
         const session = await ort.InferenceSession.create(modelPath);
         setNLUsession(session);
     }
 
+    async function loadTokenizer() {
+        const vocab = await loadVocab();
+        const tokenizer = new BPETokenizer(vocab);
+        setTokenizer(tokenizer);
+    }
+
     async function getNLUResult(query: string) {
         const start = new Date().getTime();
-        if (embeddingLayer === null || NLUsession === null) return;
-        const tokenIds = await tokenize(query, "Qwen/Qwen2.5-3B");
+        if (embeddingLayer === null || NLUsession === null || tokenizer == null) return;
+        const tokenIds = await tokenize(query, tokenizer);
         console.log(new Date().getTime() - start, "ms");
         const embeddings = getEmbedding(tokenIds, embeddingLayer, 64);
         const inputTensor = new ort.Tensor("float32", embeddings, [1, 64, 96]);
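Taken together, the component changes load the vocabulary once, build a BPETokenizer, and tokenize queries locally instead of fetching a remote AutoTokenizer. Below is a minimal sketch of that flow outside React state, under assumptions: the hypothetical helper name runNLU, ort coming from onnxruntime-web, and the same /model asset paths and [1, 64, 96] input shape used in the diff above.

import * as ort from "onnxruntime-web"; // assumption: ort is onnxruntime-web
import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";
import { loadVocab } from "lib/nlp/tokenize/loadVocab";
import tokenize from "lib/nlp/tokenize/tokenizer";
import { getEmbedding, getEmbeddingLayer } from "lib/nlp/getEmbedding";

async function runNLU(query: string) {
    // One-time setup (the component keeps these in state instead).
    const tokenizer = new BPETokenizer(await loadVocab());
    const embeddingBuf = await (await fetch("/model/token_embeddings.bin")).arrayBuffer();
    const embeddingLayer = getEmbeddingLayer(embeddingBuf);
    ort.env.wasm.wasmPaths = "/onnx/";
    const session = await ort.InferenceSession.create("/model/NLU.onnx");

    // Per-query path: a local trie lookup, no network round trip.
    const tokenIds = await tokenize(query, tokenizer);
    const embeddings = getEmbedding(tokenIds, embeddingLayer, 64);
    const inputTensor = new ort.Tensor("float32", embeddings, [1, 64, 96]);
    // ...pass inputTensor to session.run(...) as the component does further down.
}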
lib/nlp/tokenize/BPEtokenizer.ts (new file, 94 lines)
@@ -0,0 +1,94 @@
+class TrieNode {
+    children: Map<string, TrieNode>;
+    tokenId: number | null;
+
+    constructor() {
+        this.children = new Map();
+        this.tokenId = null;
+    }
+}
+
+class Trie {
+    root: TrieNode;
+
+    constructor() {
+        this.root = new TrieNode();
+    }
+
+    insert(token: string, tokenId: number) {
+        let node = this.root;
+        for (const char of token) {
+            if (!node.children.has(char)) {
+                node.children.set(char, new TrieNode());
+            }
+            node = node.children.get(char)!;
+        }
+        node.tokenId = tokenId;
+    }
+
+    searchLongestToken(text: string): [number | null, number] {
+        let node = this.root;
+        let longestTokenId: number | null = null;
+        let currentTokenLength = 0;
+
+        for (const char of text) {
+            if (!node.children.has(char)) {
+                break;
+            }
+            node = node.children.get(char)!;
+            currentTokenLength += 1;
+            if (node.tokenId !== null) {
+                longestTokenId = node.tokenId;
+            }
+        }
+
+        return [longestTokenId, currentTokenLength];
+    }
+}
+
+export default class BPETokenizer {
+    private trie: Trie;
+
+    constructor(vocabulary: { [key: string]: number }) {
+        this.trie = new Trie();
+        for (const token in vocabulary) {
+            if (vocabulary.hasOwnProperty(token)) {
+                this.trie.insert(token, vocabulary[token]);
+            }
+        }
+    }
+
+    tokenize(text: string): number[] {
+        const tokenIds: number[] = [];
+        let i = 0;
+
+        while (i < text.length) {
+            const [longestTokenId, length] = this.trie.searchLongestToken(text.slice(i));
+            if (longestTokenId !== null) {
+                tokenIds.push(longestTokenId);
+                i += length;
+            } else {
+                // If no token is found, treat the character as a single token
+                tokenIds.push(text.charCodeAt(i));
+                i += 1;
+            }
+        }
+
+        return tokenIds;
+    }
+}
+
+// Example usage:
+// const vocabulary = {
+//     'aa': 1,
+//     'bb': 2,
+//     'ab': 3,
+//     'ba': 4,
+//     'a': 5,
+//     'b': 6
+// };
+// const tokenizer = new BPETokenizer(vocabulary);
+
+// const text = 'ababbaa';
+// const tokenIds = tokenizer.tokenize(text);
+// console.log(tokenIds); // Output: [ 3, 3, 4, 5 ]
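A quick illustration of how this class behaves. The tiny vocabulary here is made up for the example (the real token-to-id mapping ships in public/model/vocab.json); the expected outputs follow from the greedy longest-prefix match and the charCodeAt fallback above.

import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";

// Hypothetical toy vocabulary: token string -> token id.
const vocabulary = { "on": 1, "one": 2, "search": 3, "se": 4, "arch": 5 };
const tokenizer = new BPETokenizer(vocabulary);

// Greedy longest-prefix matching against the trie:
console.log(tokenizer.tokenize("onesearch")); // [2, 3]  -> "one" + "search"

// A character with no vocabulary entry falls back to its char code:
console.log(tokenizer.tokenize("one!"));      // [2, 33] -> "one" + "!" (charCodeAt)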
lib/nlp/tokenize/loadVocab.ts (new file, 21 lines)
@@ -0,0 +1,21 @@
+export async function loadVocab(): Promise<any> {
+    try {
+        // Fetch the vocab.json file
+        const response = await fetch("/model/vocab.json");
+
+        // Check if the request was successful
+        if (!response.ok) {
+            throw new Error(`HTTP error! status: ${response.status}`);
+        }
+
+        // Parse the JSON from the response
+        const data = await response.json();
+
+        // Return the parsed JSON
+        return data;
+    } catch (error) {
+        // Handle any errors that occurred during the fetch or parsing process
+        console.error("Error loading vocab.json:", error);
+        throw error;
+    }
+}
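loadVocab returns the parsed JSON untyped (Promise<any>); the BPETokenizer constructor is what narrows it to a flat token-to-id map. Since the public/model/vocab.json diff is suppressed in this commit, that flat shape is an assumption read off the constructor signature. A small sketch of the intended wiring, with a hypothetical buildTokenizer helper:

import { loadVocab } from "lib/nlp/tokenize/loadVocab";
import BPETokenizer from "lib/nlp/tokenize/BPEtokenizer";

// Assumed vocab.json shape (not shown in this commit):
//   { "<token string>": <token id>, ... }
async function buildTokenizer(): Promise<BPETokenizer> {
    const vocab: { [key: string]: number } = await loadVocab();
    return new BPETokenizer(vocab); // loadVocab rethrows on fetch/parse failure
}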
lib/nlp/tokenize/tokenizer.ts (new file, 7 lines)
@@ -0,0 +1,7 @@
+import BPETokenizer from "./BPEtokenizer";
+
+async function tokenize(text: string, tokenizer: BPETokenizer) {
+    return tokenizer.tokenize(text);
+}
+
+export default tokenize;
@@ -1,24 +0,0 @@
-import { AutoTokenizer, env } from "@xenova/transformers";
-
-async function tokenize(
-    text: string,
-    model: string,
-    mirror: boolean = false,
-    remote: boolean = true
-) {
-    if (mirror) {
-        env.remoteHost = "https://hf-mirror.com";
-    }
-    if (!remote) {
-        env.allowRemoteModels = false;
-    }
-    const tokenizer = await AutoTokenizer.from_pretrained(model);
-    const { input_ids } = await tokenizer(text);
-    const tokenIds = [];
-    for (let id of input_ids.data) {
-        tokenIds.push(parseInt(id));
-    }
-    return tokenIds;
-}
-
-export default tokenize;
@@ -48,8 +48,8 @@ export default function AboutPage() {
             </p>
 
             <p className="relative font-bold text-2xl mt-12">Presented By</p>
-            {!darkMode && <img src="/LuminaraStudio.png" className="relative md:h-64 mt-6" />}
-            {darkMode && <img src="/LuminaraStudioDark.png" className="relative md:h-56 mt-6" />}
+            {!darkMode && <img src="/assets/img/LuminaraStudio.png" className="relative md:h-64 mt-6" />}
+            {darkMode && <img src="/assets/img/LuminaraStudioDark.png" className="relative md:h-56 mt-6" />}
         </AboutLayout>
     );
 }
(two binary image files changed; before/after sizes unchanged: 84 KiB and 81 KiB)
public/model/vocab.json (new file, 1 line)
(diff suppressed: the single line is too long and the file is too large)
@@ -1,207 +0,0 @@
-{
-  "add_bos_token": false,
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "151643": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151644": {
-      "content": "<|im_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151645": {
-      "content": "<|im_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151646": {
-      "content": "<|object_ref_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151647": {
-      "content": "<|object_ref_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151648": {
-      "content": "<|box_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151649": {
-      "content": "<|box_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151650": {
-      "content": "<|quad_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151651": {
-      "content": "<|quad_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151652": {
-      "content": "<|vision_start|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151653": {
-      "content": "<|vision_end|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151654": {
-      "content": "<|vision_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151655": {
-      "content": "<|image_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151656": {
-      "content": "<|video_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "151657": {
-      "content": "<tool_call>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151658": {
-      "content": "</tool_call>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151659": {
-      "content": "<|fim_prefix|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151660": {
-      "content": "<|fim_middle|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151661": {
-      "content": "<|fim_suffix|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151662": {
-      "content": "<|fim_pad|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151663": {
-      "content": "<|repo_name|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    },
-    "151664": {
-      "content": "<|file_sep|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": false
-    }
-  },
-  "additional_special_tokens": [
-    "<|im_start|>",
-    "<|im_end|>",
-    "<|object_ref_start|>",
-    "<|object_ref_end|>",
-    "<|box_start|>",
-    "<|box_end|>",
-    "<|quad_start|>",
-    "<|quad_end|>",
-    "<|vision_start|>",
-    "<|vision_end|>",
-    "<|vision_pad|>",
-    "<|image_pad|>",
-    "<|video_pad|>"
-  ],
-  "bos_token": null,
-  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
-  "errors": "replace",
-  "model_max_length": 131072,
-  "pad_token": "<|endoftext|>",
-  "split_special_tokens": false,
-  "tokenizer_class": "Qwen2Tokenizer",
-  "unk_token": null
-}
@@ -1,5 +1,5 @@
 import { describe, expect, test } from "bun:test";
-import tokenize from "../lib/nlp/tokenizer";
+import tokenize from "../lib/nlp/tokenize/tokenizer";
 
 describe("Test if tokenizer works", () => {
     test("Using without a mirror", async () => {