add: embedding

This commit is contained in:
alikia2x (寒寒) 2024-09-22 21:09:19 +08:00
parent 668a88e917
commit 973b926544
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
11 changed files with 108 additions and 33 deletions

BIN
bun.lockb

Binary file not shown.

View File

@ -17,6 +17,8 @@ import i18next from "i18next";
import { useTranslation } from "react-i18next";
import { keywordSuggestion } from "lib/onesearch/keywordSuggestion";
import { NLUType } from "lib/nlp/load";
import tokenize from "lib/nlp/tokenizer";
import { getEmbedding, getEmbeddingLayer } from "lib/nlp/getEmbedding";
export default function OneSearch() {
const [suggestion, setFinalSuggetsion] = useAtom(suggestionAtom);
@ -104,6 +106,24 @@ export default function OneSearch() {
});
}, [NLUModelLoaded]);
// Real test for tokenizing & embedding
// It works.
// useEffect(() => {
// (async function () {
// const result = await tokenize("你好", "Qwen/Qwen2.5-3B", true, false);
// })();
// }, []);
// useEffect(() => {
// const embedding_file = "/models/token_embeddings.bin";
// (async function () {
// const result = await fetch(embedding_file);
// const arrBuf = await result.arrayBuffer();
// const embeddingDict = getEmbeddingLayer(arrBuf);
// const e = getEmbedding([108386], embeddingDict, 12);
// })();
// }, []);
useEffect(() => {
cleanSuggestion("default-link", "default", "text", "link");
if (validLink(query)) {

View File

@ -0,0 +1,68 @@
/**
 * Parse a raw binary embedding table into a token-id -> vector lookup.
 *
 * The buffer is expected to hold consecutive fixed-size entries, each a
 * vector of little-endian IEEE 754 half-precision (16-bit) floats, which
 * are widened to Float32 here. Entry i becomes the vector for token id i.
 *
 * @param buffer Raw bytes of the embedding table.
 * @param dim    Vector dimensionality; defaults to 96 (192 bytes per entry).
 * @returns Mapping from token id to its Float32Array embedding vector.
 */
function getEmbeddingLayer(buffer: ArrayBuffer, dim: number = 96): { [key: number]: Float32Array } {
    const dict: { [key: number]: Float32Array } = {};
    const entrySize = dim * 2; // 2 bytes per half-float component
    // Floor so a trailing partial entry cannot produce a fractional loop
    // bound / out-of-range reads.
    const numEntries = Math.floor(buffer.byteLength / entrySize);
    const dataView = new DataView(buffer);

    for (let i = 0; i < numEntries; i++) {
        const offset = i * entrySize;
        const floatArray = new Float32Array(dim);
        for (let j = 0; j < dim; j++) {
            // Embedding files are written little-endian.
            const halfFloat = dataView.getUint16(offset + j * 2, true);
            floatArray[j] = halfFloatToFloat32(halfFloat);
        }
        dict[i] = floatArray;
    }
    return dict;
}
/**
 * Convert a 16-bit IEEE 754 half-precision bit pattern to a JS number.
 *
 * Handles all four encoding classes: signed zero, subnormals, normalized
 * values, and the all-ones exponent (infinities / NaN).
 *
 * @param halfFloat Raw 16-bit value (e.g. from DataView.getUint16).
 * @returns The numeric value the bit pattern encodes.
 */
function halfFloatToFloat32(halfFloat: number): number {
    const negative = (halfFloat & 0x8000) !== 0;
    const exponent = (halfFloat & 0x7c00) >> 10;
    const mantissa = halfFloat & 0x03ff;

    let magnitude: number;
    if (exponent === 0x1f) {
        // All-ones exponent: NaN when the mantissa is non-zero, else infinity.
        if (mantissa !== 0) return NaN;
        magnitude = Infinity;
    } else if (exponent === 0) {
        // Zero exponent: signed zero (mantissa 0) or a subnormal value;
        // both reduce to mantissa/1024 scaled by 2^-14.
        magnitude = (mantissa / 1024.0) * Math.pow(2, -14);
    } else {
        // Normalized value: implicit leading 1, exponent bias of 15.
        magnitude = (mantissa / 1024.0 + 1.0) * Math.pow(2, exponent - 15);
    }
    return negative ? -magnitude : magnitude;
}
/**
 * Build a fixed-size context embedding by concatenating per-token vectors.
 *
 * The first `contextSize` token ids are looked up in `embeddingDict` and
 * their vectors written back-to-back. Positions past the end of `tokenIds`
 * — and ids missing from the dictionary — are zero-filled, so the result
 * always has length `contextSize * dim`.
 *
 * @param tokenIds      Token ids to embed, in order.
 * @param embeddingDict Token id -> embedding vector (all of length `dim`).
 * @param contextSize   Number of token slots in the output.
 * @param dim           Per-token vector length; defaults to 96.
 * @returns Flat Float32Array of length `contextSize * dim`.
 */
function getEmbedding(
    tokenIds: number[],
    embeddingDict: { [key: number]: Float32Array },
    contextSize: number,
    dim: number = 96
): Float32Array {
    // Preallocate once (zero-initialized) instead of repeatedly concatenating
    // plain arrays, which was O(n^2) in total output size.
    const result = new Float32Array(contextSize * dim);
    for (let i = 0; i < contextSize && i < tokenIds.length; i++) {
        const vector = embeddingDict[tokenIds[i]];
        // Unknown token ids stay zero-padded instead of crashing on
        // Array.from(undefined).
        if (vector !== undefined) {
            result.set(vector, i * dim);
        }
    }
    return result;
}
export { getEmbeddingLayer, getEmbedding };

View File

@ -1,9 +1,12 @@
import { AutoTokenizer, env } from "@xenova/transformers";
env.allowRemoteModels = false;
env.localModelPath = "/transformers/";
async function tokenize(text: string, model: string) {
async function tokenize(text: string, model: string, mirror: boolean = false, remote: boolean = true) {
if (mirror) {
env.remoteHost = "https://hf-mirror.com";
}
if (!remote) {
env.allowRemoteModels = false;
}
const tokenizer = await AutoTokenizer.from_pretrained(model);
const { input_ids } = await tokenizer(text);
const tokenIds = [];

Binary file not shown.

View File

@ -1,28 +0,0 @@
{
"architectures": [
"Qwen2ForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151643,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 11008,
"max_position_embeddings": 32768,
"max_window_layers": 36,
"model_type": "qwen2",
"num_attention_heads": 16,
"num_hidden_layers": 36,
"num_key_value_heads": 2,
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"sliding_window": 32768,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.40.1",
"use_cache": true,
"use_mrope": false,
"use_sliding_window": false,
"vocab_size": 151936
}

File diff suppressed because one or more lines are too long

View File

13
test/tokenize.test.ts Normal file
View File

@ -0,0 +1,13 @@
import { describe, expect, test } from "bun:test";
import tokenize from "../lib/nlp/tokenizer";
// Integration tests: tokenizing the same text must yield identical token ids
// whether or not the Hugging Face mirror host is used.
describe("Test if tokenizer works", () => {
    test("Using without a mirror", async () => {
        // `result` is never reassigned — use const.
        const result = await tokenize("你好,世界!", "Qwen/Qwen2.5-3B", false);
        expect(result).toEqual([108386, 3837, 99489, 6313]);
    });
    test("Using with a mirror", async () => {
        const result = await tokenize("你好,世界!", "Qwen/Qwen2.5-3B", true);
        expect(result).toEqual([108386, 3837, 99489, 6313]);
    });
});