add: tokenizer and getEmbedding

remove: model
alikia2x (寒寒) 2024-09-22 20:14:09 +08:00
parent d97e678206
commit 5abe7521ba
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
3 changed files with 94 additions and 0 deletions

lib/nlp/getEmbedding.ts (new file, 38 additions)

@@ -0,0 +1,38 @@
type EmbeddingDict = { [key: number]: Float32Array };

// Each entry in the binary embedding file is 514 bytes:
// a 2-byte little-endian token ID followed by 128 little-endian 32-bit floats.
function getEmbeddingLayer(buffer: Buffer): EmbeddingDict {
    const dict: EmbeddingDict = {};

    const entrySize = 514;
    const numEntries = buffer.length / entrySize;

    for (let i = 0; i < numEntries; i++) {
        const offset = i * entrySize;
        const key = buffer.readUInt16LE(offset);
        const floatArray = new Float32Array(128);
        for (let j = 0; j < 128; j++) {
            floatArray[j] = buffer.readFloatLE(offset + 2 + j * 4);
        }
        dict[key] = floatArray;
    }

    return dict;
}

// Concatenate the embeddings of the first `contextSize` tokens into one
// Float32Array, padding with 128-dimensional zero vectors when the input
// has fewer tokens than the context window.
function getEmbedding(tokenIds: number[], embeddingDict: EmbeddingDict, contextSize: number): Float32Array {
    let result: number[] = [];
    for (let i = 0; i < contextSize; i++) {
        if (i < tokenIds.length) {
            const tokenId = tokenIds[i];
            result = result.concat(Array.from(embeddingDict[tokenId]));
        } else {
            result = result.concat(new Array(128).fill(0));
        }
    }
    return new Float32Array(result);
}

export { getEmbeddingLayer, getEmbedding };
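
For context, a minimal usage sketch (not part of this commit). It assumes the embedding matrix was exported to a binary file such as model/embedding.bin in the 514-byte-per-entry layout read above, and that a vocabulary file model/vocab.json maps token strings to IDs; both paths are placeholders, not files confirmed by this repository.

import { readFileSync } from "node:fs";
import { getEmbeddingLayer, getEmbedding } from "./lib/nlp/getEmbedding";
import tokenize, { TokenDict } from "./lib/nlp/tokenizer";

// Placeholder asset paths; substitute whatever files actually ship with the model.
const embeddingDict = getEmbeddingLayer(readFileSync("model/embedding.bin"));
const tokenDict: TokenDict = JSON.parse(readFileSync("model/vocab.json", "utf-8"));

const tokenIds = tokenize("hello world", tokenDict);

// With a context window of 8 tokens, the result is 8 * 128 = 1024 floats.
const input = getEmbedding(tokenIds, embeddingDict, 8);
console.log(input.length); // 1024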

lib/nlp/tokenizer.ts (new file, 56 additions)

@@ -0,0 +1,56 @@
type TokenDict = { [key: string]: number };

function tokenize(query: string, tokenDict: TokenDict): number[] {
    const tokenIds: number[] = [];
    let index = 0;

    // Step 1: Normalize the query the way the vocabulary expects it:
    // prepend "▁", replace spaces with "▁", and newlines with the "<0x0A>" byte token.
    query = "▁" + query.replace(/ /g, "▁");
    query = query.replace(/\n/g, "<0x0A>");

    while (index < query.length) {
        let bestToken = null;
        let bestLength = 0;

        // Step 2: Find the longest token that matches the beginning of the remaining query.
        for (const token in tokenDict) {
            if (query.startsWith(token, index) && token.length > bestLength) {
                bestToken = token;
                bestLength = token.length;
            }
        }

        if (bestToken) {
            tokenIds.push(tokenDict[bestToken]);
            index += bestLength;
            continue;
        }

        // Step 3: Handle the case where no token matches.
        // Read a full code point so surrogate pairs are not split.
        const char = String.fromCodePoint(query.codePointAt(index)!);
        if (char.charCodeAt(0) <= 127) {
            // If the character is ASCII and it doesn't match any token, treat it as an unknown token.
            console.error(`Unknown token: ${char}`);
            index++;
            continue;
        }

        // If the character is non-ASCII, convert it to its UTF-8 bytes and match each byte token.
        const bytes = new TextEncoder().encode(char);
        for (const byte of bytes) {
            // Byte tokens use two uppercase hex digits, e.g. "<0x0A>".
            const byteToken = `<0x${byte.toString(16).toUpperCase().padStart(2, "0")}>`;
            if (tokenDict[byteToken] === undefined) {
                // Skip this byte only; the character position still advances once below.
                console.error(`Unknown byte token: ${byteToken}`);
                continue;
            }
            tokenIds.push(tokenDict[byteToken]);
        }
        index += char.length;
    }

    return tokenIds;
}

export default tokenize;
export type { TokenDict };
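
A quick illustration of the greedy longest-match with UTF-8 byte fallback, using a toy vocabulary (illustrative only; the real token IDs come from the trained tokenizer, not this commit):

import tokenize, { TokenDict } from "./lib/nlp/tokenizer";

// Toy vocabulary for demonstration; not the model's actual token IDs.
const toyDict: TokenDict = {
    "▁hello": 1,
    "▁world": 2,
    "▁": 3,
    "<0x0A>": 4,
    "<0xE4>": 5,
    "<0xBD>": 6,
    "<0xA0>": 7,
};

// "hello world" becomes "▁hello▁world" and matches two whole-word tokens: [1, 2]
console.log(tokenize("hello world", toyDict));

// "你" is not in the vocabulary: the leading "▁" matches token 3, then the
// character falls back to its UTF-8 bytes E4 BD A0, giving [3, 5, 6, 7]
console.log(tokenize("你", toyDict));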

Binary file not shown.