add: tokenizer and getEmbedding

remove: model
alikia2x (寒寒) 2024-09-22 20:14:09 +08:00
parent d97e678206
commit 5abe7521ba
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
3 changed files with 94 additions and 0 deletions

lib/nlp/getEmbedding.ts (new file, 38 additions)

@@ -0,0 +1,38 @@
type EmbeddingDict = { [key: number]: Float32Array };

// Each entry in the binary embedding file is 514 bytes:
// a 2-byte little-endian token ID followed by 128 little-endian 32-bit floats.
function getEmbeddingLayer(buffer: Buffer): EmbeddingDict {
    const dict: EmbeddingDict = {};

    const entrySize = 514;
    const numEntries = buffer.length / entrySize;

    for (let i = 0; i < numEntries; i++) {
        const offset = i * entrySize;
        const key = buffer.readUInt16LE(offset);
        const floatArray = new Float32Array(128);
        for (let j = 0; j < 128; j++) {
            floatArray[j] = buffer.readFloatLE(offset + 2 + j * 4);
        }
        dict[key] = floatArray;
    }

    return dict;
}

// Concatenate the embeddings of the first `contextSize` tokens into one
// Float32Array, padding with 128-dimensional zero vectors when the input
// has fewer tokens than the context window.
function getEmbedding(tokenIds: number[], embeddingDict: EmbeddingDict, contextSize: number): Float32Array {
    let result: number[] = [];
    for (let i = 0; i < contextSize; i++) {
        if (i < tokenIds.length) {
            const tokenId = tokenIds[i];
            result = result.concat(Array.from(embeddingDict[tokenId]));
        } else {
            result = result.concat(new Array(128).fill(0));
        }
    }
    return new Float32Array(result);
}

export { getEmbeddingLayer, getEmbedding };
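
For context, a minimal usage sketch (not part of this commit). It assumes the embedding matrix was exported to a binary file such as model/embedding.bin in the 514-byte-per-entry layout read above, and that a vocabulary file model/vocab.json maps token strings to IDs; both paths are placeholders, not files confirmed by this repository.

import { readFileSync } from "node:fs";
import { getEmbeddingLayer, getEmbedding } from "./lib/nlp/getEmbedding";
import tokenize, { TokenDict } from "./lib/nlp/tokenizer";

// Placeholder asset paths; substitute whatever files actually ship with the model.
const embeddingDict = getEmbeddingLayer(readFileSync("model/embedding.bin"));
const tokenDict: TokenDict = JSON.parse(readFileSync("model/vocab.json", "utf-8"));

const tokenIds = tokenize("hello world", tokenDict);

// With a context window of 8 tokens, the result is 8 * 128 = 1024 floats.
const input = getEmbedding(tokenIds, embeddingDict, 8);
console.log(input.length); // 1024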

lib/nlp/tokenizer.ts (new file, 56 additions)

@@ -0,0 +1,56 @@
type TokenDict = { [key: string]: number };

function tokenize(query: string, tokenDict: TokenDict): number[] {
    const tokenIds: number[] = [];
    let index = 0;

    // Step 1: Normalize the query the way the vocabulary expects it:
    // prepend "▁", replace spaces with "▁", and newlines with the "<0x0A>" byte token.
    query = "▁" + query.replace(/ /g, "▁");
    query = query.replace(/\n/g, "<0x0A>");

    while (index < query.length) {
        let bestToken = null;
        let bestLength = 0;

        // Step 2: Find the longest token that matches the beginning of the remaining query.
        for (const token in tokenDict) {
            if (query.startsWith(token, index) && token.length > bestLength) {
                bestToken = token;
                bestLength = token.length;
            }
        }

        if (bestToken) {
            tokenIds.push(tokenDict[bestToken]);
            index += bestLength;
            continue;
        }

        // Step 3: Handle the case where no token matches.
        // Read a full code point so surrogate pairs are not split.
        const char = String.fromCodePoint(query.codePointAt(index)!);
        if (char.charCodeAt(0) <= 127) {
            // If the character is ASCII and it doesn't match any token, treat it as an unknown token.
            console.error(`Unknown token: ${char}`);
            index++;
            continue;
        }

        // If the character is non-ASCII, convert it to its UTF-8 bytes and match each byte token.
        const bytes = new TextEncoder().encode(char);
        for (const byte of bytes) {
            // Byte tokens use two uppercase hex digits, e.g. "<0x0A>".
            const byteToken = `<0x${byte.toString(16).toUpperCase().padStart(2, "0")}>`;
            if (tokenDict[byteToken] === undefined) {
                // Skip this byte only; the character position still advances once below.
                console.error(`Unknown byte token: ${byteToken}`);
                continue;
            }
            tokenIds.push(tokenDict[byteToken]);
        }
        index += char.length;
    }

    return tokenIds;
}

export default tokenize;
export type { TokenDict };
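
A quick illustration of the greedy longest-match with UTF-8 byte fallback, using a toy vocabulary (illustrative only; the real token IDs come from the trained tokenizer, not this commit):

import tokenize, { TokenDict } from "./lib/nlp/tokenizer";

// Toy vocabulary for demonstration; not the model's actual token IDs.
const toyDict: TokenDict = {
    "▁hello": 1,
    "▁world": 2,
    "▁": 3,
    "<0x0A>": 4,
    "<0xE4>": 5,
    "<0xBD>": 6,
    "<0xA0>": 7,
};

// "hello world" becomes "▁hello▁world" and matches two whole-word tokens: [1, 2]
console.log(tokenize("hello world", toyDict));

// "你" is not in the vocabulary: the leading "▁" matches token 3, then the
// character falls back to its UTF-8 bytes E4 BD A0, giving [3, 5, 6, 7]
console.log(tokenize("你", toyDict));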

Binary file not shown.