diff --git a/lib/nlp/getEmbedding.ts b/lib/nlp/getEmbedding.ts
new file mode 100644
index 0000000..e20b863
--- /dev/null
+++ b/lib/nlp/getEmbedding.ts
@@ -0,0 +1,41 @@
+type EmbeddingDict = { [key: number]: Float32Array };
+
+// Parse a binary embedding table: each 514-byte entry is a uint16 token id
+// followed by 128 little-endian float32 values.
+function getEmbeddingLayer(buffer: Buffer): EmbeddingDict {
+  const dict: EmbeddingDict = {};
+
+  const entrySize = 514;
+  const numEntries = buffer.length / entrySize;
+
+  for (let i = 0; i < numEntries; i++) {
+    const offset = i * entrySize;
+    const key = buffer.readUInt16LE(offset);
+    const floatArray = new Float32Array(128);
+
+    for (let j = 0; j < 128; j++) {
+      floatArray[j] = buffer.readFloatLE(offset + 2 + j * 4);
+    }
+
+    dict[key] = floatArray;
+  }
+
+  return dict;
+}
+
+// Concatenate the embeddings of the first `contextSize` tokens into one flat
+// vector, zero-padding when the query is shorter than the context window.
+function getEmbedding(tokenIds: number[], embeddingDict: EmbeddingDict, contextSize: number): Float32Array {
+  let result: number[] = [];
+  for (let i = 0; i < contextSize; i++) {
+    if (i < tokenIds.length) {
+      const tokenId = tokenIds[i];
+      result = result.concat(Array.from(embeddingDict[tokenId]));
+    } else {
+      result = result.concat(new Array(128).fill(0));
+    }
+  }
+  return new Float32Array(result);
+}
+
+export { getEmbeddingLayer, getEmbedding };
\ No newline at end of file
diff --git a/lib/nlp/tokenizer.ts b/lib/nlp/tokenizer.ts
new file mode 100644
index 0000000..02dd516
--- /dev/null
+++ b/lib/nlp/tokenizer.ts
@@ -0,0 +1,57 @@
+type TokenDict = { [key: string]: number };
+
+function tokenize(query: string, tokenDict: TokenDict): number[] {
+  const tokenIds: number[] = [];
+  let index = 0;
+
+  // Step 1: Preprocess the query: prepend "▁", replace spaces with "▁"
+  // and newlines with the "<0x0A>" byte token
+  query = "▁" + query.replace(/ /g, "▁");
+  query = query.replace(/\n/g, "<0x0A>");
+
+  while (index < query.length) {
+    let bestToken = null;
+    let bestLength = 0;
+
+    // Step 2: Find the longest token that matches the beginning of the remaining query
+    for (const token in tokenDict) {
+      if (query.startsWith(token, index) && token.length > bestLength) {
+        bestToken = token;
+        bestLength = token.length;
+      }
+    }
+
+    if (bestToken) {
+      tokenIds.push(tokenDict[bestToken]);
+      index += bestLength;
+      continue;
+    }
+
+    // Step 3: Handle the case where no token matches
+    const char = query[index];
+    if (char.charCodeAt(0) <= 127) {
+      // If the character is ASCII and doesn't match any token, skip it as unknown
+      console.error(`Unknown token: ${char}`);
+      index++;
+      continue;
+    }
+
+    // If the character is non-ASCII, encode it as UTF-8 and emit one byte token per byte
+    const bytes = new TextEncoder().encode(char);
+    for (const byte of bytes) {
+      // Pad to two hex digits so e.g. byte 0x0A yields "<0x0A>", not "<0xA>"
+      const byteToken = `<0x${byte.toString(16).toUpperCase().padStart(2, "0")}>`;
+      if (tokenDict[byteToken] === undefined) {
+        console.error(`Unknown byte token: ${byteToken}`);
+        continue; // skip this byte; `index` advances once per character below
+      }
+      tokenIds.push(tokenDict[byteToken]);
+    }
+    index++;
+  }
+
+  return tokenIds;
+}
+
+export default tokenize;
+export type { TokenDict };
diff --git a/public/model b/public/model
deleted file mode 100644
index 0c4ef7e..0000000
Binary files a/public/model and /dev/null differ
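
Taken together, the two new modules turn a query string into a fixed-length input vector: `tokenize` maps text to token ids via longest-match lookup with a byte-level fallback, and `getEmbedding` concatenates the 128-dimensional vector for each id, zero-padding up to the context window. A minimal usage sketch follows; the asset paths (`model/embeddings.bin`, `model/tokens.json`), the JSON token-map format, and the context size of 64 are all assumptions for illustration, not part of this diff (which only deletes the old `public/model` binary):

```ts
import { readFileSync } from "fs";
import tokenize, { TokenDict } from "./lib/nlp/tokenizer";
import { getEmbeddingLayer, getEmbedding } from "./lib/nlp/getEmbedding";

// Hypothetical assets: a 514-byte-per-entry binary embedding table and a
// JSON map from token strings to ids.
const embeddingDict = getEmbeddingLayer(readFileSync("model/embeddings.bin"));
const tokenDict: TokenDict = JSON.parse(readFileSync("model/tokens.json", "utf8"));

const contextSize = 64; // assumed context window
const tokenIds = tokenize("hello world", tokenDict);
const input = getEmbedding(tokenIds, embeddingDict, contextSize);
console.log(input.length); // 8192 (= contextSize * 128 floats)
```

Flattening per-token embeddings into a single `Float32Array` keeps the downstream consumer simple: whatever model reads this always receives exactly `contextSize * 128` floats regardless of query length.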