// sparkast/lib/nlp/tokenize/BPEtokenizer.ts

/**
 * One node of the vocabulary trie.
 *
 * A freshly created node has no outgoing edges and does not terminate any
 * token.
 */
class TrieNode {
    /** Outgoing edges: maps a single character to the node reached through it. */
    children: Map<string, TrieNode> = new Map();

    /** Id of the token that ends exactly at this node, or `null` if none does. */
    tokenId: number | null = null;
}

/**
 * Prefix tree mapping token strings to token ids.
 *
 * Edges are labelled with the units produced by iterating a string with
 * `for...of`, i.e. whole Unicode code points rather than UTF-16 code units.
 */
class Trie {
    root: TrieNode;

    constructor() {
        this.root = new TrieNode();
    }

    /**
     * Registers `token` under `tokenId`, creating any missing nodes along its
     * path. Inserting the same token again overwrites the stored id.
     *
     * @param token - The token text to store.
     * @param tokenId - The id to associate with `token`.
     */
    insert(token: string, tokenId: number): void {
        let node = this.root;
        for (const char of token) {
            let next = node.children.get(char);
            if (next === undefined) {
                next = new TrieNode();
                node.children.set(char, next);
            }
            node = next;
        }
        node.tokenId = tokenId;
    }

    /**
     * Finds the longest stored token that is a prefix of `text`.
     *
     * @param text - The text whose beginning is matched against the trie.
     * @returns A pair `[tokenId, length]`. `tokenId` is the id of the longest
     *   matching token, or `null` when no non-empty token matches. `length`
     *   is the length of that matched token in UTF-16 code units (the same
     *   unit as `string.length` and `String.prototype.slice`), or `0` when
     *   there is no match.
     */
    searchLongestToken(text: string): [number | null, number] {
        let node = this.root;
        let longestTokenId: number | null = null;
        let longestTokenLength = 0;
        // UTF-16 code units walked so far, whether or not they end a token.
        let consumed = 0;

        for (const char of text) {
            const next = node.children.get(char);
            if (next === undefined) {
                break;
            }
            node = next;
            // `char` is a full code point; it spans 2 code units for astral characters.
            consumed += char.length;
            if (node.tokenId !== null) {
                // Remember the length at the last token boundary only. The walk may
                // continue into a longer token's path and then dead-end, and those
                // extra characters must not be reported as consumed.
                longestTokenId = node.tokenId;
                longestTokenLength = consumed;
            }
        }

        return [longestTokenId, longestTokenLength];
    }
}

/**
 * Greedy longest-match tokenizer backed by a trie built from a fixed
 * vocabulary.
 */
export default class BPETokenizer {
    private readonly trie: Trie;

    /**
     * Builds the lookup trie from the vocabulary's own enumerable keys.
     *
     * @param vocabulary - Maps each token string to its numeric token id.
     */
    constructor(vocabulary: { [key: string]: number }) {
        this.trie = new Trie();
        // Object.keys never calls a method on `vocabulary` itself, so a token
        // literally named "hasOwnProperty" or a null-prototype dictionary
        // cannot break construction.
        for (const token of Object.keys(vocabulary)) {
            this.trie.insert(token, vocabulary[token]);
        }
    }

    /**
     * Converts `text` into token ids by repeatedly taking the longest
     * vocabulary token that matches at the current position.
     *
     * When nothing in the vocabulary matches at a position, the UTF-16 code
     * unit at that position is emitted as its own id (`charCodeAt`) and the
     * scan advances by one code unit. These fallback ids share the number
     * space with vocabulary ids.
     *
     * @param text - The text to tokenize.
     * @returns The token ids in order of appearance; empty for empty input.
     */
    tokenize(text: string): number[] {
        const tokenIds: number[] = [];
        let i = 0;

        while (i < text.length) {
            const [longestTokenId, length] = this.trie.searchLongestToken(text.slice(i));
            if (longestTokenId !== null) {
                tokenIds.push(longestTokenId);
                i += length;
            } else {
                // If no token is found, treat the character as a single token
                tokenIds.push(text.charCodeAt(i));
                i += 1;
            }
        }

        return tokenIds;
    }
}

// Example usage:
// const vocabulary = {
//     'aa': 1,
//     'bb': 2,
//     'ab': 3,
//     'ba': 4,
//     'a': 5,
//     'b': 6
// };
// const tokenizer = new BPETokenizer(vocabulary);

// const text = 'ababbaa';
// const tokenIds = tokenizer.tokenize(text);
// console.log(tokenIds); // Output: [ 3, 3, 4, 5 ]  (greedy longest match: 'ab', 'ab', 'ba', 'a')