sparkast/lib/nlp/tokenizer.ts
2024-09-22 21:09:19 +08:00

19 lines
522 B
TypeScript

import { AutoTokenizer, env } from "@xenova/transformers";
/**
 * Tokenizes `text` with the pretrained tokenizer identified by `model` and
 * returns the resulting token ids as plain numbers.
 *
 * Note: the `mirror` and `remote` flags are applied by mutating the shared
 * `env` object imported from `@xenova/transformers`. This function never
 * restores the previous values, so the settings remain in effect for later
 * calls (including calls that pass the default flags).
 *
 * @param text - The input text to tokenize.
 * @param model - Model identifier passed to `AutoTokenizer.from_pretrained`.
 * @param mirror - When `true`, sets `env.remoteHost` to `https://hf-mirror.com`.
 * @param remote - When `false`, sets `env.allowRemoteModels` to `false`.
 * @returns The token ids read from the tokenizer's `input_ids` output.
 */
async function tokenize(
  text: string,
  model: string,
  mirror: boolean = false,
  remote: boolean = true,
): Promise<number[]> {
  if (mirror) {
    env.remoteHost = "https://hf-mirror.com";
  }
  if (!remote) {
    env.allowRemoteModels = false;
  }

  const tokenizer = await AutoTokenizer.from_pretrained(model);
  const { input_ids } = await tokenizer(text);

  // NOTE(review): `input_ids.data` is presumably a BigInt64Array (int64 tensor);
  // `Number()` converts both bigint and number elements directly, whereas
  // `parseInt` expects a string and only worked through implicit coercion.
  const tokenIds: number[] = [];
  for (const id of input_ids.data) {
    tokenIds.push(Number(id));
  }
  return tokenIds;
}
export default tokenize;