// TypeScript · 32 lines · 1.1 KiB
import { AutoModel, AutoTokenizer, Tensor } from '@huggingface/transformers';

// Repository id used for both the model and the tokenizer.
const modelName = "alikia2x/jina-embedding-v3-m2v-1024";

// Options forwarded to AutoModel.from_pretrained.
const modelConfig = {
  config: { model_type: 'model2vec' },
  dtype: 'fp32',
  revision: 'refs/pr/1',
  cache_dir: undefined,
  // NOTE(review): presumably restricts loading to files already on disk —
  // confirm this is intended together with a PR revision.
  local_files_only: true,
};

// Options forwarded to AutoTokenizer.from_pretrained.
// The tokenizer is loaded from a different revision than the model.
const tokenizerConfig = {
  revision: 'refs/pr/2',
};

/**
 * Computes, for every token sequence, the index at which it starts inside
 * the flattened (concatenated) token array.
 *
 * Example: [[7], [8, 9], [10]] -> [0, 1, 3]
 *
 * @param {number[][]} tokenLists - One array of token ids per input text.
 * @returns {number[]} One start offset per sequence (empty for no sequences).
 */
function computeOffsets(tokenLists) {
  const offsets = [];
  let start = 0;
  for (const tokens of tokenLists) {
    offsets.push(start);
    start += tokens.length;
  }
  return offsets;
}

const model = await AutoModel.from_pretrained(modelName, modelConfig);
const tokenizer = await AutoTokenizer.from_pretrained(modelName, tokenizerConfig);

const texts = ['hello', 'hello world'];

// Request plain nested arrays (not a Tensor) without special tokens, so the
// per-text token lists can be measured and flattened below.
const { input_ids } = await tokenizer(texts, {
  add_special_tokens: false,
  return_tensor: false,
});

// The model is fed one flat 1-D array of all token ids plus the start offset
// of each text within that array.
const offsets = computeOffsets(input_ids);
const flattened_input_ids = input_ids.flat();

const modelInputs = {
  input_ids: new Tensor('int64', flattened_input_ids, [flattened_input_ids.length]),
  offsets: new Tensor('int64', offsets, [offsets.length]),
};

const { embeddings } = await model(modelInputs);
console.log(embeddings.tolist()); // output matches python version