import { AutoModel, AutoTokenizer, Tensor } from '@huggingface/transformers';

// Example: embed a small batch of texts with a model2vec checkpoint.
// All texts are tokenized, their token ids are concatenated into one flat
// tensor, and a second tensor of start offsets tells the model where each
// text begins inside that flat tensor.

const modelName = 'alikia2x/jina-embedding-v3-m2v-1024';

// Options passed to AutoModel.from_pretrained. The model is loaded with
// `local_files_only: true`; `cache_dir` is left undefined.
const modelConfig = {
  config: { model_type: 'model2vec' },
  dtype: 'fp32',
  revision: 'refs/pr/1',
  cache_dir: undefined,
  local_files_only: true,
};

// The tokenizer is requested at a different revision than the model.
const tokenizerConfig = {
  revision: 'refs/pr/2',
};

/**
 * Compute the start offset of every sequence inside the flattened
 * concatenation of all sequences.
 *
 * Runs in a single pass with a running total instead of rebuilding the
 * result array on every step.
 *
 * @param {ArrayLike<unknown>[]} sequences - Token-id sequences, one per text.
 * @returns {number[]} One offset per sequence; the first is always 0.
 *   An empty input yields an empty array.
 */
function computeOffsets(sequences) {
  const offsets = [];
  let position = 0;
  for (const sequence of sequences) {
    offsets.push(position);
    position += sequence.length;
  }
  return offsets;
}

const model = await AutoModel.from_pretrained(modelName, modelConfig);
const tokenizer = await AutoTokenizer.from_pretrained(modelName, tokenizerConfig);

const texts = ['hello', 'hello world'];

// `return_tensor: false` is requested so that `input_ids` can be handled with
// plain array methods below (one entry per text, each with a `.length`).
const { input_ids } = await tokenizer(texts, {
  add_special_tokens: false,
  return_tensor: false,
});

const offsets = computeOffsets(input_ids);
const flatInputIds = input_ids.flat();

// Both inputs are one-dimensional int64 tensors.
const modelInputs = {
  input_ids: new Tensor('int64', flatInputIds, [flatInputIds.length]),
  offsets: new Tensor('int64', offsets, [offsets.length]),
};

const { embeddings } = await model(modelInputs);

// Original author's note: output matches python version.
console.log(embeddings.tolist());