cvsa/filter/embedding_range.py

import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from embedding import prepare_batch

# JSONL file of model-filtered examples to embed.
file_path = './data/filter/model_predicted.jsonl'
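# Expected input format (illustrative sketch only; field names are inferred from
# the code below, values are placeholders):
# {"title": "...", "description": "...", "tags": ["tag1", "tag2"], "author_info": "..."}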

class Dataset:
    """Loads JSONL examples and serves them as batches of raw text fields."""

    def __init__(self, file_path):
        self.examples = self.load_data(file_path)
        self.batch_size = None  # set on each get_batch() call

    def load_data(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f]

    def __getitem__(self, idx):
        # Slice the idx-th batch and group it by text channel.
        start_idx = idx * self.batch_size
        end_idx = min((idx + 1) * self.batch_size, len(self.examples))
        batch_examples = self.examples[start_idx:end_idx]
        texts = {
            'title': [ex['title'] for ex in batch_examples],
            'description': [ex['description'] for ex in batch_examples],
            'tags': [",".join(ex['tags']) for ex in batch_examples],
            'author_info': [ex['author_info'] for ex in batch_examples]
        }
        return texts

    def __len__(self):
        return len(self.examples)

    def get_batch(self, idx, batch_size):
        self.batch_size = batch_size
        return self.__getitem__(idx)

# Sampling configuration: arr_len assumes each example yields 4 text channels
# embedded into 1024-dim vectors, so a flattened batch holds
# batch_size * 4 * 1024 values; keep a 10% random sample of each batch.
total = 600000  # expected number of examples in the JSONL file
batch_size = 512
batch_num = total // batch_size
arr_len = batch_size * 4 * 1024
sample_rate = 0.1
sample_num = int(arr_len * sample_rate)

dataset = Dataset(file_path)
data = np.array([])

for i in tqdm(range(batch_num)):
    batch = dataset.get_batch(i, batch_size)
    batch = prepare_batch(batch, device="cpu")
    arr = batch.flatten().numpy()
    # Sample indices without replacement so each batch contributes sample_num values.
    sampled = np.random.choice(arr.shape[0], size=sample_num, replace=False)
    data = np.concatenate((data, arr[sampled]), axis=0) if data.size else arr[sampled]
    if i % 10 == 0:
        # Checkpoint the accumulated sample every 10 batches.
        np.save('embedding_range.npy', data)

np.save('embedding_range.npy', data)
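
# Hedged sketch, not part of the original pipeline: matplotlib is imported above,
# presumably to inspect the sampled value range. A minimal histogram of the saved
# sample might look like the following; the bin count and output filename are
# assumptions, not values from the original script.
values = np.load('embedding_range.npy')
plt.hist(values, bins=200)
plt.title('Sampled embedding value distribution')
plt.xlabel('value')
plt.ylabel('count')
plt.savefig('embedding_range_hist.png')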