From f488c3ceda8af7215ef7f3c1114211e86b5576fb Mon Sep 17 00:00:00 2001 From: alikia2x Date: Sun, 2 Mar 2025 23:45:32 +0800 Subject: [PATCH] add: filter model based on cascade classifier (V3.12) --- filter/RunningLogs.txt | 3 +- filter/embedding.py | 63 +++++++++++++++++++++--- filter/modelV3_12.py | 79 ++++++++++++++++++++++++++++++ filter/modelV3_9.py | 107 ----------------------------------------- filter/modelV6_0.py | 18 +++---- filter/train.py | 24 ++++----- 6 files changed, 154 insertions(+), 140 deletions(-) create mode 100644 filter/modelV3_12.py delete mode 100644 filter/modelV3_9.py diff --git a/filter/RunningLogs.txt b/filter/RunningLogs.txt index 45b8c6a..e6e5092 100644 --- a/filter/RunningLogs.txt +++ b/filter/RunningLogs.txt @@ -22,4 +22,5 @@ Note 1918: V3.9 2308: V3.11 2243: V3.11 # 256维嵌入 -2253: V3.11 # 1024维度嵌入(对比) \ No newline at end of file +2253: V3.11 # 1024维度嵌入(对比) +2337: V3.12 # 级联分类 diff --git a/filter/embedding.py b/filter/embedding.py index 6ef055d..7e9dfc6 100644 --- a/filter/embedding.py +++ b/filter/embedding.py @@ -1,3 +1,4 @@ +import numpy as np import torch from model2vec import StaticModel @@ -30,6 +31,10 @@ def prepare_batch(batch_data, device="cpu"): batch_tensor = torch.stack(channel_embeddings, dim=1) # 在 dim=1 上堆叠 return batch_tensor +import onnxruntime as ort +from transformers import AutoTokenizer +from itertools import accumulate + def prepare_batch_per_token(batch_data, max_length=1024): """ 将输入的 batch_data 转换为模型所需的输入格式 [batch_size, num_channels, seq_length, embedding_dim]。 @@ -46,18 +51,60 @@ def prepare_batch_per_token(batch_data, max_length=1024): 返回: torch.Tensor: 形状为 [batch_size, num_channels, seq_length, embedding_dim] 的张量。 """ + # 初始化 tokenizer 和 ONNX 模型 + tokenizer = AutoTokenizer.from_pretrained("alikia2x/jina-embedding-v3-m2v-1024") + session = ort.InferenceSession("./model/embedding_256/onnx/model.onnx") + # 1. 对每个通道的文本分别编码 channel_embeddings = [] - model = StaticModel.from_pretrained("./model/embedding_256/") for channel in ["title", "description", "tags", "author_info"]: texts = batch_data[channel] # 获取当前通道的文本列表 - # 使用tokenizer将文本转换为tokens - encoded_input = model.tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt') - with torch.no_grad(): - model_output = model.model(**encoded_input) - # 提取最后一个隐藏层的结果 - embeddings = model_output.last_hidden_state.to(torch.float32) # 将embeddings 放在指定device上 - channel_embeddings.append(embeddings) + + # Step 1: 生成 input_ids 和 offsets + # 对每个文本单独编码,保留原始 token 长度 + encoded_inputs = [tokenizer(text, truncation=True, max_length=max_length, return_tensors='np') for text in texts] + + # 提取每个文本的 input_ids 长度(考虑实际的 token 数量) + input_ids_lengths = [len(enc["input_ids"][0]) for enc in encoded_inputs] + + # 生成 offsets: [0, len1, len1+len2, ...] + offsets = list(accumulate([0] + input_ids_lengths[:-1])) # 累积和,排除最后一个长度 + + # 将所有 input_ids 展平为一维数组 + flattened_input_ids = np.concatenate([enc["input_ids"][0] for enc in encoded_inputs], axis=0).astype(np.int64) + + # Step 2: 构建 ONNX 输入 + inputs = { + "input_ids": ort.OrtValue.ortvalue_from_numpy(flattened_input_ids), + "offsets": ort.OrtValue.ortvalue_from_numpy(np.array(offsets, dtype=np.int64)) + } + + # Step 3: 运行 ONNX 模型 + embeddings = session.run(None, inputs)[0] # 假设输出名为 "embeddings" + + # Step 4: 将输出重塑为 [batch_size, seq_length, embedding_dim] + # 注意:这里假设 ONNX 输出的形状是 [total_tokens, embedding_dim] + # 需要根据实际序列长度重新分组 + batch_size = len(texts) + embeddings_split = np.split(embeddings, np.cumsum(input_ids_lengths[:-1])) + padded_embeddings = [] + for emb, seq_len in zip(embeddings_split, input_ids_lengths): + # 对每个序列填充到 max_length + if seq_len > max_length: + # 如果序列长度超过 max_length,截断 + emb = emb[:max_length] + pad_length = 0 + else: + # 否则填充到 max_length + pad_length = max_length - seq_len + + # 填充到 [max_length, embedding_dim] + padded = np.pad(emb, ((0, pad_length), (0, 0)), mode='constant') + padded_embeddings.append(padded) + + # 确保所有填充后的序列形状一致 + embeddings_tensor = torch.tensor(np.stack(padded_embeddings), dtype=torch.float32) + channel_embeddings.append(embeddings_tensor) # 2. 将编码结果堆叠为 [batch_size, num_channels, seq_length, embedding_dim] batch_tensor = torch.stack(channel_embeddings, dim=1) diff --git a/filter/modelV3_12.py b/filter/modelV3_12.py new file mode 100644 index 0000000..49d5779 --- /dev/null +++ b/filter/modelV3_12.py @@ -0,0 +1,79 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class VideoClassifierV3_12(nn.Module): + def __init__(self, embedding_dim=1024, hidden_dim=648): + super().__init__() + self.num_channels = 4 + self.channel_names = ['title', 'description', 'tags', 'author_info'] + + # 可学习温度系数 + self.temperature = nn.Parameter(torch.tensor(1.7)) + + # 带约束的通道权重(使用Sigmoid替代Softmax) + self.channel_weights = nn.Parameter(torch.ones(self.num_channels)) + + # 第一个二分类器:0 vs 1/2 + self.first_classifier = nn.Sequential( + nn.Linear(embedding_dim * self.num_channels, hidden_dim*2), + nn.BatchNorm1d(hidden_dim*2), + nn.Dropout(0.2), + nn.GELU(), + nn.Linear(hidden_dim*2, 2) # 输出为2类:0 vs 1/2 + ) + + # 第二个二分类器:1 vs 2 + self.second_classifier = nn.Sequential( + nn.Linear(embedding_dim * self.num_channels, hidden_dim*2), + nn.BatchNorm1d(hidden_dim*2), + nn.Dropout(0.2), + nn.GELU(), + nn.Linear(hidden_dim*2, 2) # 输出为2类:1 vs 2 + ) + + # 权重初始化 + self._init_weights() + + def _init_weights(self): + for layer in self.first_classifier: + if isinstance(layer, nn.Linear): + nn.init.kaiming_normal_(layer.weight, nonlinearity='relu') + nn.init.zeros_(layer.bias) + + for layer in self.second_classifier: + if isinstance(layer, nn.Linear): + nn.init.kaiming_normal_(layer.weight, nonlinearity='relu') + nn.init.zeros_(layer.bias) + + def forward(self, channel_features: torch.Tensor): + """ + 输入格式: [batch_size, num_channels, embedding_dim] + 输出格式: [batch_size, output_dim] + """ + # 自适应通道权重(Sigmoid约束) + weights = torch.sigmoid(self.channel_weights) # [0,1]范围 + weighted_features = channel_features * weights.unsqueeze(0).unsqueeze(-1) + + # 特征拼接 + combined = weighted_features.view(weighted_features.size(0), -1) + + # 第一个二分类器:0 vs 1/2 + first_output = self.first_classifier(combined) + first_probs = F.softmax(first_output, dim=1) + + # 第二个二分类器:1 vs 2 + second_output = self.second_classifier(combined) + second_probs = F.softmax(second_output, dim=1) + + # 合并结果 + final_probs = torch.zeros(channel_features.size(0), 3).to(channel_features.device) + final_probs[:, 0] = first_probs[:, 0] # 类别0的概率 + final_probs[:, 1] = first_probs[:, 1] * second_probs[:, 0] # 类别1的概率 + final_probs[:, 2] = first_probs[:, 1] * second_probs[:, 1] # 类别2的概率 + + return final_probs + + def get_channel_weights(self): + """获取各通道权重(带温度调节)""" + return torch.softmax(self.channel_weights / self.temperature, dim=0).detach().cpu().numpy() diff --git a/filter/modelV3_9.py b/filter/modelV3_9.py deleted file mode 100644 index 48bdc57..0000000 --- a/filter/modelV3_9.py +++ /dev/null @@ -1,107 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F - -class VideoClassifierV3_9(nn.Module): - def __init__(self, embedding_dim=1024, hidden_dim=648, output_dim=3): - super().__init__() - self.num_channels = 4 - self.channel_names = ['title', 'description', 'tags', 'author_info'] - - # 可学习温度系数 - self.temperature = nn.Parameter(torch.tensor(1.7)) - - # 带约束的通道权重(使用Sigmoid替代Softmax) - self.channel_weights = nn.Parameter(torch.ones(self.num_channels)) - - # 增强的非线性层 - self.fc = nn.Sequential( - nn.Linear(embedding_dim * self.num_channels, hidden_dim*2), - nn.BatchNorm1d(hidden_dim*2), - nn.Dropout(0.2), - nn.GELU(), - nn.Linear(hidden_dim*2, output_dim) - ) - - # 权重初始化 - self._init_weights() - - def _init_weights(self): - for layer in self.fc: - if isinstance(layer, nn.Linear): - # 使用ReLU的初始化参数(GELU的近似) - nn.init.kaiming_normal_(layer.weight, nonlinearity='relu') # 修改这里 - - # 或者使用Xavier初始化(更适合通用场景) - # nn.init.xavier_normal_(layer.weight, gain=nn.init.calculate_gain('relu')) - - nn.init.zeros_(layer.bias) - - - def forward(self, input_texts, sentence_transformer): - # 合并文本进行批量编码 - all_texts = [text for channel in self.channel_names for text in input_texts[channel]] - - # 冻结的文本编码 - with torch.no_grad(): - embeddings = torch.tensor( - sentence_transformer.encode(all_texts), - device=next(self.parameters()).device - ) - - # 分割并加权通道特征 - split_sizes = [len(input_texts[name]) for name in self.channel_names] - channel_features = torch.split(embeddings, split_sizes, dim=0) - channel_features = torch.stack(channel_features, dim=1) - - # 自适应通道权重(Sigmoid约束) - weights = torch.sigmoid(self.channel_weights) # [0,1]范围 - weighted_features = channel_features * weights.unsqueeze(0).unsqueeze(-1) - - # 特征拼接 - combined = weighted_features.view(weighted_features.size(0), -1) - - return self.fc(combined) - - def get_channel_weights(self): - """获取各通道权重(带温度调节)""" - return torch.softmax(self.channel_weights / self.temperature, dim=0).detach().cpu().numpy() - - -class AdaptiveRecallLoss(nn.Module): - def __init__(self, class_weights, alpha=0.8, gamma=2.0, fp_penalty=0.5): - """ - Args: - class_weights (torch.Tensor): 类别权重 - alpha (float): 召回率调节因子(0-1) - gamma (float): Focal Loss参数 - fp_penalty (float): 类别0假阳性惩罚强度 - """ - super().__init__() - self.class_weights = class_weights - self.alpha = alpha - self.gamma = gamma - self.fp_penalty = fp_penalty - - def forward(self, logits, targets): - # 基础交叉熵损失 - ce_loss = F.cross_entropy(logits, targets, weight=self.class_weights, reduction='none') - - # Focal Loss组件 - pt = torch.exp(-ce_loss) - focal_loss = ((1 - pt) ** self.gamma) * ce_loss - - # 召回率增强(对困难样本加权) - class_mask = F.one_hot(targets, num_classes=len(self.class_weights)) - class_weights = (self.alpha + (1 - self.alpha) * pt.unsqueeze(-1)) * class_mask - recall_loss = (class_weights * focal_loss.unsqueeze(-1)).sum(dim=1) - - # 类别0假阳性惩罚 - probs = F.softmax(logits, dim=1) - fp_mask = (targets != 0) & (torch.argmax(logits, dim=1) == 0) - fp_loss = self.fp_penalty * probs[:, 0][fp_mask].pow(2).sum() - - # 总损失 - total_loss = recall_loss.mean() + fp_loss / len(targets) - - return total_loss \ No newline at end of file diff --git a/filter/modelV6_0.py b/filter/modelV6_0.py index 34cc345..32502fa 100644 --- a/filter/modelV6_0.py +++ b/filter/modelV6_0.py @@ -26,10 +26,14 @@ class VideoClassifierV6_0(nn.Module): nn.Conv2d(128, 256, kernel_size=(3, 3), padding=1), nn.BatchNorm2d(256), nn.GELU(), + + # 全局平均池化层 + # 输出形状为 [batch_size, 256, 1, 1] + nn.AdaptiveAvgPool2d((1, 1)) ) - # 计算卷积后的特征维度 - self.feature_dim = self._get_conv_output_size(seq_length, embedding_dim) + # 全局池化后的特征维度固定为 256 + self.feature_dim = 256 # 全连接层 self.fc = nn.Sequential( @@ -42,12 +46,6 @@ class VideoClassifierV6_0(nn.Module): self._init_weights() - def _get_conv_output_size(self, seq_length, embedding_dim): - # 用于计算卷积输出尺寸 - x = torch.zeros(1, self.num_channels, seq_length, embedding_dim) - x = self.conv_layers(x) - return x.view(1, -1).size(1) - def _init_weights(self): for module in self.modules(): if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear): @@ -63,8 +61,8 @@ class VideoClassifierV6_0(nn.Module): # CNN特征提取 conv_features = self.conv_layers(channel_features) - # 展平特征 - flat_features = conv_features.view(conv_features.size(0), -1) + # 展平特征(全局池化后形状为 [batch_size, 256, 1, 1]) + flat_features = conv_features.view(conv_features.size(0), -1) # [batch_size, 256] # 全连接层分类 return self.fc(flat_features) diff --git a/filter/train.py b/filter/train.py index 8a18923..065d38e 100644 --- a/filter/train.py +++ b/filter/train.py @@ -4,16 +4,16 @@ import numpy as np from torch.utils.data import DataLoader import torch.optim as optim from dataset import MultiChannelDataset -from filter.modelV6_0 import VideoClassifierV6_0, AdaptiveRecallLoss +from filter.modelV3_12 import VideoClassifierV3_12 from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report import os import torch -from torch.utils.tensorboard import SummaryWriter # 引入 TensorBoard +from torch.utils.tensorboard import SummaryWriter import time -from embedding import prepare_batch_per_token +from embedding import prepare_batch +import torch.nn as nn -# 动态生成子目录名称 run_name = f"run_{time.strftime('%Y%m%d_%H%M')}" log_dir = os.path.join('./filter/runs', run_name) @@ -51,20 +51,16 @@ class_weights = torch.tensor( ) # 初始化模型和SentenceTransformer -model = VideoClassifierV6_0() -checkpoint_name = './filter/checkpoints/best_model_V6.0.pt' +model = VideoClassifierV3_12() +checkpoint_name = './filter/checkpoints/best_model_V3.12.pt' # 模型保存路径 os.makedirs('./filter/checkpoints', exist_ok=True) # 优化器 optimizer = optim.AdamW(model.parameters(), lr=4e-4) -criterion = AdaptiveRecallLoss( - class_weights=class_weights, - alpha=0.9, # 召回率权重 - gamma=1.6, # 困难样本聚焦 - fp_penalty=0.8 # 假阳性惩罚强度 -) +# Cross entropy loss +criterion = nn.CrossEntropyLoss() def count_trainable_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) @@ -76,7 +72,7 @@ def evaluate(model, dataloader): with torch.no_grad(): for batch in dataloader: - batch_tensor = prepare_batch_per_token(batch['texts'], max_length=1024) + batch_tensor = prepare_batch(batch['texts']) logits = model(batch_tensor) preds = torch.argmax(logits, dim=1) all_preds.extend(preds.cpu().numpy()) @@ -109,7 +105,7 @@ for epoch in range(num_epochs): for batch_idx, batch in enumerate(train_loader): optimizer.zero_grad() - batch_tensor = prepare_batch_per_token(batch['texts'], max_length=1024) + batch_tensor = prepare_batch(batch['texts']) logits = model(batch_tensor)