From 11ec3e829576024aa4e23bdd6a03a71766da96b0 Mon Sep 17 00:00:00 2001 From: alikia2x Date: Fri, 7 Mar 2025 04:28:44 +0800 Subject: [PATCH] update: adjust LR, grad accumulate --- filter/RunningLogs.txt | 4 +- filter/embedding.py | 8 +-- filter/modelV6_3.py | 112 +++++++++++++++++++++++++++++++++++++++++ filter/predict.py | 6 +-- filter/train.py | 34 +++++++++---- 5 files changed, 145 insertions(+), 19 deletions(-) create mode 100644 filter/modelV6_3.py diff --git a/filter/RunningLogs.txt b/filter/RunningLogs.txt index 8d76c33..ba7b286 100644 --- a/filter/RunningLogs.txt +++ b/filter/RunningLogs.txt @@ -33,4 +33,6 @@ Note 0128: V6.1 # Transformer模型(交叉熵损失) 0219: V6.1 # MPS训练 0242: V6.1 # 自定义loss -0259: V6.1 # 调整学习率 \ No newline at end of file +0259: V6.1 # 调整学习率 (自定义loss) +0314: V6.1 # 调整学习率(交叉熵损失) +0349: V6.3 # 增加层数至2 \ No newline at end of file diff --git a/filter/embedding.py b/filter/embedding.py index 91a93ea..f198a9f 100644 --- a/filter/embedding.py +++ b/filter/embedding.py @@ -32,7 +32,7 @@ def prepare_batch(batch_data, device="cpu"): import onnxruntime as ort -def prepare_batch_per_token(session, tokenizer, batch_data, device = 'cpu', max_length=1024): +def prepare_batch_per_token(session, tokenizer, batch_data, device = 'cpu', max_length=1024, embedding_dim=256): """ 将输入的 batch_data 转换为模型所需的输入格式 [batch_size, num_channels, seq_length, embedding_dim]。 @@ -49,16 +49,16 @@ def prepare_batch_per_token(session, tokenizer, batch_data, device = 'cpu', max_ """ batch_size = len(batch_data["title"]) - batch_tensor = torch.zeros(batch_size, 3, max_length, 256, device=device) + batch_tensor = torch.zeros(batch_size, 3, max_length, embedding_dim, device=device) for i in range(batch_size): - channel_embeddings = torch.zeros((3, 1024, 256), device=device) + channel_embeddings = torch.zeros((3, 1024, embedding_dim), device=device) for j, channel in enumerate(["title", "description", "tags"]): # 获取当前通道的文本 text = batch_data[channel][i] encoded_inputs = tokenizer(text, 
import torch
import torch.nn as nn


class VideoClassifierV6_3(nn.Module):
    """Three-channel sequence classifier (title / description / tags).

    Each channel owns an independent stack: a ``Linear -> GELU -> LayerNorm``
    projection from ``embedding_dim`` to ``hidden_dim``, followed by
    ``num_layers`` repetitions of
    ``MultiheadAttention -> LayerNorm -> Linear -> GELU -> Linear -> LayerNorm``.
    The blocks are applied strictly in sequence (there are no residual
    connections). Every channel is mean-pooled over the sequence axis, scaled
    by a learnable sigmoid gate, concatenated, and fed to an MLP head.

    Args:
        embedding_dim: Size of each incoming token embedding.
        hidden_dim: Width used inside every channel stack.
        output_dim: Number of logits produced by the head.
        num_heads: Attention heads per ``MultiheadAttention`` layer.
        num_layers: Number of attention + feed-forward blocks per channel.
        attn_dropout: Dropout probability inside the attention layers.
        fc_dropout: Dropout probability inside the MLP head.
    """

    def __init__(self, embedding_dim=72, hidden_dim=256, output_dim=3,
                 num_heads=4, num_layers=2, attn_dropout=0.1, fc_dropout=0.2):
        super().__init__()
        self.num_channels = 3
        self.channel_names = ['title', 'description', 'tags']
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # One independent processing stack per channel. The layer order inside
        # each stack is fixed, so state_dict keys stay stable across versions.
        self.channel_processors = nn.ModuleList([
            self._build_channel_processor(
                embedding_dim, hidden_dim, num_heads, num_layers, attn_dropout
            )
            for _ in range(self.num_channels)
        ])

        # Learnable per-channel weights; squashed with a sigmoid in forward().
        self.channel_weights = nn.Parameter(torch.ones(self.num_channels))

        # Classification head over the concatenated channel vectors.
        self.fc = nn.Sequential(
            nn.Linear(self.num_channels * hidden_dim, 1024),
            nn.BatchNorm1d(1024),
            nn.Dropout(fc_dropout),
            nn.GELU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.Dropout(fc_dropout),
            nn.GELU(),
            nn.Linear(512, output_dim),
        )

        self._init_weights()

    @staticmethod
    def _build_channel_processor(embedding_dim, hidden_dim, num_heads,
                                 num_layers, attn_dropout):
        """Build the layer stack for a single channel.

        The result is an ``nn.Sequential`` used purely as an ordered container:
        ``forward`` walks it manually because ``MultiheadAttention`` needs
        explicit query/key/value arguments.
        """
        layers = [
            # Project the input embeddings up to hidden_dim first.
            nn.Linear(embedding_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
        ]
        for _ in range(num_layers):
            layers.extend([
                # Self-attention operating on hidden_dim-sized tokens.
                nn.MultiheadAttention(
                    embed_dim=hidden_dim,
                    num_heads=num_heads,
                    dropout=attn_dropout,
                ),
                nn.LayerNorm(hidden_dim),
                # Position-wise feed-forward part.
                nn.Linear(hidden_dim, hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.LayerNorm(hidden_dim),
            ])
        return nn.Sequential(*layers)

    def _init_weights(self):
        """Xavier-initialise linear/attention weights and reset biases and norms."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.MultiheadAttention):
                # Covers the packed input projection and the output projection.
                for name, param in m.named_parameters():
                    if "in_proj" in name or "out_proj" in name:
                        if "weight" in name:
                            nn.init.xavier_uniform_(param)
                        elif "bias" in name:
                            nn.init.zeros_(param)
            elif isinstance(m, nn.LayerNorm):
                nn.init.ones_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, channel_features: torch.Tensor) -> torch.Tensor:
        """Compute class logits for a batch.

        Args:
            channel_features: Tensor of shape
                ``[batch_size, num_channels, seq_length, embedding_dim]``.

        Returns:
            Tensor of shape ``[batch_size, output_dim]``.

        Raises:
            ValueError: If the input is not 4-D, or its channel or embedding
                dimension does not match this model's configuration.
        """
        if channel_features.dim() != 4:
            raise ValueError(
                "channel_features must have shape [batch, channels, seq, embed]; "
                f"got {channel_features.dim()} dimension(s)"
            )
        # Without this check, surplus channels would be silently dropped and a
        # wrong embedding width would only surface as a low-level matmul error.
        if channel_features.size(1) != self.num_channels:
            raise ValueError(
                f"expected {self.num_channels} channels, "
                f"got {channel_features.size(1)}"
            )
        if channel_features.size(-1) != self.embedding_dim:
            raise ValueError(
                f"expected embedding_dim={self.embedding_dim}, "
                f"got {channel_features.size(-1)}"
            )

        batch_size = channel_features.size(0)
        pooled_channels = []

        for processor, c_data in zip(self.channel_processors,
                                     channel_features.unbind(dim=1)):
            # The attention layers are sequence-first: [B, S, E] -> [S, B, E].
            x = c_data.permute(1, 0, 2)

            for layer in processor:
                if isinstance(layer, nn.MultiheadAttention):
                    # Self-attention: query, key and value are the same tensor.
                    # Only the output is used, so skip the attention-weight map.
                    x = layer(x, x, x, need_weights=False)[0]
                else:
                    x = layer(x)

            # Global average pooling over the sequence axis -> [B, hidden_dim].
            pooled_channels.append(x.mean(dim=0))

        # [B, num_channels, hidden_dim]
        stacked = torch.stack(pooled_channels, dim=1)

        # Sigmoid keeps every channel gate inside (0, 1).
        gates = torch.sigmoid(self.channel_weights).view(1, -1, 1)

        # Flatten the gated channels into one feature vector per sample.
        combined = (stacked * gates).reshape(batch_size, -1)

        return self.fc(combined)
- model = VideoClassifierV3_9() - model.load_state_dict(torch.load('./filter/checkpoints/best_model_V3.9.pt', map_location=torch.device('cpu'))) + model = VideoClassifierV3_10() + model.load_state_dict(torch.load('./filter/checkpoints/best_model_V3.11.pt', map_location=torch.device('cpu'))) model.eval() st_model = SentenceTransformer("Thaweewat/jina-embedding-v3-m2v-1024") diff --git a/filter/train.py b/filter/train.py index e206455..41eee4e 100644 --- a/filter/train.py +++ b/filter/train.py @@ -5,7 +5,6 @@ from torch.utils.data import DataLoader import torch.optim as optim from dataset import MultiChannelDataset from filter.modelV6_1 import VideoClassifierV6_1 -from filter.modelV3_15 import AdaptiveRecallLoss from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, classification_report import os import torch @@ -38,9 +37,13 @@ if not os.path.exists(test_file): test_dataset = MultiChannelDataset(test_file, mode='test') # 创建DataLoader -train_loader = DataLoader(train_dataset, batch_size=24, shuffle=True) -eval_loader = DataLoader(eval_dataset, batch_size=24, shuffle=False) -test_loader = DataLoader(test_dataset, batch_size=24, shuffle=False) +batch_size = 24 +accu_steps = 3 +real_bs = batch_size // accu_steps + +train_loader = DataLoader(train_dataset, batch_size=real_bs, shuffle=True) +eval_loader = DataLoader(eval_dataset, batch_size=real_bs, shuffle=False) +test_loader = DataLoader(test_dataset, batch_size=real_bs, shuffle=False) train_labels = [] for batch in train_loader: @@ -52,18 +55,18 @@ print(f"Using device: {device}") # 计算自适应类别权重 class_counts = np.bincount(train_labels) median_freq = np.median(class_counts) -class_weights = torch.tensor( +class_weights = torch.tensor( [median_freq / count for count in class_counts], dtype=torch.float32, device=device ) model = VideoClassifierV6_1().to(device) -checkpoint_name = './filter/checkpoints/best_model_V6.2-mps-adloss.pt' +checkpoint_name = './filter/checkpoints/best_model_V6.3.pt' # 
初始化tokenizer和embedding模型 tokenizer = AutoTokenizer.from_pretrained("alikia2x/jina-embedding-v3-m2v-1024") -session = ort.InferenceSession("./model/embedding_256/onnx/model.onnx") +session = ort.InferenceSession("./model/embedding_72/onnx/model.onnx") # 模型保存路径 os.makedirs('./filter/checkpoints', exist_ok=True) @@ -73,9 +76,9 @@ eval_interval = 20 num_epochs = 20 total_steps = samples_count * num_epochs / train_loader.batch_size warmup_rate = 0.1 -optimizer = optim.AdamW(model.parameters(), lr=6e-5, weight_decay=1e-5) +optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=1e-5) cosine_annealing_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps - int(total_steps * warmup_rate)) -warmup_scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.14, end_factor=1.0, total_iters=int(total_steps * warmup_rate)) +warmup_scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.4, end_factor=1.0, total_iters=int(total_steps * warmup_rate)) scheduler = optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup_scheduler, cosine_annealing_scheduler], milestones=[int(total_steps * warmup_rate)]) criterion = nn.CrossEntropyLoss(weight=class_weights).to(device) @@ -89,7 +92,7 @@ def evaluate(model, dataloader): with torch.no_grad(): for batch in dataloader: - batch_tensor = prepare_batch_per_token(session, tokenizer, batch['texts']).to(device) + batch_tensor = prepare_batch_per_token(session, tokenizer, batch['texts'], embedding_dim=72).to(device) logits = model(batch_tensor) preds = torch.argmax(logits, dim=1) all_preds.extend(preds.cpu().numpy()) @@ -121,7 +124,7 @@ for epoch in range(num_epochs): optimizer.zero_grad() - batch_tensor = prepare_batch_per_token(session, tokenizer, batch['texts']).to(device) + batch_tensor = prepare_batch_per_token(session, tokenizer, batch['texts'], embedding_dim=72).to(device) logits = model(batch_tensor) @@ -129,6 +132,10 @@ for epoch in range(num_epochs): loss.backward() 
optimizer.step() epoch_loss += loss.item() + + # 梯度累积 + if (batch_idx + 1) % accu_steps != 0: + continue # 记录训练损失 writer.add_scalar('Train/Loss', loss.item(), step) @@ -157,6 +164,11 @@ for epoch in range(num_epochs): scheduler.step() writer.add_scalar('Train/LR', scheduler.get_last_lr()[0], step) + # 处理最后一个未满累积步数的batch + if (batch_idx + 1) % accu_steps != 0: + optimizer.step() + optimizer.zero_grad() + # 记录每个 epoch 的平均训练损失 avg_epoch_loss = epoch_loss / len(train_loader) writer.add_scalar('Train/Epoch_Loss', avg_epoch_loss, epoch)