add: the pred model prototype

alikia2x (寒寒) 2025-03-12 03:51:39 +08:00
parent 98e063e86c
commit 296e4ef4d6
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
5 changed files with 476 additions and 4 deletions

pred/1 (4 changes)

@@ -8,14 +8,12 @@
1951059019
799277283
844610791
-113469781180858
1706212240
339432
243913657
16576108
583566710
802536340
-112977722344665
2976394
8321047
261045912
@@ -24,7 +22,6 @@
316228425
257550414
242976248
-113793849892846
9230106
517962327
752662232
@@ -63,7 +60,6 @@
651329836
875035826
469433434
-113418006693380
58814955
33044780
946091445

pred/count.py (new file, 12 additions)

@@ -0,0 +1,12 @@
# Count the total number of records across all JSON files in ./data/pred
import os
import json

count = 0
for filename in os.listdir('./data/pred'):
    if filename.endswith('.json'):
        with open(os.path.join('./data/pred', filename), 'r') as f:
            data = json.load(f)
            count += len(data)
print(count)

pred/dataset.py (new file, 206 additions)

@@ -0,0 +1,206 @@
import os
import json
import random
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from datetime import datetime
import torch


class VideoPlayDataset(Dataset):
    def __init__(self, data_dir, publish_time_path,
                 min_seq_len=6, max_seq_len=200,
                 min_forecast_span=60, max_forecast_span=604800):
        """
        Improved dataset class that supports irregularly spaced time series.
        :param data_dir: directory containing the JSON files
        :param publish_time_path: path to the publish-time CSV
        :param min_seq_len: minimum number of historical data points
        :param max_seq_len: maximum number of historical data points
        :param min_forecast_span: minimum forecast horizon (seconds)
        :param max_forecast_span: maximum forecast horizon (seconds)
        """
        self.data_dir = data_dir
        self.min_seq_len = min_seq_len
        self.max_seq_len = max_seq_len
        self.min_forecast_span = min_forecast_span
        self.max_forecast_span = max_forecast_span
        self.series_dict = self._load_and_process_data(data_dir, publish_time_path)
        self.valid_series = self._generate_valid_series()

    def _load_and_process_data(self, data_dir, publish_time_path):
        # Load the publish-time data
        publish_df = pd.read_csv(publish_time_path)
        publish_df['published_at'] = pd.to_datetime(publish_df['published_at'])
        publish_dict = dict(zip(publish_df['aid'], publish_df['published_at']))

        # Load and process the JSON data
        series_dict = {}
        for filename in os.listdir(data_dir):
            if not filename.endswith('.json'):
                continue
            filepath = os.path.join(data_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
                for item in json_data:
                    aid = item['aid']
                    if aid not in publish_dict:
                        continue
                    # Compute time relative to the publish time
                    added_time = datetime.fromtimestamp(item['added'])
                    published_time = publish_dict[aid]
                    rel_time = (added_time - published_time).total_seconds()
                    # Group the records by video
                    if aid not in series_dict:
                        series_dict[aid] = {
                            'abs_time': [],
                            'rel_time': [],
                            'play_count': []
                        }
                    series_dict[aid]['abs_time'].append(item['added'])
                    series_dict[aid]['rel_time'].append(rel_time)
                    series_dict[aid]['play_count'].append(item['view'])

        # Sort each series by time and compute the sampling gaps
        for aid in series_dict:
            # Sort chronologically
            sorted_idx = np.argsort(series_dict[aid]['abs_time'])
            for key in ['abs_time', 'rel_time', 'play_count']:
                series_dict[aid][key] = np.array(series_dict[aid][key])[sorted_idx]
            # Time-gap features
            abs_time_arr = series_dict[aid]['abs_time']
            time_deltas = np.diff(abs_time_arr, prepend=abs_time_arr[0])
            series_dict[aid]['time_delta'] = time_deltas
        return series_dict

    def _generate_valid_series(self):
        # Build the list of usable series
        valid_series = []
        for aid in self.series_dict:
            series = self.series_dict[aid]
            n_points = len(series['play_count'])
            # Skip videos with too few data points
            if n_points < self.min_seq_len + 1:
                continue
            valid_series.append({
                'aid': aid,
                'length': n_points,
                'abs_time': series['abs_time'],
                'rel_time': series['rel_time'],
                'play_count': series['play_count'],
                'time_delta': series['time_delta']
            })
        return valid_series

    def __len__(self):
        return sum(s['length'] - self.min_seq_len for s in self.valid_series)

    def __getitem__(self, idx):
        # Sample a video series at random (idx is intentionally ignored)
        series = random.choice(self.valid_series)
        max_start = series['length'] - self.min_seq_len - 1
        start_idx = random.randint(0, max_start)
        # Pick the history-window length dynamically
        seq_len = random.randint(self.min_seq_len, min(self.max_seq_len, series['length'] - start_idx - 1))
        end_idx = start_idx + seq_len

        # Extract the features of the history window
        hist_slice = slice(start_idx, end_idx)
        x_play = np.log1p(series['play_count'][hist_slice])
        x_abs_time = series['abs_time'][hist_slice]
        x_rel_time = series['rel_time'][hist_slice]
        x_time_delta = series['time_delta'][hist_slice]

        # Build the prediction target (dynamic forecast horizon)
        forecast_span = random.randint(self.min_forecast_span, self.max_forecast_span)
        target_time = x_abs_time[-1] + forecast_span

        # Find the actual target point (handles gaps in the data)
        future_times = series['abs_time'][end_idx:]
        future_plays = series['play_count'][end_idx:]
        # First point at or beyond the target time
        target_idx = np.searchsorted(future_times, target_time)
        if target_idx >= len(future_times):
            # Past the end of the data: fall back to the last point
            y_play = future_plays[-1] if len(future_plays) > 0 else x_play[-1]
            actual_span = future_times[-1] - x_abs_time[-1] if len(future_times) > 0 else self.max_forecast_span
        else:
            y_play = future_plays[target_idx]
            actual_span = future_times[target_idx] - x_abs_time[-1]
        y_play_val = np.log1p(y_play)

        # Assemble the time-related features
        time_features = np.stack([
            x_abs_time,
            x_rel_time,
            x_time_delta,
            np.log1p(x_time_delta),  # log transform for the long-tailed distribution
            (x_time_delta > 3600).astype(float)  # whether the gap exceeds one hour
        ], axis=-1)

        return {
            'x_play': torch.FloatTensor(x_play),
            'x_time_feat': torch.FloatTensor(time_features),
            'y_play': torch.FloatTensor([y_play_val]),
            'forecast_span': torch.FloatTensor([actual_span])
        }


def collate_fn(batch):
    """Pad variable-length sequences within a batch."""
    max_len = max(item['x_play'].shape[0] for item in batch)
    padded_batch = {
        'x_play': [],
        'x_time_feat': [],
        'y_play': [],
        'forecast_span': [],
        'padding_mask': []
    }
    for item in batch:
        seq_len = item['x_play'].shape[0]
        pad_len = max_len - seq_len
        # Pad the play-count sequence
        padded_play = torch.cat([
            item['x_play'],
            torch.zeros(pad_len)
        ])
        padded_batch['x_play'].append(padded_play)
        # Pad the time features
        padded_time_feat = torch.cat([
            item['x_time_feat'],
            torch.zeros(pad_len, item['x_time_feat'].shape[1])
        ])
        padded_batch['x_time_feat'].append(padded_time_feat)
        # Build the padding mask (True marks real positions)
        mask = torch.cat([
            torch.ones(seq_len),
            torch.zeros(pad_len)
        ])
        padded_batch['padding_mask'].append(mask.bool())
        # Remaining fields
        padded_batch['y_play'].append(item['y_play'])
        padded_batch['forecast_span'].append(item['forecast_span'])
    # Convert the lists to tensors
    padded_batch['x_play'] = torch.stack(padded_batch['x_play'])
    padded_batch['x_time_feat'] = torch.stack(padded_batch['x_time_feat'])
    padded_batch['y_play'] = torch.stack(padded_batch['y_play'])
    padded_batch['forecast_span'] = torch.stack(padded_batch['forecast_span'])
    padded_batch['padding_mask'] = torch.stack(padded_batch['padding_mask'])
    return padded_batch
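
For reference, a minimal smoke-test sketch of how VideoPlayDataset and collate_fn fit together, assuming ./data/pred holds the crawl JSON files and publish_time.csv (the layout count.py and train.py use):

# Sketch: inspect one padded batch from the dataset (paths are assumptions).
from torch.utils.data import DataLoader
from dataset import VideoPlayDataset, collate_fn

ds = VideoPlayDataset('./data/pred', './data/pred/publish_time.csv')
loader = DataLoader(ds, batch_size=4, collate_fn=collate_fn)
batch = next(iter(loader))
# Expected: x_play (4, max_len), x_time_feat (4, max_len, 5),
# y_play and forecast_span (4, 1), padding_mask (4, max_len)
for k, v in batch.items():
    print(k, tuple(v.shape))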

pred/model.py (new file, 182 additions)

@@ -0,0 +1,182 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class TimeEmbedding(nn.Module):
    """Time-feature encoding module"""
    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.norm = nn.LayerNorm(5)
        # Time-feature encoder, sized for the 5-dimensional time features
        self.time_encoder = nn.Sequential(
            nn.Linear(5, 64),  # input dimension matches the 5 features in x_time_feat
            nn.GELU(),
            nn.LayerNorm(64),
            nn.Linear(64, embed_dim)
        )

    def forward(self, time_feat):
        """
        time_feat: time features (batch, seq_len, 5)
        """
        time_feat = self.norm(time_feat)  # normalize the raw features
        return self.time_encoder(time_feat)


class MultiScaleEncoder(nn.Module):
    """Multi-scale feature encoder"""
    def __init__(self, input_dim, d_model, nhead, conv_kernels=[3, 7, 23]):
        super().__init__()
        self.d_model = d_model
        self.conv_branches = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(input_dim, d_model, kernel_size=k, padding=k//2),
                nn.GELU(),
            ) for k in conv_kernels
        ])
        # LayerNorms kept in a separate list (applied after transposing back)
        self.layer_norms = nn.ModuleList([nn.LayerNorm(d_model) for _ in conv_kernels])
        # Transformer encoder
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model,
                nhead,
                dim_feedforward=d_model*4,
                batch_first=True  # batch-first layout
            ),
            num_layers=4
        )
        # Feature-fusion layer
        self.fusion = nn.Linear(d_model*(len(conv_kernels)+1), d_model)

    def forward(self, x, padding_mask=None):
        """
        x: input features (batch, seq_len, input_dim)
        padding_mask: padding mask (batch, seq_len), True marks real tokens
        """
        # Convolutional branches
        conv_features = []
        x_conv = x.permute(0, 2, 1)  # (batch, input_dim, seq_len)
        for i, branch in enumerate(self.conv_branches):
            feat = branch(x_conv)  # (batch, d_model, seq_len)
            # Transpose back and apply LayerNorm
            feat = feat.permute(0, 2, 1)  # (batch, seq_len, d_model)
            feat = self.layer_norms[i](feat)
            conv_features.append(feat)
        # Transformer branch
        # src_key_padding_mask expects True for positions to IGNORE, while our
        # mask uses True for real tokens, so invert it before passing it in.
        trans_feat = self.transformer(
            x,
            src_key_padding_mask=~padding_mask if padding_mask is not None else None
        )  # (batch, seq_len, d_model)
        # Concatenate and fuse the branch features
        combined = torch.cat(conv_features + [trans_feat], dim=-1)
        fused = self.fusion(combined)  # (batch, seq_len, d_model)
        return fused


class VideoPlayPredictor(nn.Module):
    def __init__(self, d_model=256, nhead=8):
        super().__init__()
        self.d_model = d_model
        # Feature embeddings
        self.time_embed = TimeEmbedding(embed_dim=64)
        self.base_embed = nn.Linear(1 + 64, d_model)  # play count + time embedding
        # Encoder
        self.encoder = MultiScaleEncoder(d_model, d_model, nhead)
        # Horizon-aware forecast head
        self.forecast_head = nn.Sequential(
            nn.Linear(2 * d_model + 1, d_model * 4),  # input is 2*d_model + 1 (bi-LSTM context + span feature)
            nn.GELU(),
            nn.Linear(d_model * 4, 1),
            nn.ReLU()  # keep the output non-negative
        )
        # Context extractor
        self.context_extractor = nn.LSTM(
            input_size=d_model,
            hidden_size=d_model,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )
        # Initialize the parameters
        self._init_weights()

    def _init_weights(self):
        for name, p in self.named_parameters():
            if 'forecast_head' in name:
                if 'weight' in name:
                    nn.init.xavier_normal_(p, gain=1e-2)  # small initial scale
                elif 'bias' in name:
                    nn.init.constant_(p, 0.0)
            elif p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, x_play, x_time_feat, padding_mask, forecast_span):
        """
        x_play: historical play counts (batch, seq_len)
        x_time_feat: time features (batch, seq_len, 5)
        padding_mask: padding mask (batch, seq_len)
        forecast_span: forecast horizon in seconds (batch, 1)
        """
        # Encode the time features
        time_emb = self.time_embed(x_time_feat)  # (batch, seq_len, 64)
        # Concatenate the base features
        base_feat = torch.cat([
            x_play.unsqueeze(-1),  # (batch, seq_len, 1)
            time_emb
        ], dim=-1)  # (batch, seq_len, 1+64)
        # Project to the model dimension
        embedded = self.base_embed(base_feat)  # (batch, seq_len, d_model)
        # Encode the features
        encoded = self.encoder(embedded, padding_mask)  # (batch, seq_len, d_model)
        # Extract the context
        context, _ = self.context_extractor(encoded)  # (batch, seq_len, d_model*2)
        context = context.mean(dim=1)  # (batch, d_model*2)
        # Fuse in the forecast-span feature
        span_feat = torch.log1p(forecast_span) / 10  # rough normalization
        combined = torch.cat([
            context,
            span_feat
        ], dim=-1)  # (batch, d_model*2 + 1)
        # Final prediction
        pred = self.forecast_head(combined)  # (batch, 1)
        return pred


class MultiTaskWrapper(nn.Module):
    """Wrapper adapting the model to the batch dict produced by collate_fn"""
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, batch):
        return self.model(
            batch['x_play'],
            batch['x_time_feat'],
            batch['padding_mask'],
            batch['forecast_span']
        )
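
The shapes noted in the forward docstrings can be checked with a dummy forward pass; a minimal sketch with random placeholder tensors (batch of 2, sequence length 16, every position marked valid):

# Sketch: verify output shape with placeholder inputs (values are arbitrary).
import torch
from model import MultiTaskWrapper, VideoPlayPredictor

model = MultiTaskWrapper(VideoPlayPredictor())
batch = {
    'x_play': torch.randn(2, 16),
    'x_time_feat': torch.randn(2, 16, 5),
    'padding_mask': torch.ones(2, 16, dtype=torch.bool),  # True = real token
    'forecast_span': torch.full((2, 1), 3600.0),  # predict 1 hour ahead
}
pred = model(batch)
print(pred.shape)  # torch.Size([2, 1]), non-negative log1p play counts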

pred/train.py (new file, 76 additions)

@@ -0,0 +1,76 @@
import numpy as np
from torch.utils.data import DataLoader
from model import MultiTaskWrapper, VideoPlayPredictor
import torch
import torch.nn.functional as F
from dataset import VideoPlayDataset, collate_fn


def train(model, dataloader, epochs=100, device='mps'):
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)
    steps = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()
            # Move the whole batch to the device
            for k, v in batch.items():
                if isinstance(v, torch.Tensor):
                    batch[k] = v.to(device)
            # Forward pass
            pred = model(batch)
            y_play = batch['y_play']
            # Debug output: undo the log1p transform to inspect raw play counts
            real = np.expm1(y_play.detach().cpu().numpy())
            yhat = np.expm1(pred.detach().cpu().numpy())
            print("real", real[:, 0].astype(int).tolist())
            print("yhat", yhat[:, 0].astype(int).tolist(),
                  pred.detach().cpu().numpy()[:, 0].tolist())
            # Weighted loss: log1p(span) gives longer horizons larger weights
            weights = torch.log1p(batch['forecast_span'])
            loss_per_sample = F.huber_loss(pred, y_play, reduction='none')
            loss = (loss_per_sample * weights).mean()
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()
            steps += 1
            print(f"Epoch {epoch+1} | Step {steps} | Loss: {loss.item():.4f}")
        scheduler.step()
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1:03d} | Loss: {avg_loss:.4f}")


# Initialize the model
device = 'mps'
model = MultiTaskWrapper(VideoPlayPredictor())
model = model.to(device)

data_dir = './data/pred'
publish_time_path = './data/pred/publish_time.csv'

dataset = VideoPlayDataset(
    data_dir=data_dir,
    publish_time_path=publish_time_path,
    min_seq_len=2,    # at least 2 history points
    max_seq_len=350,  # at most 350 history points
    min_forecast_span=60,  # minimum forecast span: 1 minute
    max_forecast_span=86400 * 10  # maximum forecast span: 10 days
)

dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,  # custom collate for variable-length sequences
)

# Start training
train(model, dataloader, epochs=20, device=device)
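
Because the model is trained to predict log1p(play count), its outputs have to be mapped back with expm1 before they can be read as view counts, just as the debug prints in the training loop do. A minimal inference sketch, reusing the model and dataloader defined above:

# Sketch: run the trained model on one batch and recover raw play counts.
model.eval()
with torch.no_grad():
    batch = next(iter(dataloader))
    for k, v in batch.items():
        batch[k] = v.to(device)
    pred = model(batch)          # (batch, 1), log1p space
    views = torch.expm1(pred)    # predicted play counts
    print(views.squeeze(1).tolist())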