add: more features

alikia2x (寒寒) 2025-03-13 23:32:13 +08:00
parent 1de8d85d2b
commit 23e5d6a8c9
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
3 changed files with 38 additions and 22 deletions


@@ -14,7 +14,7 @@ class VideoPlayDataset(Dataset):
         self.max_future_seconds = max_future_days * 86400
         self.series_dict = self._load_and_process_data(publish_time_path)
         self.valid_series = [s for s in self.series_dict.values() if len(s['abs_time']) > 1]
-        self.feature_windows = [3600, 6*3600, 24*3600, 3*24*3600, 7*24*3600] # 1h,6h,24h,3d,7d
+        self.feature_windows = [3600, 3*3600, 6*3600, 24*3600, 3*24*3600, 7*24*3600, 60*24*3600] # 1h,3h,6h,24h,3d,7d,60d

     def _extract_features(self, series, current_idx, target_idx):
         """Extract incremental features"""
@@ -23,7 +23,7 @@ class VideoPlayDataset(Dataset):
         dt = datetime.datetime.fromtimestamp(current_time)
         # time features
         time_features = [
-            dt.hour / 24, (dt.weekday() + 1) / 7,
+            (dt.hour * 3600 + dt.minute * 60 + dt.second) / 86400, (dt.weekday() * 24 + dt.hour) / 168,
             np.log2(max(current_time - series['create_time'],1))
         ]
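For reference, a standalone sketch of the reworked time features in this hunk: seconds since midnight as a fraction of the day, hours since Monday as a fraction of the week, and the log2 age of the video. The function name is illustrative; the expressions mirror the added line above.

import datetime
import numpy as np

def time_features(current_time: float, create_time: float):
    """Sketch of the new time features (finer-grained than the old hour/24 and weekday/7)."""
    dt = datetime.datetime.fromtimestamp(current_time)
    return [
        (dt.hour * 3600 + dt.minute * 60 + dt.second) / 86400,  # fraction of the day elapsed
        (dt.weekday() * 24 + dt.hour) / 168,                    # fraction of the week elapsed
        np.log2(max(current_time - create_time, 1)),            # log2 of video age in seconds, floored at 1
    ]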
@@ -76,18 +76,18 @@ class VideoPlayDataset(Dataset):
     def _get_nearest_value(self, series, target_time, current_idx):
         """Get the data point closest to the given time"""
         min_diff = float('inf')
-        for i in range(current_idx + 1, len(series['abs_time']), 1):
+        for i in range(current_idx + 1, len(series['abs_time'])):
             diff = abs(series['abs_time'][i] - target_time)
             if diff < min_diff:
                 min_diff = diff
             else:
                 return i - 1
-        return None
+        return len(series['abs_time']) - 1

     def __getitem__(self, idx):
         series = random.choice(self.valid_series)
         current_idx = random.randint(0, len(series['abs_time'])-2)
-        target_idx = random.randint(max(0, current_idx-50), current_idx)
+        target_idx = random.randint(max(0, current_idx-10), current_idx)

         # Extract features
         features = self._extract_features(series, current_idx, target_idx)
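A standalone sketch of the fixed lookup, assuming series['abs_time'] is a sorted list of timestamps (the helper name is illustrative). The behavioural change is at the end of the scan: the old version returned None when it ran off the end of the series, while the new version falls back to the last index, so callers always receive a usable data point.

def nearest_index(abs_time, target_time, current_idx):
    """Scan forward from current_idx for the timestamp closest to target_time;
    once the absolute difference starts growing, the previous index was closest.
    If the end is reached without the difference growing, return the last index
    (the old code returned None here)."""
    min_diff = float('inf')
    for i in range(current_idx + 1, len(abs_time)):
        diff = abs(abs_time[i] - target_time)
        if diff < min_diff:
            min_diff = diff
        else:
            return i - 1
    return len(abs_time) - 1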


@@ -4,14 +4,25 @@ import torch

 def main():
     model = CompactPredictor(10).to('cpu', dtype=torch.float32)
-    model.load_state_dict(torch.load('play_predictor.pth'))
+    model.load_state_dict(torch.load('./pred/checkpoints/play_predictor.pth'))
     model.eval()
     # inference
-    data = [3,3.9315974229,5.4263146604,9.4958550269,10.9203528554,11.5835529305,13.0426853722,0.7916666667,0.2857142857,24.7794093257]
-    np_arr = np.array([data])
-    tensor = torch.from_numpy(np_arr).to('cpu', dtype=torch.float32)
-    output = model(tensor)
-    print(output)
+    last = 999469
+    for i in range(1, 48):
+        hour = i / 2
+        sec = hour * 3600
+        time_d = np.log2(sec)
+        data = [time_d, 19.9295936113, # time_delta, current_views
+                6.1575520046,8.980,10.6183855023,12.0313328273,13.2537252486, # growth_feat
+                0.625,0.2857142857,24.7794093257 # time_feat
+        ]
+        np_arr = np.array([data])
+        tensor = torch.from_numpy(np_arr).to('cpu', dtype=torch.float32)
+        output = model(tensor)
+        num = output.detach().numpy()[0][0]
+        views_pred = int(np.exp2(num)) + 999469
+        print(f"{int(15+hour)%24:02d}:{int((15+hour)*60)%60:02d}", views_pred, views_pred - last)
+        last = views_pred

 if __name__ == '__main__':
     main()
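A minimal sketch of the post-processing each loop iteration applies to the model output: the prediction is read as log2 growth, de-logged with exp2, and added to the 999469 baseline, and the printed label offsets the half-hour step from 15:00. The constant and function names are illustrative; the numbers are taken from the loop above.

import numpy as np

BASELINE_VIEWS = 999469   # current view count the predicted growth is added to
START_HOUR = 15           # wall-clock hour the half-hour steps are offset from

def decode_prediction(log2_growth, hour_offset, last_views):
    """Turn a log2-growth output into an absolute view count, an HH:MM label,
    and the increment over the previous half-hour step."""
    views_pred = int(np.exp2(log2_growth)) + BASELINE_VIEWS
    label = f"{int(START_HOUR + hour_offset) % 24:02d}:{int((START_HOUR + hour_offset) * 60) % 60:02d}"
    return label, views_pred, views_pred - last_views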


@@ -26,7 +26,7 @@ def train(model, dataloader, device, epochs=100):
             outputs = model(features)
             loss = criterion(outputs, targets)
             loss.backward()
-            #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
             optimizer.step()
             scheduler.step()
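For context, a minimal sketch of where the newly enabled torch.nn.utils.clip_grad_norm_ call sits in a training step: after loss.backward(), so gradients exist, and before optimizer.step(). The helper name and the zero_grad() placement are assumptions; only the clip/step ordering is taken from the hunk above.

import torch

def training_step(model, features, targets, criterion, optimizer, scheduler, max_norm=1.0):
    """One optimizer step with gradient clipping: rescale the global gradient
    norm to at most max_norm before applying the update."""
    optimizer.zero_grad()
    loss = criterion(model(features), targets)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)  # clip after backward, before step
    optimizer.step()
    scheduler.step()  # the hunk above also steps the LR scheduler every batch
    return loss.item()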
@@ -49,16 +49,21 @@ def train(model, dataloader, device, epochs=100):
             writer.add_scalar('Params/Mean', sum(param_means)/len(param_means), global_step)

             samples_count = len(targets)
-            r = random.randint(0, samples_count-1)
-            t = float(torch.exp2(targets[r])) - 1
-            o = float(torch.exp2(outputs[r])) - 1
-            d = features[r].cpu().numpy()[0]
-            speed = np.exp2(features[r].cpu().numpy()[2])
-            time_diff = np.exp2(d) / 3600
-            inc = speed * time_diff
-            model_error = abs(t - o)
-            reg_error = abs(inc - t)
-            print(f"{t:07.1f} | {o:07.1f} | {d:07.1f} | {inc:07.1f} | {model_error < reg_error}")
+            good = 0
+            for r in range(samples_count):
+                r = random.randint(0, samples_count-1)
+                t = float(torch.exp2(targets[r])) - 1
+                o = float(torch.exp2(outputs[r])) - 1
+                d = features[r].cpu().numpy()[0]
+                speed = np.exp2(features[r].cpu().numpy()[5]) / 24
+                time_diff = np.exp2(d) / 3600
+                inc = speed * time_diff
+                model_error = abs(t - o)
+                reg_error = abs(inc - t)
+                if model_error < reg_error:
+                    good += 1
+                #print(f"{t:07.1f} | {o:07.1f} | {d:07.1f} | {inc:07.1f} | {good/samples_count*100:.1f}%")
+            writer.add_scalar('Train/WinRate', good/samples_count, global_step)

         print(f"Epoch {epoch+1} | Avg Loss: {total_loss/len(dataloader):.4f}")