# cvsa/ml_new/training/models.py
# (file-viewer header residue: "1", "0", 62 lines, 1.6 KiB, Python)

"""
Data models for dataset building functionality
"""
from typing import List, Optional, Dict, Any
from pydantic import BaseModel, Field
from datetime import datetime
class DatasetBuildRequest(BaseModel):
    """Request model for dataset building"""

    # Every field is declared through ``Field`` so that it carries a
    # description string; the first two are required (``...`` default).

    # Required: integer video AIDs.
    aid_list: List[int] = Field(
        default=...,
        description="List of video AIDs",
    )

    # Required: embedding model name; not validated against any registry
    # within this class.
    embedding_model: str = Field(
        default=...,
        description="Embedding model name",
    )

    # Optional flag, defaults to False.
    force_regenerate: bool = Field(
        default=False,
        description="Whether to force regenerate embeddings",
    )
class DatasetBuildResponse(BaseModel):
    """Response model for dataset building"""

    # Identifier of the dataset this response refers to (required).
    dataset_id: str

    # Record count reported for the dataset (required).
    total_records: int

    # Free-form status string; allowed values are not constrained here.
    status: str

    # Free-form message string (required).
    message: str

    # Optional timestamp; defaults to None when not supplied.
    created_at: Optional[datetime] = None
class DatasetRecord(BaseModel):
    """Model for a single dataset record"""

    # All seven fields are required: none declares a default.

    # Integer AID of the record.
    aid: int

    # Embedding vector as a flat list of floats; its length is not
    # validated by this model.
    embedding: List[float]

    # Boolean label attached to the record.
    label: bool

    # Arbitrary string-keyed mapping; schema is not defined here.
    metadata: Dict[str, Any]

    # List of string-keyed mappings; presumably one entry per user
    # labelling -- TODO confirm the entry schema against the producer.
    user_labels: List[Dict[str, Any]]

    # NOTE(review): presumably marks disagreement among ``user_labels``;
    # the rule that sets it is not visible in this file.
    inconsistent: bool

    # Checksum string; the hashing scheme and hashed text are not
    # specified here.
    text_checksum: str
class DatasetInfo(BaseModel):
    """Model for dataset information"""

    # All four fields are required: none declares a default.

    # Identifier of the dataset.
    dataset_id: str

    # The records themselves, each validated as a ``DatasetRecord``.
    dataset: List[DatasetRecord]

    # Arbitrary string-keyed statistics mapping; keys are not fixed by
    # this model.
    stats: Dict[str, Any]

    # Creation timestamp.
    created_at: datetime
class DatasetBuildStats(BaseModel):
    """Statistics for dataset building process"""

    # Integer counters (all required). Judging by the names they count
    # total records, newly created embeddings, reused embeddings and
    # records with inconsistent labels -- the exact counting rules live
    # in the code that fills this model, not here.
    total_records: int
    new_embeddings: int
    reused_embeddings: int
    inconsistent_labels: int

    # Embedding model name (required).
    embedding_model: str

    # Optional float, None by default; presumably elapsed seconds --
    # TODO confirm the unit with the code that sets it.
    processing_time: Optional[float] = None
class EmbeddingModelInfo(BaseModel):
    """Information about embedding models"""

    # --- required fields ---

    # Model name.
    name: str

    # Integer dimension count (presumably the embedding vector size).
    dimensions: int

    # Free-form type string; allowed values are not constrained here.
    # The field name shadows the ``type`` builtin inside the class body
    # but is part of the model's public schema.
    type: str

    # --- optional fields, each None unless provided ---

    # Endpoint string; no URL validation is applied by this model.
    api_endpoint: Optional[str] = None

    # Integer limit on tokens, if any.
    max_tokens: Optional[int] = None

    # Integer limit on batch size, if any.
    max_batch_size: Optional[int] = None