62 lines
1.6 KiB
Python
62 lines
1.6 KiB
Python
"""
|
|
Data models for dataset building functionality
|
|
"""
|
|
|
|
from typing import List, Optional, Dict, Any
|
|
from pydantic import BaseModel, Field
|
|
from datetime import datetime
|
|
|
|
|
|
class DatasetBuildRequest(BaseModel):
    """Request model for dataset building"""

    # Required fields: `Field(...)` marks a field as having no default, so
    # validation fails when the caller omits it.
    aid_list: List[int] = Field(..., description="List of video AIDs")
    embedding_model: str = Field(..., description="Embedding model name")

    # Optional flag; stays False unless the caller sets it explicitly.
    force_regenerate: bool = Field(
        default=False,
        description="Whether to force regenerate embeddings",
    )
|
|
|
|
|
|
class DatasetBuildResponse(BaseModel):
    """Response model for dataset building"""

    # Required fields (no defaults declared).
    dataset_id: str
    total_records: int
    # NOTE(review): the set of allowed status values is not visible in this
    # file; treat `status` as free-form text until confirmed with the producer.
    status: str
    message: str

    # May be left out by the producer, in which case it is None.
    created_at: Optional[datetime] = None
|
|
|
|
|
|
class DatasetRecord(BaseModel):
    """Model for a single dataset record"""

    # Every field below is required; none declares a default.

    # NOTE(review): presumably the video AID this record was built from
    # (compare `aid_list` on the request model) - confirm with the builder.
    aid: int
    # Embedding vector; its expected length is not fixed by this model.
    embedding: List[float]
    label: bool

    # Free-form mappings: keys and value types are not constrained here.
    metadata: Dict[str, Any]
    user_labels: List[Dict[str, Any]]

    # NOTE(review): presumably set when the entries in `user_labels`
    # disagree - confirm against the code that fills this field.
    inconsistent: bool
    # NOTE(review): the checksum algorithm and the text it covers are not
    # visible in this file.
    text_checksum: str
|
|
|
|
|
|
class DatasetInfo(BaseModel):
    """Model for dataset information"""

    # Every field below is required; none declares a default.
    dataset_id: str
    # The records themselves, each validated as a DatasetRecord.
    dataset: List[DatasetRecord]
    # Free-form statistics mapping; its keys are not constrained here.
    stats: Dict[str, Any]
    # Unlike DatasetBuildResponse.created_at, this timestamp is mandatory.
    created_at: datetime
|
|
|
|
|
|
class DatasetBuildStats(BaseModel):
    """Statistics for dataset building process"""

    # Required integer counters.
    total_records: int
    new_embeddings: int
    reused_embeddings: int
    inconsistent_labels: int

    # Required: name of the embedding model.
    embedding_model: str

    # Optional; None when no value was supplied.
    # NOTE(review): the unit (presumably seconds) is not stated in this file.
    processing_time: Optional[float] = None
|
|
|
|
|
|
class EmbeddingModelInfo(BaseModel):
    """Information about embedding models"""

    # Required fields (no defaults declared).
    name: str
    dimensions: int
    # The field name shadows the builtin `type` inside this class body. It is
    # part of the model's public interface, so it is kept as-is.
    type: str

    # Optional settings; each is None when not supplied.
    api_endpoint: Optional[str] = None
    max_tokens: Optional[int] = None
    max_batch_size: Optional[int] = None