167 lines
4.8 KiB
Python
167 lines
4.8 KiB
Python
"""
|
|
Data models for dataset building functionality
|
|
"""
|
|
|
|
from typing import List, Optional, Dict, Any, Literal
|
|
import uuid
|
|
from pydantic import BaseModel, Field
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
|
|
|
|
class TaskStatus(str, Enum):
    """Lifecycle states of a task.

    Members mix in ``str``, so each one compares equal to its plain
    string value (e.g. ``TaskStatus.PENDING == "pending"``).
    """

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
|
|
|
|
|
class SamplingStrategy(str, Enum):
    """Available strategies for selecting AIDs when sampling.

    Members mix in ``str`` and therefore compare equal to their string values.
    """

    # Use every labeled AID.
    ALL = "all"
    # Draw a random sample from the labeled data.
    RANDOM = "random"
|
|
|
|
|
|
class TaskProgress(BaseModel):
    """Snapshot of how far a task has advanced.

    The four leading fields are required; ``message`` and
    ``estimated_time_remaining`` default to ``None``.
    """

    # --- required ---
    current_step: str
    total_steps: int
    completed_steps: int
    # NOTE(review): scale (0-1 vs 0-100) is not fixed by this model — confirm with the producer.
    percentage: float

    # --- optional ---
    message: Optional[str] = None
    # NOTE(review): unit is not specified here (presumably seconds) — confirm.
    estimated_time_remaining: Optional[float] = None
|
|
|
|
|
|
class DatasetBuildTaskStatus(BaseModel):
    """Full state record of a dataset building task.

    Combines the build parameters (``aid_list``, ``embedding_model``,
    ``force_regenerate``) with the task's status, timestamps, progress,
    and outcome. Fields with a ``None`` default may be omitted.
    """

    # Identity and current state.
    task_id: str
    status: TaskStatus
    dataset_id: Optional[str] = None

    # Build parameters.
    aid_list: List[int]
    embedding_model: str
    force_regenerate: bool

    # Progress and failure details.
    progress: Optional[TaskProgress] = None
    error_message: Optional[str] = None

    # Timestamps; only ``created_at`` is mandatory.
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None

    # Free-form result payload; its schema is not defined by this model.
    result: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
class DatasetBuildRequest(BaseModel):
    """Request model for dataset building.

    ``aid_list`` and ``embedding_model`` are required. When ``id`` is
    omitted, a fresh UUID4 string is generated for each request instance.
    """

    # Use default_factory so the UUID is generated per instance. A plain
    # ``str(uuid.uuid4())`` default is evaluated once when the class is
    # defined, so every request without an explicit id would share one ID.
    id: Optional[str] = Field(
        default_factory=lambda: str(uuid.uuid4()),
        description="Dataset ID",
    )
    aid_list: List[int] = Field(..., description="List of video AIDs")
    embedding_model: str = Field(..., description="Embedding model name")
    force_regenerate: bool = Field(False, description="Whether to force regenerate embeddings")
    description: Optional[str] = Field(None, description="Optional description for the dataset")
|
|
|
|
|
|
class DatasetBuildResponse(BaseModel):
    """Payload returned for a dataset build.

    ``description`` and ``created_at`` are optional and default to ``None``;
    all other fields are required.
    """

    dataset_id: str
    total_records: int
    # Plain string here, not the TaskStatus enum.
    status: str
    message: str

    # Optional extras.
    description: Optional[str] = None
    created_at: Optional[datetime] = None
|
|
|
|
|
|
class DatasetRecord(BaseModel):
    """One entry of a dataset; every field is required."""

    aid: int
    # Embedding vector; its length is not constrained by this model.
    embedding: List[float]
    label: bool
    # Free-form metadata; key schema is not defined here.
    metadata: Dict[str, Any]
    # NOTE(review): presumably the individual user labels behind ``label`` — confirm.
    user_labels: List[Dict[str, Any]]
    inconsistent: bool
    # NOTE(review): checksum algorithm/input is not visible here — confirm with the producer.
    text_checksum: str
|
|
|
|
|
|
class DatasetInfo(BaseModel):
    """A dataset with its records, statistics, and creation time.

    Every field is required.
    """

    dataset_id: str
    # The dataset contents, one DatasetRecord per entry.
    dataset: List[DatasetRecord]
    # Untyped statistics mapping; schema is not enforced by this model.
    stats: Dict[str, Any]
    created_at: datetime
|
|
|
|
|
|
class DatasetBuildStats(BaseModel):
    """Counters describing a dataset building run.

    Only ``processing_time`` is optional (defaults to ``None``).
    """

    # Record and embedding counters.
    total_records: int
    new_embeddings: int
    reused_embeddings: int
    inconsistent_labels: int

    embedding_model: str

    # NOTE(review): unit is not specified here (presumably seconds) — confirm.
    processing_time: Optional[float] = None
|
|
|
|
|
|
class EmbeddingModelInfo(BaseModel):
    """Descriptor of an embedding model.

    ``name``, ``dimensions`` and ``type`` are required; the endpoint and
    limit fields are optional and default to ``None``.
    """

    # --- required ---
    name: str
    dimensions: int
    # Plain string; the set of allowed values is not constrained here.
    type: str

    # --- optional ---
    api_endpoint: Optional[str] = None
    max_tokens: Optional[int] = None
    max_batch_size: Optional[int] = None
|
|
|
|
|
|
# Sampling and Dataset Selection Models
|
|
|
|
class SamplingRequest(BaseModel):
    """Describes how AIDs should be sampled for a dataset.

    ``strategy`` is required; ``limit`` is optional and defaults to ``None``.
    """

    strategy: SamplingStrategy = Field(
        ...,
        description="Sampling strategy to use",
    )
    limit: Optional[int] = Field(
        default=None,
        description="Maximum number of AIDs to sample (for random sampling)",
    )
|
|
|
|
|
|
class SamplingResponse(BaseModel):
    """Outcome of a dataset sampling operation.

    All fields are required except ``filters_applied``, which defaults
    to ``None``.
    """

    strategy: SamplingStrategy

    # Counts alongside the sampled AIDs themselves.
    total_available: int
    sampled_count: int
    aid_list: List[int]

    # Untyped mappings; their key schemas are not defined by this model.
    filters_applied: Optional[Dict[str, Any]] = None
    sampling_info: Dict[str, Any]
|
|
|
|
|
|
class DatasetCreateRequest(BaseModel):
    """Request to create a dataset whose AIDs come from a sampling step.

    ``sampling`` and ``embedding_model`` are required;
    ``force_regenerate`` defaults to ``False`` and ``description`` to ``None``.
    """

    sampling: SamplingRequest = Field(
        ...,
        description="Sampling configuration",
    )
    embedding_model: str = Field(
        ...,
        description="Embedding model name",
    )
    force_regenerate: bool = Field(
        default=False,
        description="Whether to force regenerate embeddings",
    )
    description: Optional[str] = Field(
        default=None,
        description="Optional description for the dataset",
    )
|
|
|
|
|
|
class DatasetCreateResponse(BaseModel):
    """Payload returned for a dataset creation request.

    Carries the sampling outcome together with the identifiers of the
    dataset and its task. Only ``description`` is optional.
    """

    dataset_id: str
    sampling_response: SamplingResponse
    task_id: str
    total_records: int
    # Plain string here, not the TaskStatus enum.
    status: str
    message: str

    description: Optional[str] = None
|
|
|
|
|
|
class TaskStatusResponse(BaseModel):
    """Payload returned by the task status endpoint.

    ``task_id``, ``status`` and ``created_at`` are required; every other
    field defaults to ``None``.
    """

    task_id: str
    status: TaskStatus

    # Untyped mappings: progress is a plain dict here, not a TaskProgress model.
    progress: Optional[Dict[str, Any]] = None
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None

    # Timestamps; only ``created_at`` is mandatory.
    created_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
|
|
|
|
|
|
class TaskListResponse(BaseModel):
    """Payload returned when listing tasks, with per-status counters.

    Every field is required.
    """

    tasks: List[TaskStatusResponse]
    total_count: int

    # Per-status counters.
    # NOTE(review): there is no counter for TaskStatus.CANCELLED — confirm
    # whether that omission is intentional.
    pending_count: int
    running_count: int
    completed_count: int
    failed_count: int