# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    A table of shape ``(1, max_len, embedding_dim)`` is computed once in the
    constructor and stored as the buffer ``pe``. Even feature columns hold
    ``sin(position * div_term)`` and odd feature columns hold
    ``cos(position * div_term)``.

    Args:
        embedding_dim: Size of the feature (last) dimension. Both even and
            odd values are supported.
        max_len: Number of positions to precompute. ``forward`` slices the
            table to the input's sequence length, so inputs longer than
            ``max_len`` cannot be broadcast against it and the addition fails.
    """

    def __init__(self, embedding_dim, max_len=5000):
        super().__init__()

        # Column vector of positions: shape (max_len, 1).
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per even feature index: ceil(embedding_dim / 2) values.
        div_term = torch.exp(
            torch.arange(0, embedding_dim, 2).float()
            * (-math.log(10000.0) / embedding_dim)
        )
        angles = position * div_term  # (max_len, ceil(embedding_dim / 2))

        pe = torch.zeros(max_len, embedding_dim)
        pe[:, 0::2] = torch.sin(angles)
        # When embedding_dim is odd there is one fewer odd column than even
        # column, so drop the surplus frequency instead of failing on a
        # shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : embedding_dim // 2])

        # Leading batch dimension -> (1, max_len, embedding_dim). Registered
        # as a buffer so it follows the module across devices and is saved in
        # the state dict without being a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the positional encoding for its first positions.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, embedding_dim)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        seq_length = x.size(1)
        # The size-1 batch dimension of the table broadcasts over the batch.
        return x + self.pe[:, :seq_length]

class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention without masking.

    A single linear layer produces queries, keys and values for all heads;
    a second linear layer projects the concatenated head outputs back to
    ``input_dim``.

    Args:
        input_dim: Size of the feature (last) dimension of the input.
        heads: Number of attention heads. Must be positive and divide
            ``input_dim`` evenly.

    Raises:
        ValueError: If ``heads`` is not positive or does not divide
            ``input_dim``.
    """

    def __init__(self, input_dim, heads):
        super().__init__()
        # Fail at construction time with a clear message; otherwise the
        # problem only surfaces later as an opaque reshape error in forward().
        if heads <= 0 or input_dim % heads != 0:
            raise ValueError(
                f"input_dim ({input_dim}) must be divisible by a positive "
                f"number of heads (got heads={heads})"
            )
        self.heads = heads
        # 1 / sqrt(head_dim), applied to the raw attention scores.
        self.scale = (input_dim // heads) ** -0.5
        self.qkv = nn.Linear(input_dim, input_dim * 3)
        self.fc = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        """Apply self-attention over the sequence dimension.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, embedding_dim)``.

        Returns:
            Tensor of shape ``(batch_size, seq_length, embedding_dim)``.
        """
        batch_size, seq_length, embedding_dim = x.shape
        head_dim = embedding_dim // self.heads

        # Split the fused projection into (batch, seq, heads, 3, head_dim);
        # index 0/1/2 along dim 3 selects query/key/value.
        qkv = self.qkv(x).view(batch_size, seq_length, self.heads, 3, head_dim)
        # Move heads ahead of the sequence axis: (batch, heads, seq, head_dim).
        q, k, v = (part.transpose(1, 2) for part in qkv.unbind(dim=3))

        # (batch, heads, seq, seq) scores, normalized over the key positions.
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = F.softmax(attn_weights, dim=-1)

        # Weighted sum of values, then merge the heads back into one
        # feature dimension.
        attention_output = torch.matmul(attn_weights, v)
        attention_output = attention_output.transpose(1, 2).contiguous()
        attention_output = attention_output.view(batch_size, seq_length, embedding_dim)

        return self.fc(attention_output)

class AttentionBasedModel(nn.Module):
    """Sequence classifier over POS-tag indices built from self-attention.

    Pipeline: tag embedding -> positional encoding -> a stack of
    self-attention layers (each followed by a residual connection and layer
    normalization) -> mean over the sequence -> two-layer classification
    head.

    Args:
        pos_vocab_size: Number of distinct POS tags (embedding table rows).
        embedding_dim: Size of each tag embedding.
        num_classes: Number of output classes (size of the logits).
        heads: Number of attention heads in every self-attention layer.
        num_attention_layers: Number of stacked self-attention layers.
        dim_feedforward: Hidden size of the classification head.
        max_len: Number of positions precomputed by the positional encoding.
        dropout: Dropout probability applied inside the classification head.
    """

    def __init__(
        self,
        pos_vocab_size,
        embedding_dim=128,
        num_classes=6,
        heads=8,
        num_attention_layers=3,
        dim_feedforward=512,
        max_len=128,
        dropout=0.5,
    ):
        super().__init__()
        self.embedding = nn.Embedding(pos_vocab_size, embedding_dim)  # Embedding for POS tags
        self.positional_encoding = PositionalEncoding(embedding_dim, max_len)  # Positional Encoding
        self.self_attention_layers = nn.ModuleList([
            SelfAttention(embedding_dim, heads) for _ in range(num_attention_layers)
        ])
        self.fc1 = nn.Linear(embedding_dim, dim_feedforward)
        self.fc2 = nn.Linear(dim_feedforward, num_classes)
        self.dropout = nn.Dropout(dropout)
        # NOTE: a single LayerNorm instance is reused after every attention
        # layer, so all layers share the same normalization parameters.
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Compute class logits for a batch of POS-tag sequences.

        Args:
            x: Integer tensor of tag indices, shape
                ``(batch_size, seq_length)``. ``nn.Embedding`` looks indices
                up directly, so the input must not be one-hot encoded.

        Returns:
            Logits of shape ``(batch_size, num_classes)``.
        """
        # Tag indices -> embeddings: (batch_size, seq_length, embedding_dim).
        x = self.embedding(x)

        # Add positional encoding to embeddings
        x = self.positional_encoding(x)

        # Residual connection around each attention layer, then normalize.
        for attn_layer in self.self_attention_layers:
            x = self.norm(attn_layer(x) + x)

        # Pool by averaging over the sequence dimension. No mask is applied,
        # so every position contributes equally to the mean.
        pooled_output = torch.mean(x, dim=1)

        # Fully connected layers for classification
        x = F.relu(self.fc1(pooled_output))
        x = self.dropout(x)
        return self.fc2(x)  # Logits, one per class


# Example Usage
# # Hyperparameters
# pos_vocab_size = 50  # Size of the POS tag vocabulary
# max_context_length = 128  # Maximum context length
# embedding_dim = 128  # Embedding size
# num_classes = 6  # Output classes
# batch_size = 32  # Example batch size

# # Model initialization
# model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes)

# # Example input: batch of integer POS-tag indices, shape (batch_size, max_context_length)
# input_data = torch.randint(0, pos_vocab_size, (batch_size, max_context_length))  # Random input for testing

# # Forward pass
# output = model(input_data)  # Output shape will be (batch_size, num_classes)

# print(output.shape)  # Should print torch.Size([batch_size, num_classes])