add: text-difficulty/grammar

alikia2x (寒寒) 2024-10-02 21:11:23 +08:00
parent ae6f10a6f0
commit 33754146c8
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
9 changed files with 27792 additions and 0 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,103 @@
import torch
import torch.nn.functional as F
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from training.model import AttentionBasedModel

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load pre-saved mappings
pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

# Load the pre-trained model and state
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))

# Helper functions
# Note: -1 doubles as the padding and unknown-tag index to match the
# training script; nn.Embedding does not accept negative indices, so a
# dedicated padding index would be a safer design.
def pad_sequence(seq, max_len):
    return seq + [-1] * (max_len - len(seq))

def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

# Split a sentence into smaller chunks based on punctuation and length constraints
def split_long_sentence(sentence, max_len=128):
    tokens = word_tokenize(sentence)
    if len(tokens) <= max_len:
        return [sentence]
    # Attempt to split at punctuation marks
    punctuation_marks = [',', ';', ':', '!', '?', '.', '-']
    split_chunks = []
    current_chunk = []
    for token in tokens:
        current_chunk.append(token)
        if token in punctuation_marks and len(current_chunk) >= max_len // 2:
            split_chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        split_chunks.append(' '.join(current_chunk))
    # If chunks are still too long, truncate them
    final_chunks = []
    for chunk in split_chunks:
        chunk_tokens = word_tokenize(chunk)
        if len(chunk_tokens) > max_len:
            final_chunks.extend([' '.join(chunk_tokens[i:i + max_len]) for i in range(0, len(chunk_tokens), max_len)])
        else:
            final_chunks.append(chunk)
    return final_chunks

# Process and score a single chunk
def score_sentence(sentence, model, max_length=128):
    # Tokenize and POS-tag the sentence
    tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
    # Encode the POS tags and pad the sequence
    encoded_sentence = encode_pos_tags(tagged_sentence)
    padded_sentence = torch.tensor(pad_sequence(encoded_sentence, max_length))
    # Set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
    # Prepare the model
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode
    # Per-class difficulty weights for the six CEFR levels (A1..C2)
    w_list = [1.04, 1.64, 2.35, 3.44, 4.92, 6.13]
    # Inference without gradient calculation
    with torch.no_grad():
        sentence_tensor = padded_sentence.to(device)
        sentence_tensor = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension
        # Forward pass through the model
        outputs = model(sentence_tensor)
        # Softmax and weighted scoring
        probabilities = torch.softmax(outputs[0], dim=0)
        score = sum(probabilities[i] * w_list[i] for i in range(6)).cpu().numpy()
    return score

# Process a long article and return a list of scores, one per chunk
def score_article(article, max_length=128, chunk_max_len=128):
    sentences = sent_tokenize(article)  # Split the article into sentences
    score_list = []
    for sentence in sentences:
        chunks = split_long_sentence(sentence, max_len=chunk_max_len)
        for chunk in chunks:
            score = score_sentence(chunk, model, max_length=max_length)
            score_list.append(float(score))
    return score_list
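
# Usage sketch (illustrative): assumes model.pt, pos2idx.pt and
# class_mapping.pt exist next to this script; the sample article below is
# made up for demonstration.
if __name__ == "__main__":
    sample_article = ("Smartphones are everywhere. They have reshaped how "
                      "people communicate, work, and relax.")
    print(score_article(sample_article))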

File diff suppressed because it is too large


@@ -0,0 +1,82 @@
import os
from dotenv import load_dotenv
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
load_dotenv()
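
# Fail fast if required configuration is missing (a small sketch; the
# variable names match those read elsewhere in this script).
for name in ("API_KEY", "BASE_URL", "TRANSLATION_MODEL", "TRANSLATION_TEMP"):
    if not os.getenv(name):
        raise RuntimeError(f"Missing required environment variable: {name}")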
client = OpenAI(
    api_key=os.getenv("API_KEY"),
    base_url=os.getenv("BASE_URL"),
)
def get_AI_response(text, client, model_name, temp):
    messages = [
        {"role": "user", "content": text},
    ]
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=temp,
    )
    return response.choices[0].message.content

def get_Examples(df, row, client, model_name, temp):
    exp = df["Example"][row]
    cds = df["Can-do statement"][row]
    gdw = df["guideword"][row]
    lvl = df["Level"][row]
    cat = df["SuperCategory"][row] + '/' + df["SubCategory"][row]
    prompt = f'''Generate 10 example sentences based on the following instructions.
Pay close attention to the 'Can-do Statement' and ensure all generated sentences adhere strictly to it.
Provide only the sentences without any additional formatting or markdown.
Output the sentences in plain text, one sentence per line, with no empty lines.
INSTRUCTION
Level: {lvl}
Guideword: {gdw}
Can-do Statement: {cds}
Category: {cat}
Example Sentences:
{exp}
'''
    return get_AI_response(prompt, client, model_name, temp)

# Note: process_chunk is kept for reference but is not used below; the main
# loop submits rows to the thread pool directly.
def process_chunk(df, chunk, client, model, temp):
    results = []
    for row in chunk:
        exps = get_Examples(df, row, client, model, temp)
        results.append(exps)
    return results
input_file = './EGP.csv'
df = pd.read_csv(input_file)
newdf = df.copy()
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))
chunk_size = 64
total_rows = len(df.index)
num_chunks = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division

with tqdm(total=total_rows) as pbar:
    for chunk_idx in range(num_chunks):
        start = chunk_idx * chunk_size
        end = min(start + chunk_size, total_rows)
        chunk = range(start, end)
        with ThreadPoolExecutor(max_workers=len(chunk)) as executor:
            # Bind each submitted future to its row index
            futures = {executor.submit(get_Examples, df, row, client, model, temp): row for row in chunk}
            for future in as_completed(futures):
                row = futures[future]  # Recover the row index for this future
                result = future.result()  # Get the generated examples
                newdf.at[row, "Example"] = result  # Write the result back to the correct row
        pbar.update(len(chunk))

newdf.to_csv("output.csv", index=False)
newdf.to_csv("EGP_Derivied.csv", index=False)


@@ -0,0 +1,23 @@
import pandas as pd

df = pd.read_csv("EGP_Derivied.csv")
newdf = pd.DataFrame()
levels_list = []
sentences_list = []
category_list = []

for line in range(len(df.index)):
    examples = list(filter(None, df["Example"][line].split("\n")))
    lvl = df["Level"][line]
    cat = df["SuperCategory"][line] + '/' + df["SubCategory"][line]
    for sentence in examples:
        sentences_list.append(sentence)
        levels_list.append(lvl)
        category_list.append(cat)

newdf["Level"] = levels_list
newdf["Category"] = category_list
newdf["Sentence"] = sentences_list
newdf.to_csv("data.csv", index=False)
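
# Quick sanity check (sketch): each row of data.csv should now hold a single
# sentence with its level and category.
check = pd.read_csv("data.csv")
print(len(check), "sentences")
print(check["Level"].value_counts())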


@@ -0,0 +1,111 @@
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Create a positional encoding matrix of shape (max_len, embedding_dim)
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Add a batch dimension, so the shape becomes (1, max_len, embedding_dim)
        pe = pe.unsqueeze(0)
        # Register the positional encoding as a buffer so it won't be updated by the optimizer
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x is expected to have shape (batch_size, seq_length, embedding_dim)
        seq_length = x.size(1)
        # Add positional encoding to the input
        x = x + self.pe[:, :seq_length]
        return x

class SelfAttention(nn.Module):
    def __init__(self, input_dim, heads):
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.scale = (input_dim // heads) ** -0.5
        self.qkv = nn.Linear(input_dim, input_dim * 3)
        self.fc = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        batch_size, seq_length, embedding_dim = x.shape
        # Project to queries, keys and values in one pass, then split heads
        qkv = self.qkv(x).view(
            batch_size, seq_length, self.heads, 3, embedding_dim // self.heads
        )
        q, k, v = qkv[..., 0, :], qkv[..., 1, :], qkv[..., 2, :]
        # Reorder to (batch, heads, seq, head_dim)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)
        # Scaled dot-product attention
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = F.softmax(attn_weights, dim=-1)
        attention_output = torch.matmul(attn_weights, v)
        # Merge heads back into the embedding dimension
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(batch_size, seq_length, embedding_dim)
        return self.fc(attention_output)
class AttentionBasedModel(nn.Module):
    def __init__(self, pos_vocab_size, embedding_dim=128, num_classes=6, heads=8, num_attention_layers=3, dim_feedforward=512, max_len=128):
        super(AttentionBasedModel, self).__init__()
        self.embedding = nn.Embedding(pos_vocab_size, embedding_dim)  # Embedding for POS tags
        self.positional_encoding = PositionalEncoding(embedding_dim, max_len)  # Positional encoding
        self.self_attention_layers = nn.ModuleList([
            SelfAttention(embedding_dim, heads) for _ in range(num_attention_layers)
        ])
        self.fc1 = nn.Linear(embedding_dim, dim_feedforward)
        self.fc2 = nn.Linear(dim_feedforward, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        # Input x is a batch of POS-tag index sequences, shape: (batch_size, seq_length)
        x = self.embedding(x)  # Convert POS-tag indices to embeddings, shape: (batch_size, seq_length, embedding_dim)
        # Add positional encoding to the embeddings
        x = self.positional_encoding(x)
        # Stack of residual self-attention blocks with layer normalization
        for attn_layer in self.self_attention_layers:
            attn_output = attn_layer(x)
            x = self.norm(attn_output + x)
        # Pool the output by taking the mean over the sequence length
        pooled_output = torch.mean(x, dim=1)
        # Fully connected layers for classification
        x = F.relu(self.fc1(pooled_output))
        x = self.dropout(x)
        x = self.fc2(x)  # Output logits for the 6 classes
        return x

# Example usage
#
# # Hyperparameters
# pos_vocab_size = 50       # Size of the POS tag vocabulary
# max_context_length = 128  # Maximum context length
# embedding_dim = 128       # Embedding size
# num_classes = 6           # Output classes
# batch_size = 32           # Example batch size
#
# # Model initialization
# model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes)
#
# # Example input: a batch of random POS-tag indices (stand-in for real sequences)
# input_data = torch.randint(0, pos_vocab_size, (batch_size, max_context_length))
#
# # Forward pass
# output = model(input_data)  # Output shape will be (batch_size, num_classes)
# print(output.shape)  # Should print torch.Size([batch_size, num_classes])


@@ -0,0 +1,150 @@
import os

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Load the model classes
from model import AttentionBasedModel

# Load the data
df = pd.read_csv('data.csv')
# Step 1: Extract sentences and corresponding levels
sentences = df['Sentence'].values
levels = df['Level'].values

# Step 2: Tokenize and POS-tag each sentence
pos_tags = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in sentences]

# Step 3: Build the POS tag vocabulary
# Extract unique POS tags from the dataset
pos_vocab = set()
for tagged_sentence in pos_tags:
    for _, tag in tagged_sentence:
        pos_vocab.add(tag)

# Create a mapping from POS tag to index
pos2idx = {pos: idx for idx, pos in enumerate(pos_vocab)}
pos_vocab_size = len(pos2idx)

# Step 4: Encode sentences into POS tag indices
def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] for _, tag in tagged_sentence]

encoded_sentences = [encode_pos_tags(tagged_sentence) for tagged_sentence in pos_tags]

# Step 5: Encode levels (classes) into integers
le = LabelEncoder()
encoded_levels = le.fit_transform(levels)
num_classes = len(le.classes_)

# Save the class encoding mapping
class_mapping = dict(zip(le.transform(le.classes_), le.classes_))
torch.save(class_mapping, 'class_mapping.pt')

# Save the POS tag encoding mapping
torch.save(pos2idx, 'pos2idx.pt')

# Step 6: Pad sentences to a fixed length
# Note: -1 doubles as the padding index here; nn.Embedding does not accept
# negative indices, so a dedicated padding index would be a safer design.
max_length = 64

def pad_sequence(seq, max_len):
    return seq + [-1] * (max_len - len(seq))

padded_sentences = [pad_sequence(seq, max_length) for seq in encoded_sentences]
# Step 7: Create a PyTorch Dataset and DataLoader
class POSDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = torch.tensor(self.sentences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sentence, label

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sentences, encoded_levels, test_size=0.2)

train_dataset = POSDataset(X_train, y_train)
val_dataset = POSDataset(X_val, y_val)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
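
# Sketch: inspect one batch to confirm tensor shapes before training,
# assuming the loaders defined above.
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)  # expected: (batch_size, max_length) and (batch_size,)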
# Step 8: Initialize the model, loss function, and optimizer
embedding_dim = 32
heads = 8
num_attention_layers = 6
dim_feedforward = 256
learning_rate = 0.003
model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes, heads, num_attention_layers, dim_feedforward)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Step 9: Training loop
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

os.makedirs('checkpoints', exist_ok=True)  # Ensure the checkpoint directory exists

step = 0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for sentences, labels in train_loader:
        sentences, labels = sentences.to(device), labels.to(device)
        # Forward pass
        outputs = model(sentences)
        loss = criterion(outputs, labels)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step += 1
        running_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for sentences, labels in val_loader:
            sentences, labels = sentences.to(device), labels.to(device)
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Print training and validation stats
    print(f'Epoch [{epoch+1}/{num_epochs}], Step {step}, Loss: {running_loss/len(train_loader):.4f}, '
          f'Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {100 * correct / total:.2f}%')
    torch.save(model.state_dict(), f'checkpoints/step_{step}.pt')

# Step 10: Save the trained model
torch.save(model.state_dict(), 'model.pt')
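
# Sketch: the inference scripts construct the model with a hard-coded
# pos_vocab_size of 40; it can instead be recovered from the mapping saved
# above, which avoids a silent mismatch if the tag set changes:
#
#   pos2idx = torch.load('pos2idx.pt')
#   model = AttentionBasedModel(len(pos2idx), embedding_dim, num_classes,
#                               heads, num_attention_layers, dim_feedforward)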


@@ -0,0 +1,46 @@
from training.model import AttentionBasedModel
import torch
import torch.nn.functional as F
import nltk

sentence = '''Smartphones have worked their way deep into our lives and have become indispensable for work and socialising.'''

pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

def pad_sequence(seq, max_len):
    return seq + [-1] * (max_len - len(seq))

def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

max_length = 64
tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
encoded_sentence = encode_pos_tags(tagged_sentence)
padded_sentence = torch.tensor(pad_sequence(encoded_sentence, max_length))

device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))
model.to(device)
model.eval()  # Ensure the model is in evaluation mode

# Per-class difficulty weights for the six CEFR levels (A1..C2)
w_list = [1.35, 1.63, 2.75, 3.64, 5.38, 6.32]
cefr_dict = [None, "A1", "A2", "B1", "B2", "C1", "C2"]

with torch.no_grad():
    sentence_tensor = padded_sentence.to(device)
    sentences = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension
    outputs = model(sentences)
    print(torch.max(outputs, 1))
    print(outputs[0])
    probabilities = torch.softmax(outputs[0], 0)
    print(probabilities)
    # Weighted sum of class probabilities: s is the final difficulty score
    s = 0
    for i in range(6):
        s += probabilities[i] * w_list[i]
    s = s.cpu().numpy()
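
# Sketch: one possible way to map the weighted score back to a CEFR band is
# to snap s to the nearest per-level weight. This mapping is an assumption
# layered on top of w_list, not something the pipeline itself defines.
nearest_level = min(range(6), key=lambda i: abs(float(s) - w_list[i]))
print(cefr_dict[nearest_level + 1])  # e.g. "B2"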