add: text-difficulty/grammar
This commit is contained in:
parent ae6f10a6f0
commit 33754146c8
3629
text-difficulty/grammar/EGP.csv
Normal file
File diff suppressed because it is too large
11829
text-difficulty/grammar/EGP_Derivied.csv
Normal file
File diff suppressed because it is too large
103
text-difficulty/grammar/article.py
Normal file
@@ -0,0 +1,103 @@
import torch
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from training.model import AttentionBasedModel

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load pre-saved mappings
pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

# Load the pre-trained model and state
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))

# Define helper functions
def pad_sequence(seq, max_len):
    # Pad with -1 up to max_len; truncate anything longer
    return (seq + [-1] * (max_len - len(seq)))[:max_len]

def encode_pos_tags(tagged_sentence):
    # Map each POS tag to its index; unseen tags fall back to -1
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

# Split a sentence into smaller chunks based on punctuation and length constraints
def split_long_sentence(sentence, max_len=128):
    tokens = word_tokenize(sentence)

    if len(tokens) <= max_len:
        return [sentence]

    # Attempt to split at punctuation marks
    punctuation_marks = [',', ';', ':', '!', '?', '.', '-']
    split_chunks = []
    current_chunk = []

    for token in tokens:
        current_chunk.append(token)
        if token in punctuation_marks and len(current_chunk) >= max_len // 2:
            split_chunks.append(' '.join(current_chunk))
            current_chunk = []

    if current_chunk:
        split_chunks.append(' '.join(current_chunk))

    # If chunks are still too long, split them at hard token boundaries
    final_chunks = []
    for chunk in split_chunks:
        chunk_tokens = word_tokenize(chunk)
        if len(chunk_tokens) > max_len:
            final_chunks.extend([' '.join(chunk_tokens[i:i + max_len]) for i in range(0, len(chunk_tokens), max_len)])
        else:
            final_chunks.append(chunk)

    return final_chunks

# Main function to process and score a single chunk
def score_sentence(sentence, model, max_length=128):
    # Tokenize and POS-tag the sentence
    tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))

    # Encode the POS tags and pad the sequence
    encoded_sentences = encode_pos_tags(tagged_sentence)
    padded_sentence = torch.tensor(pad_sequence(encoded_sentences, max_length))

    # Set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

    # Prepare the model
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode

    # Weights for the six CEFR levels (A1..C2)
    w_list = [1.04, 1.64, 2.35, 3.44, 4.92, 6.13]

    # Inference without gradient calculation
    with torch.no_grad():
        sentence_tensor = padded_sentence.to(device)
        sentence_tensor = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension

        # Forward pass through the model
        outputs = model(sentence_tensor)

        # Softmax and weighted scoring
        probabilities = torch.softmax(outputs[0], dim=0)
        score = sum(probabilities[i] * w_list[i] for i in range(6)).cpu().numpy()

    return score

# Process a long article and return a score for each chunk
def score_article(article, max_length=128, chunk_max_len=128):
    sentences = sent_tokenize(article)  # Split the article into sentences
    score_list = []

    for sentence in sentences:
        chunks = split_long_sentence(sentence, max_len=chunk_max_len)
        for chunk in chunks:
            score = score_sentence(chunk, model, max_length=max_length)
            score_list.append(float(score))

    return score_list
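A minimal usage sketch for the scorer above (the sample text is a placeholder; each per-chunk score lands roughly on the 1-6 scale defined by w_list):

if __name__ == '__main__':
    sample = (
        "Smartphones have worked their way deep into our lives. "
        "They have become indispensable for work and socialising."
    )
    scores = score_article(sample)
    print(scores)                     # one weighted score per chunk
    print(sum(scores) / len(scores))  # crude article-level average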
11819
text-difficulty/grammar/data.csv
Normal file
File diff suppressed because it is too large
82
text-difficulty/grammar/data_deriving.py
Normal file
@@ -0,0 +1,82 @@
import os
from dotenv import load_dotenv
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

load_dotenv()

client = OpenAI(
    api_key=os.getenv("API_KEY"),
    base_url=os.getenv("BASE_URL"),
)

def get_AI_response(text, client, model_name, temp):
    messages = [
        {"role": "user", "content": text},
    ]

    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=temp,
    )

    return response.choices[0].message.content

def get_Examples(df, row, client, model_name, temp):
    exp = df["Example"][row]
    cds = df["Can-do statement"][row]
    gdw = df["guideword"][row]
    lvl = df["Level"][row]
    cat = df["SuperCategory"][row] + '/' + df["SubCategory"][row]
    prompt = \
f'''Generate 10 example sentences based on the following instructions.
Pay close attention to the 'Can-do Statement' and ensure all generated sentences adhere strictly to it.
Provide only the sentences without any additional formatting or markdown.
Output the sentences in plain text, one sentence per line, and do not include empty lines.
INSTRUCTION
Level: {lvl}
Guideword: {gdw}
Can-do Statement: {cds}
Category: {cat}
Example Sentences:
{exp}
'''
    return get_AI_response(prompt, client, model_name, temp)

def process_chunk(df, chunk, client, model, temp):
    results = []
    for row in chunk:
        exps = get_Examples(df, row, client, model, temp)
        results.append(exps)
    return results

input_file = './EGP.csv'
df = pd.read_csv(input_file)
newdf = df.copy()
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))

chunk_size = 64
total_rows = len(df.index)
num_chunks = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division

with tqdm(total=total_rows) as pbar:
    for chunk_idx in range(num_chunks):
        start = chunk_idx * chunk_size
        end = min(start + chunk_size, total_rows)
        chunk = range(start, end)

        with ThreadPoolExecutor(max_workers=len(chunk)) as executor:
            futures = {executor.submit(get_Examples, df, row, client, model, temp): row for row in chunk}  # bind each row to its future
            for future in as_completed(futures):
                row = futures[future]  # recover the row this future belongs to
                result = future.result()  # generated sentences from the model
                newdf.at[row, "Example"] = result  # write back to the correct row

        pbar.update(len(chunk))
        newdf.to_csv("output.csv", index=False)  # checkpoint after every chunk

newdf.to_csv("EGP_Derivied.csv", index=False)
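The script reads its credentials and generation settings from a .env file. A minimal sketch, with the variable names taken from the os.getenv calls above and all values invented placeholders:

# .env (placeholder values, not the ones used in this commit)
API_KEY=your-api-key
BASE_URL=https://api.example.com/v1
TRANSLATION_MODEL=your-model-name
TRANSLATION_TEMP=0.7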
23
text-difficulty/grammar/data_postprocessing.py
Normal file
@@ -0,0 +1,23 @@
import pandas as pd

df = pd.read_csv("EGP_Derivied.csv")
newdf = pd.DataFrame()

levels_list = []
sentences_list = []
category_list = []
for line in range(len(df.index)):
    # Each generated "Example" cell holds one sentence per line; drop empty lines
    examples = list(filter(None, df["Example"][line].split("\n")))
    lvl = df["Level"][line]
    cat = df["SuperCategory"][line] + '/' + df["SubCategory"][line]
    for sentence in examples:
        sentences_list.append(sentence)
        levels_list.append(lvl)
        category_list.append(cat)

newdf["Level"] = levels_list
newdf["Category"] = category_list
newdf["Sentence"] = sentences_list

newdf.to_csv("data.csv", index=False)
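For intuition, this explodes each derived EGP row into one row per generated sentence. A hypothetical example (column values invented for illustration):

# Input row (EGP_Derivied.csv):
#   Level=A1, SuperCategory=Verbs, SubCategory=Tense,
#   Example="I eat rice.\nShe eats rice."
# Output rows (data.csv: Level, Category, Sentence):
#   A1, Verbs/Tense, I eat rice.
#   A1, Verbs/Tense, She eats rice.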
111
text-difficulty/grammar/training/model.py
Normal file
@@ -0,0 +1,111 @@
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Create a positional encoding matrix of shape (max_len, embedding_dim)
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Add a batch dimension, so the shape becomes (1, max_len, embedding_dim)
        pe = pe.unsqueeze(0)

        # Register the positional encoding as a buffer so it won't be updated by the optimizer
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x is expected to have shape (batch_size, seq_length, embedding_dim)
        seq_length = x.size(1)
        # Add positional encoding to the input
        x = x + self.pe[:, :seq_length]
        return x

class SelfAttention(nn.Module):
    def __init__(self, input_dim, heads):
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.scale = (input_dim // heads) ** -0.5  # 1/sqrt(head_dim)
        self.qkv = nn.Linear(input_dim, input_dim * 3)  # joint Q, K, V projection
        self.fc = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        batch_size, seq_length, embedding_dim = x.shape
        qkv = self.qkv(x).view(
            batch_size, seq_length, self.heads, 3, embedding_dim // self.heads
        )
        q, k, v = qkv[..., 0, :], qkv[..., 1, :], qkv[..., 2, :]
        # Rearrange to (batch_size, heads, seq_length, head_dim)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = F.softmax(attn_weights, dim=-1)

        attention_output = torch.matmul(attn_weights, v)
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(batch_size, seq_length, embedding_dim)

        return self.fc(attention_output)

class AttentionBasedModel(nn.Module):
    def __init__(self, pos_vocab_size, embedding_dim=128, num_classes=6, heads=8, num_attention_layers=3, dim_feedforward=512, max_len=128):
        super(AttentionBasedModel, self).__init__()
        self.embedding = nn.Embedding(pos_vocab_size, embedding_dim)  # Embedding for POS tags
        self.positional_encoding = PositionalEncoding(embedding_dim, max_len)  # Positional encoding
        self.self_attention_layers = nn.ModuleList([
            SelfAttention(embedding_dim, heads) for _ in range(num_attention_layers)
        ])
        self.fc1 = nn.Linear(embedding_dim, dim_feedforward)
        self.fc2 = nn.Linear(dim_feedforward, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        # Input x is a tensor of POS-tag indices, shape: (batch_size, seq_length)
        x = self.embedding(x)  # Convert POS tags to embeddings, shape: (batch_size, seq_length, embedding_dim)

        # Add positional encoding to the embeddings
        x = self.positional_encoding(x)

        # Stack of self-attention layers with residual connections and LayerNorm
        for attn_layer in self.self_attention_layers:
            attn_output = attn_layer(x)
            x = self.norm(attn_output + x)

        # Pool the output by taking the mean over the sequence dimension
        pooled_output = torch.mean(x, dim=1)

        # Fully connected layers for classification
        x = F.relu(self.fc1(pooled_output))
        x = self.dropout(x)
        x = self.fc2(x)  # Output logits for the six classes

        return x

# Example usage
# # Hyperparameters
# pos_vocab_size = 50       # Size of the POS tag vocabulary
# max_context_length = 128  # Maximum context length
# embedding_dim = 128       # Embedding size
# num_classes = 6           # Output classes
# batch_size = 32           # Example batch size

# # Model initialization
# model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes)

# # Example input: batch of POS tag indices (padded to max_context_length)
# input_data = torch.randint(0, pos_vocab_size, (batch_size, max_context_length))  # Random input for testing

# # Forward pass
# output = model(input_data)  # Output shape will be (batch_size, num_classes)

# print(output.shape)  # Should print torch.Size([batch_size, num_classes])
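A runnable variant of the commented example, useful as a quick shape check if appended to model.py (uses the constructor defaults above, not the training hyperparameters):

if __name__ == "__main__":
    model = AttentionBasedModel(pos_vocab_size=50)
    dummy = torch.randint(0, 50, (4, 128))  # batch of 4 sequences of POS-tag indices
    out = model(dummy)
    assert out.shape == (4, 6)  # (batch_size, num_classes)
    print(out.shape)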
150
text-difficulty/grammar/training/train.py
Normal file
@@ -0,0 +1,150 @@
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Load the model classes
from model import AttentionBasedModel

# Load the data
df = pd.read_csv('data.csv')

# Step 1: Extract sentences and corresponding levels
sentences = df['Sentence'].values
levels = df['Level'].values

# Step 2: Tokenize and POS-tag each sentence
pos_tags = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in sentences]

# Step 3: Build the POS tag vocabulary
# Extract unique POS tags from the dataset
pos_vocab = set()
for tagged_sentence in pos_tags:
    for _, tag in tagged_sentence:
        pos_vocab.add(tag)

# Create a mapping from POS tag to index
pos2idx = {pos: idx for idx, pos in enumerate(pos_vocab)}
pos_vocab_size = len(pos2idx)

# Step 4: Encode sentences into POS tag indices
def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] for _, tag in tagged_sentence]

encoded_sentences = [encode_pos_tags(tagged_sentence) for tagged_sentence in pos_tags]

# Step 5: Encode levels (classes) into integers
le = LabelEncoder()
encoded_levels = le.fit_transform(levels)
num_classes = len(le.classes_)

# Save the class encoding mapping
class_mapping = dict(zip(le.transform(le.classes_), le.classes_))
torch.save(class_mapping, 'class_mapping.pt')

# Save the POS tag encoding mapping
torch.save(pos2idx, 'pos2idx.pt')

# Step 6: Pad sentences to a fixed length
max_length = 64

def pad_sequence(seq, max_len):
    # Pad with -1 up to max_len; truncate anything longer so batches stack cleanly
    return (seq + [-1] * (max_len - len(seq)))[:max_len]

padded_sentences = [pad_sequence(seq, max_length) for seq in encoded_sentences]

# Step 7: Create a PyTorch Dataset and DataLoader
class POSDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = torch.tensor(self.sentences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sentence, label

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sentences, encoded_levels, test_size=0.2)

train_dataset = POSDataset(X_train, y_train)
val_dataset = POSDataset(X_val, y_val)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Step 8: Initialize the model, loss function, and optimizer
embedding_dim = 32
heads = 8
num_attention_layers = 6
dim_feedforward = 256
learning_rate = 0.003

model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes, heads, num_attention_layers, dim_feedforward)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Step 9: Training loop
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

step = 0

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for sentences, labels in train_loader:
        sentences, labels = sentences.to(device), labels.to(device)

        # Forward pass
        outputs = model(sentences)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step += 1

        running_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for sentences, labels in val_loader:
            sentences, labels = sentences.to(device), labels.to(device)

            outputs = model(sentences)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Print training and validation stats
    print(f'Epoch [{epoch+1}/{num_epochs}], Step {step}, Loss: {running_loss/len(train_loader):.4f}, '
          f'Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {100 * correct / total:.2f}%')
    torch.save(model.state_dict(), f'checkpoints/step_{step}.pt')

# Step 10: Save the trained model
torch.save(model.state_dict(), 'model.pt')
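The class_mapping saved above maps each integer class index back to its level label. A small sketch of how it can be used downstream (assumes logits shaped (batch, num_classes), as produced by the model):

import torch

def decode_levels(logits, mapping_path='class_mapping.pt'):
    # Map argmax class indices back to their level labels
    mapping = torch.load(mapping_path)  # {class_index: level_label}
    return [mapping[int(i)] for i in torch.argmax(logits, dim=1)]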
46
text-difficulty/grammar/validation/inference.py
Normal file
@@ -0,0 +1,46 @@
from training.model import AttentionBasedModel
import torch
import nltk

sentence = '''Smartphones have worked their way deep into our lives and have become indispensable for work and socialising.'''

pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

def pad_sequence(seq, max_len):
    # Pad with -1 up to max_len; truncate anything longer
    return (seq + [-1] * (max_len - len(seq)))[:max_len]

def encode_pos_tags(tagged_sentence):
    # Map each POS tag to its index; unseen tags fall back to -1
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

max_length = 64

tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
encoded_sentences = encode_pos_tags(tagged_sentence)
padded_sentence = torch.tensor(pad_sequence(encoded_sentences, max_length))

device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))
model.to(device)
model.eval()  # Ensure the model is in evaluation mode

w_list = [1.35, 1.63, 2.75, 3.64, 5.38, 6.32]
cefr_dict = [None, "A1", "A2", "B1", "B2", "C1", "C2"]
with torch.no_grad():
    sentence_tensor = padded_sentence.to(device)
    sentences = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension

    outputs = model(sentences)
    print(torch.max(outputs, 1))
    print(outputs[0])
    print(torch.softmax(outputs[0], 0))
    s = 0
    for i in range(6):
        s += torch.softmax(outputs[0], 0)[i] * w_list[i]
    s = s.cpu().numpy()
    # s is the final weighted difficulty score
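One way the otherwise-unused cefr_dict could turn the weighted score into a band label; the rounding rule here is an assumption for illustration, not part of the commit:

    # Assumed rule: clamp round(s) to 1..6 and index into cefr_dict (A1..C2)
    band = cefr_dict[min(6, max(1, round(float(s))))]
    print(band)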