add: text-difficulty/grammar

alikia2x (寒寒) 2024-10-02 21:11:23 +08:00
parent ae6f10a6f0
commit 33754146c8
Signed by: alikia2x
GPG Key ID: 56209E0CCD8420C6
9 changed files with 27792 additions and 0 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,103 @@
import torch
import torch.nn.functional as F
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from training.model import AttentionBasedModel

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load pre-saved mappings
pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

# Load the pre-trained model and state
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))

# Helper functions
# Note: -1 doubles as the padding and unknown-tag index to match the
# training script; nn.Embedding does not accept negative indices, so a
# dedicated padding index would be a safer design.
def pad_sequence(seq, max_len):
    return seq + [-1] * (max_len - len(seq))

def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

# Split a sentence into smaller chunks based on punctuation and length constraints
def split_long_sentence(sentence, max_len=128):
    tokens = word_tokenize(sentence)
    if len(tokens) <= max_len:
        return [sentence]
    # Attempt to split at punctuation marks
    punctuation_marks = [',', ';', ':', '!', '?', '.', '-']
    split_chunks = []
    current_chunk = []
    for token in tokens:
        current_chunk.append(token)
        if token in punctuation_marks and len(current_chunk) >= max_len // 2:
            split_chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        split_chunks.append(' '.join(current_chunk))
    # If chunks are still too long, truncate them
    final_chunks = []
    for chunk in split_chunks:
        chunk_tokens = word_tokenize(chunk)
        if len(chunk_tokens) > max_len:
            final_chunks.extend([' '.join(chunk_tokens[i:i + max_len]) for i in range(0, len(chunk_tokens), max_len)])
        else:
            final_chunks.append(chunk)
    return final_chunks

# Process and score a single chunk
def score_sentence(sentence, model, max_length=128):
    # Tokenize and POS-tag the sentence
    tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
    # Encode the POS tags and pad the sequence
    encoded_sentence = encode_pos_tags(tagged_sentence)
    padded_sentence = torch.tensor(pad_sequence(encoded_sentence, max_length))
    # Set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
    # Prepare the model
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode
    # Per-class difficulty weights for the six CEFR levels (A1..C2)
    w_list = [1.04, 1.64, 2.35, 3.44, 4.92, 6.13]
    # Inference without gradient calculation
    with torch.no_grad():
        sentence_tensor = padded_sentence.to(device)
        sentence_tensor = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension
        # Forward pass through the model
        outputs = model(sentence_tensor)
        # Softmax and weighted scoring
        probabilities = torch.softmax(outputs[0], dim=0)
        score = sum(probabilities[i] * w_list[i] for i in range(6)).cpu().numpy()
    return score

# Process a long article and return a list of scores, one per chunk
def score_article(article, max_length=128, chunk_max_len=128):
    sentences = sent_tokenize(article)  # Split the article into sentences
    score_list = []
    for sentence in sentences:
        chunks = split_long_sentence(sentence, max_len=chunk_max_len)
        for chunk in chunks:
            score = score_sentence(chunk, model, max_length=max_length)
            score_list.append(float(score))
    return score_list
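
# Usage sketch (illustrative): assumes model.pt, pos2idx.pt and
# class_mapping.pt exist next to this script; the sample article below is
# made up for demonstration.
if __name__ == "__main__":
    sample_article = ("Smartphones are everywhere. They have reshaped how "
                      "people communicate, work, and relax.")
    print(score_article(sample_article))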

File diff suppressed because it is too large


@@ -0,0 +1,82 @@
import os
from dotenv import load_dotenv
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
load_dotenv()
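
# Fail fast if required configuration is missing (a small sketch; the
# variable names match those read elsewhere in this script).
for name in ("API_KEY", "BASE_URL", "TRANSLATION_MODEL", "TRANSLATION_TEMP"):
    if not os.getenv(name):
        raise RuntimeError(f"Missing required environment variable: {name}")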
client = OpenAI(
    api_key=os.getenv("API_KEY"),
    base_url=os.getenv("BASE_URL"),
)
def get_AI_response(text, client, model_name, temp):
    messages = [
        {"role": "user", "content": text},
    ]
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=temp,
    )
    return response.choices[0].message.content

def get_Examples(df, row, client, model_name, temp):
    exp = df["Example"][row]
    cds = df["Can-do statement"][row]
    gdw = df["guideword"][row]
    lvl = df["Level"][row]
    cat = df["SuperCategory"][row] + '/' + df["SubCategory"][row]
    prompt = f'''Generate 10 example sentences based on the following instructions.
Pay close attention to the 'Can-do Statement' and ensure all generated sentences adhere strictly to it.
Provide only the sentences without any additional formatting or markdown.
Output the sentences in plain text, one sentence per line, with no empty lines.
INSTRUCTION
Level: {lvl}
Guideword: {gdw}
Can-do Statement: {cds}
Category: {cat}
Example Sentences:
{exp}
'''
    return get_AI_response(prompt, client, model_name, temp)

# Note: process_chunk is kept for reference but is not used below; the main
# loop submits rows to the thread pool directly.
def process_chunk(df, chunk, client, model, temp):
    results = []
    for row in chunk:
        exps = get_Examples(df, row, client, model, temp)
        results.append(exps)
    return results
input_file = './EGP.csv'
df = pd.read_csv(input_file)
newdf = df.copy()
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))
chunk_size = 64
total_rows = len(df.index)
num_chunks = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division

with tqdm(total=total_rows) as pbar:
    for chunk_idx in range(num_chunks):
        start = chunk_idx * chunk_size
        end = min(start + chunk_size, total_rows)
        chunk = range(start, end)
        with ThreadPoolExecutor(max_workers=len(chunk)) as executor:
            # Bind each submitted future to its row index
            futures = {executor.submit(get_Examples, df, row, client, model, temp): row for row in chunk}
            for future in as_completed(futures):
                row = futures[future]  # Recover the row index for this future
                result = future.result()  # Get the generated examples
                newdf.at[row, "Example"] = result  # Write the result back to the correct row
        pbar.update(len(chunk))

newdf.to_csv("output.csv", index=False)
newdf.to_csv("EGP_Derivied.csv", index=False)


@@ -0,0 +1,23 @@
import pandas as pd

df = pd.read_csv("EGP_Derivied.csv")
newdf = pd.DataFrame()
levels_list = []
sentences_list = []
category_list = []

for line in range(len(df.index)):
    examples = list(filter(None, df["Example"][line].split("\n")))
    lvl = df["Level"][line]
    cat = df["SuperCategory"][line] + '/' + df["SubCategory"][line]
    for sentence in examples:
        sentences_list.append(sentence)
        levels_list.append(lvl)
        category_list.append(cat)

newdf["Level"] = levels_list
newdf["Category"] = category_list
newdf["Sentence"] = sentences_list
newdf.to_csv("data.csv", index=False)
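
# Quick sanity check (sketch): each row of data.csv should now hold a single
# sentence with its level and category.
check = pd.read_csv("data.csv")
print(len(check), "sentences")
print(check["Level"].value_counts())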


@@ -0,0 +1,111 @@
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Create a positional encoding matrix of shape (max_len, embedding_dim)
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Add a batch dimension, so the shape becomes (1, max_len, embedding_dim)
        pe = pe.unsqueeze(0)
        # Register the positional encoding as a buffer so it won't be updated by the optimizer
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x is expected to have shape (batch_size, seq_length, embedding_dim)
        seq_length = x.size(1)
        # Add positional encoding to the input
        x = x + self.pe[:, :seq_length]
        return x

class SelfAttention(nn.Module):
    def __init__(self, input_dim, heads):
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.scale = (input_dim // heads) ** -0.5
        self.qkv = nn.Linear(input_dim, input_dim * 3)
        self.fc = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        batch_size, seq_length, embedding_dim = x.shape
        # Project to queries, keys and values in one pass, then split heads
        qkv = self.qkv(x).view(
            batch_size, seq_length, self.heads, 3, embedding_dim // self.heads
        )
        q, k, v = qkv[..., 0, :], qkv[..., 1, :], qkv[..., 2, :]
        # Reorder to (batch, heads, seq, head_dim)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)
        # Scaled dot-product attention
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = F.softmax(attn_weights, dim=-1)
        attention_output = torch.matmul(attn_weights, v)
        # Merge heads back into the embedding dimension
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(batch_size, seq_length, embedding_dim)
        return self.fc(attention_output)
class AttentionBasedModel(nn.Module):
    def __init__(self, pos_vocab_size, embedding_dim=128, num_classes=6, heads=8, num_attention_layers=3, dim_feedforward=512, max_len=128):
        super(AttentionBasedModel, self).__init__()
        self.embedding = nn.Embedding(pos_vocab_size, embedding_dim)  # Embedding for POS tags
        self.positional_encoding = PositionalEncoding(embedding_dim, max_len)  # Positional encoding
        self.self_attention_layers = nn.ModuleList([
            SelfAttention(embedding_dim, heads) for _ in range(num_attention_layers)
        ])
        self.fc1 = nn.Linear(embedding_dim, dim_feedforward)
        self.fc2 = nn.Linear(dim_feedforward, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        # Input x is a batch of POS-tag index sequences, shape: (batch_size, seq_length)
        x = self.embedding(x)  # Convert POS-tag indices to embeddings, shape: (batch_size, seq_length, embedding_dim)
        # Add positional encoding to the embeddings
        x = self.positional_encoding(x)
        # Stack of residual self-attention blocks with layer normalization
        for attn_layer in self.self_attention_layers:
            attn_output = attn_layer(x)
            x = self.norm(attn_output + x)
        # Pool the output by taking the mean over the sequence length
        pooled_output = torch.mean(x, dim=1)
        # Fully connected layers for classification
        x = F.relu(self.fc1(pooled_output))
        x = self.dropout(x)
        x = self.fc2(x)  # Output logits for the 6 classes
        return x

# Example usage
#
# # Hyperparameters
# pos_vocab_size = 50       # Size of the POS tag vocabulary
# max_context_length = 128  # Maximum context length
# embedding_dim = 128       # Embedding size
# num_classes = 6           # Output classes
# batch_size = 32           # Example batch size
#
# # Model initialization
# model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes)
#
# # Example input: a batch of random POS-tag indices (stand-in for real sequences)
# input_data = torch.randint(0, pos_vocab_size, (batch_size, max_context_length))
#
# # Forward pass
# output = model(input_data)  # Output shape will be (batch_size, num_classes)
# print(output.shape)  # Should print torch.Size([batch_size, num_classes])


@@ -0,0 +1,150 @@
import os

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Load the model classes
from model import AttentionBasedModel

# Load the data
df = pd.read_csv('data.csv')
# Step 1: Extract sentences and corresponding levels
sentences = df['Sentence'].values
levels = df['Level'].values

# Step 2: Tokenize and POS-tag each sentence
pos_tags = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in sentences]

# Step 3: Build the POS tag vocabulary
# Extract unique POS tags from the dataset
pos_vocab = set()
for tagged_sentence in pos_tags:
    for _, tag in tagged_sentence:
        pos_vocab.add(tag)

# Create a mapping from POS tag to index
pos2idx = {pos: idx for idx, pos in enumerate(pos_vocab)}
pos_vocab_size = len(pos2idx)

# Step 4: Encode sentences into POS tag indices
def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] for _, tag in tagged_sentence]

encoded_sentences = [encode_pos_tags(tagged_sentence) for tagged_sentence in pos_tags]

# Step 5: Encode levels (classes) into integers
le = LabelEncoder()
encoded_levels = le.fit_transform(levels)
num_classes = len(le.classes_)

# Save the class encoding mapping
class_mapping = dict(zip(le.transform(le.classes_), le.classes_))
torch.save(class_mapping, 'class_mapping.pt')

# Save the POS tag encoding mapping
torch.save(pos2idx, 'pos2idx.pt')

# Step 6: Pad sentences to a fixed length
# Note: -1 doubles as the padding index here; nn.Embedding does not accept
# negative indices, so a dedicated padding index would be a safer design.
max_length = 64

def pad_sequence(seq, max_len):
    return seq + [-1] * (max_len - len(seq))

padded_sentences = [pad_sequence(seq, max_length) for seq in encoded_sentences]
# Step 7: Create a PyTorch Dataset and DataLoader
class POSDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = torch.tensor(self.sentences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sentence, label

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sentences, encoded_levels, test_size=0.2)

train_dataset = POSDataset(X_train, y_train)
val_dataset = POSDataset(X_val, y_val)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
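
# Sketch: inspect one batch to confirm tensor shapes before training,
# assuming the loaders defined above.
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)  # expected: (batch_size, max_length) and (batch_size,)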
# Step 8: Initialize the model, loss function, and optimizer
embedding_dim = 32
heads = 8
num_attention_layers = 6
dim_feedforward = 256
learning_rate = 0.003
model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes, heads, num_attention_layers, dim_feedforward)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Step 9: Training loop
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

os.makedirs('checkpoints', exist_ok=True)  # Ensure the checkpoint directory exists

step = 0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for sentences, labels in train_loader:
        sentences, labels = sentences.to(device), labels.to(device)
        # Forward pass
        outputs = model(sentences)
        loss = criterion(outputs, labels)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step += 1
        running_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for sentences, labels in val_loader:
            sentences, labels = sentences.to(device), labels.to(device)
            outputs = model(sentences)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Print training and validation stats
    print(f'Epoch [{epoch+1}/{num_epochs}], Step {step}, Loss: {running_loss/len(train_loader):.4f}, '
          f'Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {100 * correct / total:.2f}%')
    torch.save(model.state_dict(), f'checkpoints/step_{step}.pt')

# Step 10: Save the trained model
torch.save(model.state_dict(), 'model.pt')
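
# Sketch: the inference scripts construct the model with a hard-coded
# pos_vocab_size of 40; it can instead be recovered from the mapping saved
# above, which avoids a silent mismatch if the tag set changes:
#
#   pos2idx = torch.load('pos2idx.pt')
#   model = AttentionBasedModel(len(pos2idx), embedding_dim, num_classes,
#                               heads, num_attention_layers, dim_feedforward)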


@@ -0,0 +1,46 @@
from training.model import AttentionBasedModel
import torch
import torch.nn.functional as F
import nltk

sentence = '''Smartphones have worked their way deep into our lives and have become indispensable for work and socialising.'''

pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

def pad_sequence(seq, max_len):
    return seq + [-1] * (max_len - len(seq))

def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

max_length = 64
tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
encoded_sentence = encode_pos_tags(tagged_sentence)
padded_sentence = torch.tensor(pad_sequence(encoded_sentence, max_length))

device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))
model.to(device)
model.eval()  # Ensure the model is in evaluation mode

# Per-class difficulty weights for the six CEFR levels (A1..C2)
w_list = [1.35, 1.63, 2.75, 3.64, 5.38, 6.32]
cefr_dict = [None, "A1", "A2", "B1", "B2", "C1", "C2"]

with torch.no_grad():
    sentence_tensor = padded_sentence.to(device)
    sentences = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension
    outputs = model(sentences)
    print(torch.max(outputs, 1))
    print(outputs[0])
    probabilities = torch.softmax(outputs[0], 0)
    print(probabilities)
    # Weighted sum of class probabilities: s is the final difficulty score
    s = 0
    for i in range(6):
        s += probabilities[i] * w_list[i]
    s = s.cpu().numpy()
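
# Sketch: one possible way to map the weighted score back to a CEFR band is
# to snap s to the nearest per-level weight. This mapping is an assumption
# layered on top of w_list, not something the pipeline itself defines.
nearest_level = min(range(6), key=lambda i: abs(float(s) - w_list[i]))
print(cefr_dict[nearest_level + 1])  # e.g. "B2"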