add: text-difficulty/grammar
This commit is contained in:
parent ae6f10a6f0
commit 33754146c8
3629
text-difficulty/grammar/EGP.csv
Normal file
File diff suppressed because it is too large
11829
text-difficulty/grammar/EGP_Derivied.csv
Normal file
File diff suppressed because it is too large
103
text-difficulty/grammar/article.py
Normal file
@@ -0,0 +1,103 @@
import torch
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from training.model import AttentionBasedModel

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load pre-saved mappings
pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

# Load the pre-trained model and state
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))

# Define helper functions
def pad_sequence(seq, max_len):
    # Pad with -1 up to max_len; truncate anything longer
    return (seq + [-1] * (max_len - len(seq)))[:max_len]

def encode_pos_tags(tagged_sentence):
    # Map each POS tag to its index; unseen tags fall back to -1
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

# Split a sentence into smaller chunks based on punctuation and length constraints
def split_long_sentence(sentence, max_len=128):
    tokens = word_tokenize(sentence)

    if len(tokens) <= max_len:
        return [sentence]

    # Attempt to split at punctuation marks
    punctuation_marks = [',', ';', ':', '!', '?', '.', '-']
    split_chunks = []
    current_chunk = []

    for token in tokens:
        current_chunk.append(token)
        if token in punctuation_marks and len(current_chunk) >= max_len // 2:
            split_chunks.append(' '.join(current_chunk))
            current_chunk = []

    if current_chunk:
        split_chunks.append(' '.join(current_chunk))

    # If chunks are still too long, split them at hard token boundaries
    final_chunks = []
    for chunk in split_chunks:
        chunk_tokens = word_tokenize(chunk)
        if len(chunk_tokens) > max_len:
            final_chunks.extend([' '.join(chunk_tokens[i:i + max_len]) for i in range(0, len(chunk_tokens), max_len)])
        else:
            final_chunks.append(chunk)

    return final_chunks

# Main function to process and score a single chunk
def score_sentence(sentence, model, max_length=128):
    # Tokenize and POS-tag the sentence
    tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))

    # Encode the POS tags and pad the sequence
    encoded_sentences = encode_pos_tags(tagged_sentence)
    padded_sentence = torch.tensor(pad_sequence(encoded_sentences, max_length))

    # Set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

    # Prepare the model
    model.to(device)
    model.eval()  # Ensure the model is in evaluation mode

    # Weights for the six CEFR levels (A1..C2)
    w_list = [1.04, 1.64, 2.35, 3.44, 4.92, 6.13]

    # Inference without gradient calculation
    with torch.no_grad():
        sentence_tensor = padded_sentence.to(device)
        sentence_tensor = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension

        # Forward pass through the model
        outputs = model(sentence_tensor)

        # Softmax and weighted scoring
        probabilities = torch.softmax(outputs[0], dim=0)
        score = sum(probabilities[i] * w_list[i] for i in range(6)).cpu().numpy()

    return score

# Process a long article and return a score for each chunk
def score_article(article, max_length=128, chunk_max_len=128):
    sentences = sent_tokenize(article)  # Split the article into sentences
    score_list = []

    for sentence in sentences:
        chunks = split_long_sentence(sentence, max_len=chunk_max_len)
        for chunk in chunks:
            score = score_sentence(chunk, model, max_length=max_length)
            score_list.append(float(score))

    return score_list
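A minimal usage sketch for the scorer above (the sample text is a placeholder; each per-chunk score lands roughly on the 1-6 scale defined by w_list):

if __name__ == '__main__':
    sample = (
        "Smartphones have worked their way deep into our lives. "
        "They have become indispensable for work and socialising."
    )
    scores = score_article(sample)
    print(scores)                     # one weighted score per chunk
    print(sum(scores) / len(scores))  # crude article-level average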
11819
text-difficulty/grammar/data.csv
Normal file
File diff suppressed because it is too large
82
text-difficulty/grammar/data_deriving.py
Normal file
@@ -0,0 +1,82 @@
import os
from dotenv import load_dotenv
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

load_dotenv()

client = OpenAI(
    api_key=os.getenv("API_KEY"),
    base_url=os.getenv("BASE_URL"),
)

def get_AI_response(text, client, model_name, temp):
    messages = [
        {"role": "user", "content": text},
    ]

    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=temp,
    )

    return response.choices[0].message.content

def get_Examples(df, row, client, model_name, temp):
    exp = df["Example"][row]
    cds = df["Can-do statement"][row]
    gdw = df["guideword"][row]
    lvl = df["Level"][row]
    cat = df["SuperCategory"][row] + '/' + df["SubCategory"][row]
    prompt = \
f'''Generate 10 example sentences based on the following instructions.
Pay close attention to the 'Can-do Statement' and ensure all generated sentences adhere strictly to it.
Provide only the sentences without any additional formatting or markdown.
Output the sentences in plain text, one sentence per line, and do not include empty lines.
INSTRUCTION
Level: {lvl}
Guideword: {gdw}
Can-do Statement: {cds}
Category: {cat}
Example Sentences:
{exp}
'''
    return get_AI_response(prompt, client, model_name, temp)

def process_chunk(df, chunk, client, model, temp):
    results = []
    for row in chunk:
        exps = get_Examples(df, row, client, model, temp)
        results.append(exps)
    return results

input_file = './EGP.csv'
df = pd.read_csv(input_file)
newdf = df.copy()
model = os.getenv("TRANSLATION_MODEL")
temp = float(os.getenv("TRANSLATION_TEMP"))

chunk_size = 64
total_rows = len(df.index)
num_chunks = (total_rows + chunk_size - 1) // chunk_size  # Ceiling division

with tqdm(total=total_rows) as pbar:
    for chunk_idx in range(num_chunks):
        start = chunk_idx * chunk_size
        end = min(start + chunk_size, total_rows)
        chunk = range(start, end)

        with ThreadPoolExecutor(max_workers=len(chunk)) as executor:
            futures = {executor.submit(get_Examples, df, row, client, model, temp): row for row in chunk}  # bind each row to its future
            for future in as_completed(futures):
                row = futures[future]  # recover the row this future belongs to
                result = future.result()  # generated sentences from the model
                newdf.at[row, "Example"] = result  # write back to the correct row

        pbar.update(len(chunk))
        newdf.to_csv("output.csv", index=False)  # checkpoint after every chunk

newdf.to_csv("EGP_Derivied.csv", index=False)
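The script reads its credentials and generation settings from a .env file. A minimal sketch, with the variable names taken from the os.getenv calls above and all values invented placeholders:

# .env (placeholder values, not the ones used in this commit)
API_KEY=your-api-key
BASE_URL=https://api.example.com/v1
TRANSLATION_MODEL=your-model-name
TRANSLATION_TEMP=0.7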
23
text-difficulty/grammar/data_postprocessing.py
Normal file
@@ -0,0 +1,23 @@
import pandas as pd

df = pd.read_csv("EGP_Derivied.csv")
newdf = pd.DataFrame()

levels_list = []
sentences_list = []
category_list = []
for line in range(len(df.index)):
    # Each generated "Example" cell holds one sentence per line; drop empty lines
    examples = list(filter(None, df["Example"][line].split("\n")))
    lvl = df["Level"][line]
    cat = df["SuperCategory"][line] + '/' + df["SubCategory"][line]
    for sentence in examples:
        sentences_list.append(sentence)
        levels_list.append(lvl)
        category_list.append(cat)

newdf["Level"] = levels_list
newdf["Category"] = category_list
newdf["Sentence"] = sentences_list

newdf.to_csv("data.csv", index=False)
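For intuition, this explodes each derived EGP row into one row per generated sentence. A hypothetical example (column values invented for illustration):

# Input row (EGP_Derivied.csv):
#   Level=A1, SuperCategory=Verbs, SubCategory=Tense,
#   Example="I eat rice.\nShe eats rice."
# Output rows (data.csv: Level, Category, Sentence):
#   A1, Verbs/Tense, I eat rice.
#   A1, Verbs/Tense, She eats rice.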
111
text-difficulty/grammar/training/model.py
Normal file
@@ -0,0 +1,111 @@
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Create a positional encoding matrix of shape (max_len, embedding_dim)
        pe = torch.zeros(max_len, embedding_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Add a batch dimension, so the shape becomes (1, max_len, embedding_dim)
        pe = pe.unsqueeze(0)

        # Register the positional encoding as a buffer so it won't be updated by the optimizer
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x is expected to have shape (batch_size, seq_length, embedding_dim)
        seq_length = x.size(1)
        # Add positional encoding to the input
        x = x + self.pe[:, :seq_length]
        return x

class SelfAttention(nn.Module):
    def __init__(self, input_dim, heads):
        super(SelfAttention, self).__init__()
        self.heads = heads
        self.scale = (input_dim // heads) ** -0.5  # 1/sqrt(head_dim)
        self.qkv = nn.Linear(input_dim, input_dim * 3)  # joint Q, K, V projection
        self.fc = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        batch_size, seq_length, embedding_dim = x.shape
        qkv = self.qkv(x).view(
            batch_size, seq_length, self.heads, 3, embedding_dim // self.heads
        )
        q, k, v = qkv[..., 0, :], qkv[..., 1, :], qkv[..., 2, :]
        # Rearrange to (batch_size, heads, seq_length, head_dim)
        q = q.permute(0, 2, 1, 3)
        k = k.permute(0, 2, 1, 3)
        v = v.permute(0, 2, 1, 3)

        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_weights = F.softmax(attn_weights, dim=-1)

        attention_output = torch.matmul(attn_weights, v)
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(batch_size, seq_length, embedding_dim)

        return self.fc(attention_output)

class AttentionBasedModel(nn.Module):
    def __init__(self, pos_vocab_size, embedding_dim=128, num_classes=6, heads=8, num_attention_layers=3, dim_feedforward=512, max_len=128):
        super(AttentionBasedModel, self).__init__()
        self.embedding = nn.Embedding(pos_vocab_size, embedding_dim)  # Embedding for POS tags
        self.positional_encoding = PositionalEncoding(embedding_dim, max_len)  # Positional encoding
        self.self_attention_layers = nn.ModuleList([
            SelfAttention(embedding_dim, heads) for _ in range(num_attention_layers)
        ])
        self.fc1 = nn.Linear(embedding_dim, dim_feedforward)
        self.fc2 = nn.Linear(dim_feedforward, num_classes)
        self.dropout = nn.Dropout(0.5)
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        # Input x is a tensor of POS-tag indices, shape: (batch_size, seq_length)
        x = self.embedding(x)  # Convert POS tags to embeddings, shape: (batch_size, seq_length, embedding_dim)

        # Add positional encoding to the embeddings
        x = self.positional_encoding(x)

        # Stack of self-attention layers with residual connections and LayerNorm
        for attn_layer in self.self_attention_layers:
            attn_output = attn_layer(x)
            x = self.norm(attn_output + x)

        # Pool the output by taking the mean over the sequence dimension
        pooled_output = torch.mean(x, dim=1)

        # Fully connected layers for classification
        x = F.relu(self.fc1(pooled_output))
        x = self.dropout(x)
        x = self.fc2(x)  # Output logits for the six classes

        return x

# Example usage
# # Hyperparameters
# pos_vocab_size = 50       # Size of the POS tag vocabulary
# max_context_length = 128  # Maximum context length
# embedding_dim = 128       # Embedding size
# num_classes = 6           # Output classes
# batch_size = 32           # Example batch size

# # Model initialization
# model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes)

# # Example input: batch of POS tag indices (padded to max_context_length)
# input_data = torch.randint(0, pos_vocab_size, (batch_size, max_context_length))  # Random input for testing

# # Forward pass
# output = model(input_data)  # Output shape will be (batch_size, num_classes)

# print(output.shape)  # Should print torch.Size([batch_size, num_classes])
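A runnable variant of the commented example, useful as a quick shape check if appended to model.py (uses the constructor defaults above, not the training hyperparameters):

if __name__ == "__main__":
    model = AttentionBasedModel(pos_vocab_size=50)
    dummy = torch.randint(0, 50, (4, 128))  # batch of 4 sequences of POS-tag indices
    out = model(dummy)
    assert out.shape == (4, 6)  # (batch_size, num_classes)
    print(out.shape)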
150
text-difficulty/grammar/training/train.py
Normal file
@@ -0,0 +1,150 @@
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Load the model classes
from model import AttentionBasedModel

# Load the data
df = pd.read_csv('data.csv')

# Step 1: Extract sentences and corresponding levels
sentences = df['Sentence'].values
levels = df['Level'].values

# Step 2: Tokenize and POS-tag each sentence
pos_tags = [nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in sentences]

# Step 3: Build the POS tag vocabulary
# Extract unique POS tags from the dataset
pos_vocab = set()
for tagged_sentence in pos_tags:
    for _, tag in tagged_sentence:
        pos_vocab.add(tag)

# Create a mapping from POS tag to index
pos2idx = {pos: idx for idx, pos in enumerate(pos_vocab)}
pos_vocab_size = len(pos2idx)

# Step 4: Encode sentences into POS tag indices
def encode_pos_tags(tagged_sentence):
    return [pos2idx[tag] for _, tag in tagged_sentence]

encoded_sentences = [encode_pos_tags(tagged_sentence) for tagged_sentence in pos_tags]

# Step 5: Encode levels (classes) into integers
le = LabelEncoder()
encoded_levels = le.fit_transform(levels)
num_classes = len(le.classes_)

# Save the class encoding mapping
class_mapping = dict(zip(le.transform(le.classes_), le.classes_))
torch.save(class_mapping, 'class_mapping.pt')

# Save the POS tag encoding mapping
torch.save(pos2idx, 'pos2idx.pt')

# Step 6: Pad sentences to a fixed length
max_length = 64

def pad_sequence(seq, max_len):
    # Pad with -1 up to max_len; truncate anything longer so batches stack cleanly
    return (seq + [-1] * (max_len - len(seq)))[:max_len]

padded_sentences = [pad_sequence(seq, max_length) for seq in encoded_sentences]

# Step 7: Create a PyTorch Dataset and DataLoader
class POSDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = torch.tensor(self.sentences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return sentence, label

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sentences, encoded_levels, test_size=0.2)

train_dataset = POSDataset(X_train, y_train)
val_dataset = POSDataset(X_val, y_val)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Step 8: Initialize the model, loss function, and optimizer
embedding_dim = 32
heads = 8
num_attention_layers = 6
dim_feedforward = 256
learning_rate = 0.003

model = AttentionBasedModel(pos_vocab_size, embedding_dim, num_classes, heads, num_attention_layers, dim_feedforward)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Step 9: Training loop
num_epochs = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)

step = 0

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for sentences, labels in train_loader:
        sentences, labels = sentences.to(device), labels.to(device)

        # Forward pass
        outputs = model(sentences)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        step += 1

        running_loss += loss.item()

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for sentences, labels in val_loader:
            sentences, labels = sentences.to(device), labels.to(device)

            outputs = model(sentences)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Print training and validation stats
    print(f'Epoch [{epoch+1}/{num_epochs}], Step {step}, Loss: {running_loss/len(train_loader):.4f}, '
          f'Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {100 * correct / total:.2f}%')
    torch.save(model.state_dict(), f'checkpoints/step_{step}.pt')

# Step 10: Save the trained model
torch.save(model.state_dict(), 'model.pt')
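The class_mapping saved above maps each integer class index back to its level label. A small sketch of how it can be used downstream (assumes logits shaped (batch, num_classes), as produced by the model):

import torch

def decode_levels(logits, mapping_path='class_mapping.pt'):
    # Map argmax class indices back to their level labels
    mapping = torch.load(mapping_path)  # {class_index: level_label}
    return [mapping[int(i)] for i in torch.argmax(logits, dim=1)]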
46
text-difficulty/grammar/validation/inference.py
Normal file
@@ -0,0 +1,46 @@
from training.model import AttentionBasedModel
import torch
import nltk

sentence = '''Smartphones have worked their way deep into our lives and have become indispensable for work and socialising.'''

pos2idx = torch.load('pos2idx.pt')
class_mapping = torch.load('class_mapping.pt')

def pad_sequence(seq, max_len):
    # Pad with -1 up to max_len; truncate anything longer
    return (seq + [-1] * (max_len - len(seq)))[:max_len]

def encode_pos_tags(tagged_sentence):
    # Map each POS tag to its index; unseen tags fall back to -1
    return [pos2idx[tag] if tag in pos2idx else -1 for _, tag in tagged_sentence]

max_length = 64

tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
encoded_sentences = encode_pos_tags(tagged_sentence)
padded_sentence = torch.tensor(pad_sequence(encoded_sentences, max_length))

device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
model = AttentionBasedModel(40, 32, 6, 8, 6, 256)
model.load_state_dict(torch.load("./model.pt", weights_only=True))
model.to(device)
model.eval()  # Ensure the model is in evaluation mode

w_list = [1.35, 1.63, 2.75, 3.64, 5.38, 6.32]
cefr_dict = [None, "A1", "A2", "B1", "B2", "C1", "C2"]
with torch.no_grad():
    sentence_tensor = padded_sentence.to(device)
    sentences = torch.unsqueeze(sentence_tensor, 0)  # Add batch dimension

    outputs = model(sentences)
    print(torch.max(outputs, 1))
    print(outputs[0])
    print(torch.softmax(outputs[0], 0))
    s = 0
    for i in range(6):
        s += torch.softmax(outputs[0], 0)[i] * w_list[i]
    s = s.cpu().numpy()
    # s is the final weighted difficulty score
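One way the otherwise-unused cefr_dict could turn the weighted score into a band label; the rounding rule here is an assumption for illustration, not part of the commit:

    # Assumed rule: clamp round(s) to 1..6 and index into cefr_dict (A1..C2)
    band = cefr_dict[min(6, max(1, round(float(s))))]
    print(band)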