Sentiment Analysis with Attention-Based LSTM in PyTorch
import copy
import torch
from torch import nn
from torch import optim
# Note: this uses the legacy torchtext Field/Iterator API (moved to torchtext.legacy in later releases)
import torchtext
from torchtext import data
from torchtext import datasets
TEXT = data.Field(sequential=True, batch_first=True, lower=True)
LABEL = data.LabelField()
# load data splits
train_data, val_data, test_data = datasets.SST.splits(TEXT, LABEL)
# build the vocabulary; word vectors are initialized from pretrained GloVe embeddings
TEXT.build_vocab(train_data, vectors='glove.840B.300d', unk_init=torch.Tensor.normal_)
# TEXT.build_vocab(train_data)  # alternative: build the vocabulary without pretrained vectors
LABEL.build_vocab(train_data)
# hyperparameters
vocab_size = len(TEXT.vocab)
label_size = len(LABEL.vocab)
padding_idx = TEXT.vocab.stoi['<pad>']
embedding_dim = 300
hidden_dim = 128
# build iterators
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=32)
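# Optional check (illustrative sketch, not part of the original script): peek at one
# batch to confirm the [batch_size, seq_len] layout produced by batch_first=True.
_batch = next(iter(train_iter))
print(_batch.text.shape, _batch.label.shape)   # e.g. torch.Size([32, seq_len]) and torch.Size([32])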
# device setup
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
# Training function
def train(model, train_loader, optimizer, criterion):
    model.train()
    total_loss = 0.0
    total_correct = 0
    for batch in train_loader:
        text, labels = batch.text.to(device), batch.label.to(device)
        optimizer.zero_grad()
        logits, attention_weights = model(text)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * text.size(0)
        preds = logits.argmax(dim=1)
        total_correct += (preds == labels).sum().item()
    avg_loss = total_loss / len(train_loader.dataset)
    accuracy = total_correct / len(train_loader.dataset)
    return avg_loss, accuracy
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, labels = batch.text.to(device), batch.label.to(device)
            predictions, _ = model(text)
            loss = criterion(predictions, labels)
            acc = accuracy(predictions, labels)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def accuracy(predictions, labels):
    _, predicted_labels = torch.max(predictions, 1)
    correct = (predicted_labels == labels).float()
    return correct.sum() / len(correct)
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attention_weights = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_output):
        # lstm_output: [batch, seq_len, hidden_dim]
        attention_scores = self.attention_weights(lstm_output).squeeze(2)   # [batch, seq_len]
        attention_weights = torch.softmax(attention_scores, dim=1)          # [batch, seq_len]
        # weighted sum of LSTM outputs -> context vector of shape [batch, hidden_dim]
        attention_output = torch.bmm(lstm_output.transpose(1, 2),
                                     attention_weights.unsqueeze(2)).squeeze(2)
        return attention_output, attention_weights
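# Quick shape check for the Attention module (illustrative sketch, not part of the
# original training script): a [batch, seq_len, hidden] input should yield a
# [batch, hidden] context vector and [batch, seq_len] weights that sum to 1 per example.
_attn = Attention(hidden_dim=8)
_context, _weights = _attn(torch.randn(2, 5, 8))    # batch=2, seq_len=5, hidden=8
print(_context.shape, _weights.shape)               # torch.Size([2, 8]) torch.Size([2, 5])
print(_weights.sum(dim=1))                          # each row sums to 1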
class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, label_size, padding_idx):
        super(RNNClassifier, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.label_size = label_size
        self.num_layers = 2
        self.dropout_num = 0.5
        self.bidirectional = True
        num_directions = 2 if self.bidirectional else 1
        # Embedding layer, initialized with the pretrained GloVe vectors and frozen
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.embedding.weight.data.copy_(TEXT.vocab.vectors)
        self.embedding.weight.requires_grad = False
        # self.embedding = nn.Embedding.from_pretrained(pretrained_vectors.vectors, padding_idx=padding_idx)
        self.embedding_dropout = nn.Dropout(self.dropout_num)  # embedding dropout layer
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=self.num_layers,
                            batch_first=True, bidirectional=self.bidirectional)
        self.lstm_dropout = nn.Dropout(self.dropout_num)
        # Attention layer over the LSTM outputs
        self.attention = Attention(hidden_dim * num_directions)
        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim * num_directions, label_size)
        self.fc_dropout = nn.Dropout(self.dropout_num)
        # Defined but not used in forward(): nn.CrossEntropyLoss expects raw logits
        self.softmax = nn.LogSoftmax(dim=1)

    def zero_state(self, batch_size):
        num_directions = 2 if self.bidirectional else 1
        hidden = torch.zeros(self.num_layers * num_directions, batch_size, self.hidden_dim).to(device)
        cell = torch.zeros(self.num_layers * num_directions, batch_size, self.hidden_dim).to(device)
        return hidden, cell

    def forward(self, text):
        emb = self.embedding(text)                    # [batch, seq_len, embedding_dim]
        emb = self.embedding_dropout(emb)
        h0, c0 = self.zero_state(text.size(0))
        output, (hn, cn) = self.lstm(emb, (h0, c0))   # [batch, seq_len, hidden_dim * num_directions]
        output = self.lstm_dropout(output)
        attention_output, attention_weights = self.attention(output)
        output = self.fc(attention_output)            # [batch, label_size]
        output = self.fc_dropout(output)              # dropout applied to the logits
        return output, attention_weights
model = RNNClassifier(vocab_size, embedding_dim, hidden_dim, label_size, padding_idx).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
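# Optional sanity check (illustrative sketch, not part of the original script): run a
# dummy batch of random token ids through the model to confirm the output shapes.
with torch.no_grad():
    _dummy = torch.randint(0, vocab_size, (4, 10), device=device)   # [batch=4, seq_len=10]
    _logits, _attn = model(_dummy)
    assert _logits.shape == (4, label_size)   # one score per class
    assert _attn.shape == (4, 10)             # one attention weight per token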
num_epochs = 20
# ReduceLROnPlateau scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=2, verbose=True)
train_losses = []
val_losses = []
train_accs = []
val_accs = []
def model_running(model, train_iter, val_iter, optimizer, criterion):
    best_val_acc = 0.0
    best_model = copy.deepcopy(model.state_dict())
    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = train(model, train_iter, optimizer, criterion)
        val_loss, val_acc = evaluate(model, val_iter, criterion)
        # Print progress
        print('Epoch [{}/{}], Train Loss: {:.4f}, Val Loss: {:.4f}, Train Acc: {:.4f}, Val Acc: {:.4f}'
              .format(epoch, num_epochs, train_loss, val_loss, train_acc, val_acc))
        # Save loss and accuracy for later inspection
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accs.append(train_acc)
        val_accs.append(val_acc)
        # Update learning rate using the scheduler (monitors validation accuracy)
        scheduler.step(val_acc)
        # Keep a copy of the best-performing weights
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = copy.deepcopy(model.state_dict())
    return best_model
best_model = model_running(model, train_iter, val_iter, optimizer, criterion)
model.load_state_dict(best_model)
test_loss, test_acc = evaluate(model, test_iter, criterion)
print(f'Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}')
The training output is as follows:
Epoch [1/20], Train Loss: 0.9971, Val Loss: 0.8510, Train Acc: 0.5139, Val Acc: 0.6416
Epoch [2/20], Train Loss: 0.9242, Val Loss: 0.8571, Train Acc: 0.5856, Val Acc: 0.6407
Epoch [3/20], Train Loss: 0.9038, Val Loss: 0.8233, Train Acc: 0.5981, Val Acc: 0.6497
Epoch [4/20], Train Loss: 0.8912, Val Loss: 0.7961, Train Acc: 0.6001, Val Acc: 0.6639
Epoch [5/20], Train Loss: 0.8726, Val Loss: 0.7914, Train Acc: 0.6125, Val Acc: 0.6675
Epoch [6/20], Train Loss: 0.8601, Val Loss: 0.7693, Train Acc: 0.6163, Val Acc: 0.6782
Epoch [7/20], Train Loss: 0.8531, Val Loss: 0.7969, Train Acc: 0.6201, Val Acc: 0.6559
Epoch [8/20], Train Loss: 0.8400, Val Loss: 0.7654, Train Acc: 0.6256, Val Acc: 0.6684
Epoch [9/20], Train Loss: 0.8208, Val Loss: 0.7518, Train Acc: 0.6293, Val Acc: 0.6720
Epoch 00009: reducing learning rate of group 0 to 1.0000e-04.
Epoch [10/20], Train Loss: 0.7973, Val Loss: 0.7509, Train Acc: 0.6458, Val Acc: 0.6755
Epoch [11/20], Train Loss: 0.7875, Val Loss: 0.7547, Train Acc: 0.6532, Val Acc: 0.6764
Epoch [12/20], Train Loss: 0.7768, Val Loss: 0.7470, Train Acc: 0.6601, Val Acc: 0.6773
Epoch 00012: reducing learning rate of group 0 to 1.0000e-05.
Epoch [13/20], Train Loss: 0.7793, Val Loss: 0.7485, Train Acc: 0.6573, Val Acc: 0.6782
Epoch [14/20], Train Loss: 0.7747, Val Loss: 0.7477, Train Acc: 0.6575, Val Acc: 0.6773
Epoch [15/20], Train Loss: 0.7817, Val Loss: 0.7487, Train Acc: 0.6467, Val Acc: 0.6809
Epoch [16/20], Train Loss: 0.7787, Val Loss: 0.7486, Train Acc: 0.6555, Val Acc: 0.6818
Epoch [17/20], Train Loss: 0.7773, Val Loss: 0.7489, Train Acc: 0.6560, Val Acc: 0.6818
Epoch [18/20], Train Loss: 0.7804, Val Loss: 0.7475, Train Acc: 0.6567, Val Acc: 0.6800
Epoch [19/20], Train Loss: 0.7773, Val Loss: 0.7480, Train Acc: 0.6579, Val Acc: 0.6809
Epoch 00019: reducing learning rate of group 0 to 1.0000e-06.
Epoch [20/20], Train Loss: 0.7704, Val Loss: 0.7482, Train Acc: 0.6615, Val Acc: 0.6809
Test loss: 0.6876, Test accuracy: 0.7134
Based on the training log above, together with the code, the following optimization suggestions can be made:
1. Increase the number of training epochs: the log shows that validation accuracy improves only slowly within the current 20 epochs, so giving the model more training time may help.
2. Try a larger initial learning rate: the log shows the scheduler reducing the learning rate several times during training, which may mean the model cannot learn enough early on; a larger initial learning rate could speed up convergence.
3. Try other optimization algorithms: the current optimizer is Adam; alternatives such as SGD or RMSprop may be worth comparing.
4. Adjust the model architecture: increasing the number of LSTM layers or hidden units adds capacity and may improve the model's expressive power.
5. Use a larger batch size: the current batch size is 32; increasing it can speed up training.
6. Reconsider the pretrained word vectors: the current glove.840B.300d is already the largest publicly released GloVe model (glove.42B.300d and glove.6B.300d are smaller), so swapping vectors is unlikely to increase coverage; unfreezing the embedding layer (requires_grad=True) so the vectors can be fine-tuned on the task may be a more promising change.
In summary, these are only starting points; which optimizations actually help has to be determined by experiment on the task at hand. A minimal sketch applying suggestions 2-5 follows the list.
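As a concrete starting point, the sketch below applies suggestions 2-5 in code. It is only an illustration: the optimizer choice (SGD with momentum), the learning rate of 0.01, the hidden size of 256, and the batch size of 64 are assumed values rather than tuned settings, and the sketch reuses the RNNClassifier and data splits defined above.

# Illustrative application of suggestions 2-5 (all values are assumptions, not tuned):
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=64)                                    # suggestion 5: larger batches

model = RNNClassifier(vocab_size, embedding_dim,
                      hidden_dim=256,                 # suggestion 4: more hidden units
                      label_size=label_size,
                      padding_idx=padding_idx).to(device)

optimizer = optim.SGD(model.parameters(), lr=0.01,    # suggestions 2-3: SGD with a larger
                      momentum=0.9)                   # initial learning rate than Adam's 0.001
criterion = nn.CrossEntropyLoss()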