PyTorch文本情绪分类实战:基于CSV数据的完整Demo
以下是一个完整的基于PyTorch的文本情绪分类示例代码。假设数据源是一个没有标题行的CSV文件,共两列:第一列为文本,第二列为标签;文本先转换为整数序列,再经过嵌入层(embedding layer)送入分类器:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
# Define the model
class SentimentClassifier(nn.Module):
    """Mean-pooled embedding classifier.

    Token indices are embedded, averaged over dimension 1, passed through a
    ReLU-activated linear layer and finally projected to ``output_dim`` scores.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            hidden_dim: Width of the hidden linear layer.
            output_dim: Number of output scores per sample.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return the raw (un-normalized) class scores for ``text``.

        Args:
            text: Integer tensor of token indices. The mean is taken over
                dimension 1, so a ``(batch, seq_len)`` layout is expected.

        Returns:
            Tensor of scores with ``output_dim`` as its last dimension.
        """
        # Average the token embeddings along dimension 1.
        pooled = self.embedding(text).mean(dim=1)
        activated = self.fc(pooled).relu()
        return self.output(activated)
# Load the data: a header-less CSV whose first column is read as the texts
# and whose second column is read as the labels.
data = pd.read_csv('data.csv', header=None)
texts, labels = (data.iloc[:, column].tolist() for column in (0, 1))
# Split the dataset: 20% is held out for testing; the fixed random_state
# keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)
# Preprocess the text data
def preprocess_text(text):
    """Turn one raw CSV cell into a string that can be whitespace-tokenized.

    Strings are returned unchanged. Cells that pandas parsed as something
    else are coerced, so that later ``text.split()`` calls cannot fail with
    ``AttributeError``.

    Args:
        text: A single value taken from the text column.

    Returns:
        ``text`` itself if it is a string, ``""`` if it is a missing value
        (``None`` / NaN), otherwise ``str(text)``.
    """
    # Placeholder for further preprocessing such as tokenization or
    # stop-word removal.
    if isinstance(text, str):
        return text
    # pandas represents an empty CSV cell as a float NaN.
    if pd.isna(text):
        return ""
    return str(text)
# Run the preprocessing step over every training and test text.
train_texts = list(map(preprocess_text, train_texts))
test_texts = list(map(preprocess_text, test_texts))
# Build the vocabulary from the training texts only.
# Index 0 is reserved for padding and index 1 for words that are not in the
# vocabulary, so real words are numbered from 2 upwards.
PAD_INDEX = 0
UNK_INDEX = 1
vocab = set()
for text in train_texts:
    vocab.update(text.split())
# Sorting makes the word -> index assignment reproducible across runs
# (the iteration order of a set of strings is not stable between processes).
word_to_index = {word: index for index, word in enumerate(sorted(vocab), start=2)}
# The embedding table must also have rows for the two reserved indices.
vocab_size = len(word_to_index) + 2


# Convert a text into an integer sequence
def text_to_sequence(text, vocab):
    """Encode a whitespace-separated text as a 1-D ``torch.long`` tensor.

    Args:
        text: The text to encode; it is tokenized with ``str.split()``.
        vocab: Either a dict mapping each word to its index, or any other
            iterable of words (e.g. a set). In the latter case indices are
            assigned in sorted order starting at 2; pass a prebuilt dict to
            avoid rebuilding that mapping on every call.

    Returns:
        A tensor of word indices. Words missing from ``vocab`` map to
        ``UNK_INDEX`` instead of raising ``ValueError``, and a text without
        any token yields a single ``UNK_INDEX`` so the result is never empty.
    """
    if isinstance(vocab, dict):
        lookup = vocab
    else:
        lookup = {word: index for index, word in enumerate(sorted(vocab), start=2)}
    sequence = [lookup.get(word, UNK_INDEX) for word in text.split()]
    if not sequence:
        sequence = [UNK_INDEX]
    return torch.tensor(sequence, dtype=torch.long)


def collate_batch(batch):
    """Pad the variable-length sequences of one batch to a common length.

    Args:
        batch: A list of ``(sequence_tensor, label)`` pairs.

    Returns:
        A ``(padded_sequences, labels)`` pair: a ``(batch, max_len)`` long
        tensor padded with ``PAD_INDEX`` and a 1-D long tensor of labels.
    """
    sequences, targets = zip(*batch)
    padded = nn.utils.rnn.pad_sequence(
        list(sequences), batch_first=True, padding_value=PAD_INDEX
    )
    return padded, torch.tensor(targets, dtype=torch.long)


train_sequences = [text_to_sequence(text, word_to_index) for text in train_texts]
test_sequences = [text_to_sequence(text, word_to_index) for text in test_texts]
# Create the DataLoaders. The default collate function cannot stack tensors
# of different lengths, hence the padding collate_fn.
# NOTE: the model averages embeddings over the whole padded length, so
# padded positions take part in that mean.
train_data = list(zip(train_sequences, train_labels))
test_data = list(zip(test_sequences, test_labels))
train_loader = torch.utils.data.DataLoader(
    train_data, batch_size=16, shuffle=True, collate_fn=collate_batch
)
test_loader = torch.utils.data.DataLoader(
    test_data, batch_size=16, shuffle=False, collate_fn=collate_batch
)
# Model hyperparameters
embedding_dim, hidden_dim, output_dim = 100, 256, 2
# Build the model, the loss function and the optimizer
model = SentimentClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Train the model, measuring test-set accuracy after every epoch
num_epochs = 10
for epoch in range(num_epochs):
    # Training pass: one optimizer step per batch.
    model.train()
    for batch_texts, batch_labels in train_loader:
        optimizer.zero_grad()
        logits = model(batch_texts)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

    # Evaluation pass: no gradients are tracked.
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch_texts, batch_labels in test_loader:
            # The predicted class is the index of the highest score.
            predicted = model(batch_texts).max(dim=1).indices
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
    accuracy = 100 * correct / total
    print(f'Epoch {epoch+1}/{num_epochs}, Accuracy: {accuracy}%')
请确保将数据文件命名为data.csv,并与代码文件放置在相同的目录中。由于模型输出维度为2并使用交叉熵损失,标签列应为整数类别编号(0或1)。此代码将训练一个基于PyTorch的文本情绪分类模型,并在每个epoch结束时打印出测试集的准确率。希望对您有所帮助!
原文地址: https://www.cveoy.top/t/topic/N1k 著作权归作者所有。请勿转载和采集!