A BERT-based training code example for eight-class text classification
Below is an example of training code for an eight-class classifier built on a BERT model:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
# Define the BERT classification model
class BertClassifier(nn.Module):
    def __init__(self, num_labels):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        # bert-base hidden size is 768; one output logit per class
        self.classifier = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # pooler_output: the [CLS] representation passed through a tanh layer
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
# Load the BERT model and tokenizer
model = BertClassifier(num_labels=8)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
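As a quick sanity check (illustrative only; the sample string is arbitrary), one tokenized sentence should yield one logit per class:

sample = tokenizer.encode_plus('hello world', return_tensors='pt')
with torch.no_grad():
    logits = model(sample['input_ids'], sample['attention_mask'])
print(logits.shape)  # torch.Size([1, 8])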
# Data preprocessing
def process_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()
    data = []
    labels = []
    for line in lines:
        line = line.strip()
        parts = line.split(',')
        # Join the 23 feature fields into a single string so the tokenizer
        # receives raw text rather than a pre-tokenized word list
        text = ' '.join(parts[:23])
        label = int(parts[-1])
        data.append(text)
        labels.append(label)
    return data, labels
train_data, train_labels = process_data('train.txt')
val_data, val_labels = process_data('val.txt')
test_data, test_labels = process_data('test.txt')
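process_data assumes each line contains 23 comma-separated feature fields followed by an integer label between 0 and 7. To smoke-test the pipeline without real data, a toy file in that layout (purely illustrative) can be generated like this:

import random
with open('train.txt', 'w') as f:
    for _ in range(32):
        fields = ['field%d' % random.randint(0, 99) for _ in range(23)]
        f.write(','.join(fields) + ',%d\n' % random.randint(0, 7))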
# Convert the inputs into the format the BERT model expects
def convert_data(data, labels):
    input_ids = []
    attention_masks = []
    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',  # pad_to_max_length=True is deprecated
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels
train_input_ids, train_attention_masks, train_labels = convert_data(train_data, train_labels)
val_input_ids, val_attention_masks, val_labels = convert_data(val_data, val_labels)
test_input_ids, test_attention_masks, test_labels = convert_data(test_data, test_labels)
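At this point each split is a trio of tensors; the shapes are worth confirming before training (N is the number of lines in the corresponding file, 128 is the max_length used above):

print(train_input_ids.shape)        # torch.Size([N, 128])
print(train_attention_masks.shape)  # torch.Size([N, 128])
print(train_labels.shape)           # torch.Size([N])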
# Define the training and evaluation function
def train(model, train_input_ids, train_attention_masks, train_labels, val_input_ids, val_attention_masks, val_labels):
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    train_input_ids = train_input_ids.to(device)
    train_attention_masks = train_attention_masks.to(device)
    train_labels = train_labels.to(device)
    val_input_ids = val_input_ids.to(device)
    val_attention_masks = val_attention_masks.to(device)
    val_labels = val_labels.to(device)
    for epoch in range(5):
        # One full-batch gradient step per epoch (see the mini-batch sketch below)
        model.train()
        optimizer.zero_grad()
        train_logits = model(train_input_ids, train_attention_masks)
        loss = criterion(train_logits, train_labels)
        loss.backward()
        optimizer.step()
        # Evaluate without building a computation graph
        model.eval()
        with torch.no_grad():
            val_logits = model(val_input_ids, val_attention_masks)
            val_loss = criterion(val_logits, val_labels)
        print('Epoch:', epoch+1, 'Train Loss:', loss.item(), 'Val Loss:', val_loss.item())
# Train the model
train(model, train_input_ids, train_attention_masks, train_labels, val_input_ids, val_attention_masks, val_labels)
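The train function above takes a single gradient step over the entire training set per epoch, which only fits in memory for very small datasets. For anything realistic, a mini-batch loop is standard; a minimal sketch using torch.utils.data (the batch size of 16 is an arbitrary choice):

from torch.utils.data import TensorDataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
for epoch in range(5):
    model.train()
    for batch_ids, batch_masks, batch_labels in train_loader:
        optimizer.zero_grad()
        logits = model(batch_ids.to(device), batch_masks.to(device))
        loss = criterion(logits, batch_labels.to(device))
        loss.backward()
        optimizer.step()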
# Test function
def test(model, test_input_ids, test_attention_masks, test_labels):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    test_input_ids = test_input_ids.to(device)
    test_attention_masks = test_attention_masks.to(device)
    test_labels = test_labels.to(device)
    model.eval()
    with torch.no_grad():
        test_logits = model(test_input_ids, test_attention_masks)
    # Predicted class = index of the largest logit
    _, predicted_labels = torch.max(test_logits, 1)
    correct = (predicted_labels == test_labels).sum().item()
    accuracy = correct / len(test_labels)
    print('Test Accuracy:', accuracy)
# Test the model
test(model, test_input_ids, test_attention_masks, test_labels)
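To keep the fine-tuned weights for later inference, the usual state_dict round trip works (the file name here is arbitrary):

torch.save(model.state_dict(), 'bert_classifier.pt')
# Later, to reload:
model = BertClassifier(num_labels=8)
model.load_state_dict(torch.load('bert_classifier.pt'))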
Before running the code, make sure the torch and transformers libraries are installed (e.g. pip install torch transformers); the bert-base-uncased pretrained weights are downloaded automatically the first time from_pretrained is called. The training, validation, and test sets must be saved as train.txt, val.txt, and test.txt, respectively.