Building and Training Deep-Learning Models with Bayesian Optimization - HIV Gene Analysis
This code uses Bayesian optimization to tune the hyperparameters of two deep-learning models: the first predicts HIV gene expression levels, and the second takes the first model's outputs as input to predict HIV infection status. The code is built with the PyTorch framework and covers dataset preprocessing, model definition, training, and evaluation.
Data Preparation and Preprocessing
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from bayes_opt import BayesianOptimization
# Read the Excel file and standardize the features.
# The raw-string prefix keeps the Windows backslashes from being interpreted as escape sequences.
data = pd.read_excel(r'C:\Users\lenovo\Desktop\HIV\GSE6740GSE50011基因降低\output_data.xlsx')
X = data.iloc[:, 1:].values  # all columns except the first: gene expression features
y = data.iloc[:, 0].values   # first column: class labels
scaler = StandardScaler()
X = scaler.fit_transform(X)
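For orientation, here is a minimal, purely hypothetical example of the layout that output_data.xlsx is assumed to have (first column = label, remaining columns = gene expression features), processed with the same slicing and scaling as above:

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Hypothetical miniature table mirroring the assumed spreadsheet layout.
demo = pd.DataFrame({
    'label':  [0, 1, 0, 1],
    'gene_1': [5.1, 4.9, 6.2, 5.5],
    'gene_2': [7.8, 6.5, 7.1, 6.9],
})
X_demo = demo.iloc[:, 1:].values                 # feature matrix (all columns except the first)
y_demo = demo.iloc[:, 0].values                  # label vector (first column)
X_demo = StandardScaler().fit_transform(X_demo)  # per-column zero mean, unit variance
print(X_demo.shape, y_demo)                      # (4, 2) [0 1 0 1]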
Defining the Dataset Class
# Define a dataset class that inherits from torch.utils.data.Dataset and implements the __getitem__ and __len__ methods.
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __getitem__(self, index):
        return torch.tensor(self.X[index]).float(), torch.tensor(self.y[index]).long()

    def __len__(self):
        return len(self.X)
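A quick sanity check of the wrapper, using the X and y prepared above (shapes and dtypes will depend on the actual spreadsheet):

ds = MyDataset(X, y)
sample_x, sample_y = ds[0]
print(len(ds), sample_x.shape, sample_x.dtype, sample_y.dtype)  # e.g. N torch.Size([n_features]) torch.float32 torch.int64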
Defining the First Model
# Define the first model; its hyperparameters will be tuned with Bayesian optimization.
class Model1(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.5):
        super(Model1, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc4(x)
        return x
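A shape check of the untrained network, not part of the training flow; hidden_size=128 here is an arbitrary placeholder value:

demo_model = Model1(input_size=X.shape[1], hidden_size=128, output_size=4)
demo_batch = torch.tensor(X[:8]).float()  # a small batch of scaled feature vectors
print(demo_model(demo_batch).shape)       # torch.Size([8, 4]): one logit per class for each sample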
def train_model1(hidden_size, dropout):
    # Objective for Bayesian optimization: train Model1 and return the final-epoch training accuracy.
    model = Model1(input_size=X.shape[1], hidden_size=int(hidden_size), output_size=4, dropout=dropout)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
    dataset = MyDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    model.train()
    for epoch in range(100):
        running_loss = 0.0
        running_corrects = 0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == labels.data)
        scheduler.step(running_loss)
        epoch_loss = running_loss / len(dataset)
        epoch_acc = running_corrects.double() / len(dataset)
        print('Epoch1.0 {}/{} - Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, 100, epoch_loss, epoch_acc))
    return epoch_acc.item()
Hyperparameter Optimization
# Run Bayesian optimization over the first model's hyperparameters.
pbounds = {'hidden_size': (100, 500), 'dropout': (0.3, 0.7)}
optimizer = BayesianOptimization(
    f=train_model1,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)
optimizer.maximize(n_iter=10)
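Once maximize() returns, the bayes_opt API exposes the best evaluation as optimizer.max and the full search history as optimizer.res; a short sketch for inspecting them:

# Best target value (the returned accuracy) and the hyperparameters that produced it.
print(optimizer.max)  # {'target': ..., 'params': {'dropout': ..., 'hidden_size': ...}}
# Every evaluation made during the search, in order.
for i, res in enumerate(optimizer.res):
    print(i, res['target'], res['params'])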
Training the First Model
# Retrieve the best hyperparameters and use them to train the first model.
hidden_size = int(optimizer.max['params']['hidden_size'])
dropout = optimizer.max['params']['dropout']
model1 = Model1(input_size=X.shape[1], hidden_size=hidden_size, output_size=4, dropout=dropout)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model1.parameters(), lr=0.001)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
dataset = MyDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
model1.train()
for epoch in range(100):
    running_loss = 0.0
    running_corrects = 0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model1(inputs)
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(preds == labels.data)
    scheduler.step(running_loss)
    epoch_loss = running_loss / len(dataset)
    epoch_acc = running_corrects.double() / len(dataset)
    print('Epoch1.1 {}/{} - Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, 100, epoch_loss, epoch_acc))
Defining the Second Model
# Define the second model; its hyperparameters will be tuned with Bayesian optimization.
class Model2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout=0.5):
        super(Model2, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc4(x)
        return x
def train_model2(hidden_size, dropout):
    # Objective for Bayesian optimization: train Model2 on the first model's outputs
    # (the global model1_outputs computed below, before this function is first called)
    # and return the final-epoch training accuracy.
    model2 = Model2(input_size=4, hidden_size=int(hidden_size), output_size=1, dropout=dropout)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
    dataset = MyDataset(model1_outputs, y)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    model2.train()
    for epoch in range(100):
        running_loss = 0.0
        running_corrects = 0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model2(inputs)
            loss = criterion(outputs, labels.float().unsqueeze(1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(torch.round(torch.sigmoid(outputs)) == labels.unsqueeze(1).float())
        scheduler.step(running_loss)
        epoch_loss = running_loss / len(dataset)
        epoch_acc = running_corrects.double() / len(dataset)
        print('Epoch2.0 {}/{} - Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, 100, epoch_loss, epoch_acc))
    return epoch_acc.item()
Hyperparameter Optimization for the Second Model
# Use the first model's outputs as the input to the second model, and run Bayesian optimization over the second model's hyperparameters.
# Compute the first model's outputs for every sample in the original order
# (iterating the shuffled training dataloader would misalign them with the labels y).
model1.eval()
with torch.no_grad():
    model1_outputs = model1(torch.tensor(X).float()).numpy()
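A quick check of the intermediate representation that feeds the second model (no assumptions beyond the code above):

print(model1_outputs.shape)  # (n_samples, 4): one 4-dimensional logit vector per sample, aligned with y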
pbounds = {'hidden_size': (100, 500), 'dropout': (0.3, 0.7)}
optimizer = BayesianOptimization(
    f=train_model2,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)
optimizer.maximize(n_iter=10)
Training the Second Model
# Retrieve the best hyperparameters and use them to train the second model.
hidden_size = int(optimizer.max['params']['hidden_size'])
dropout = optimizer.max['params']['dropout']
model2 = Model2(input_size=4, hidden_size=hidden_size, output_size=1, dropout=dropout)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)
dataset = MyDataset(model1_outputs, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
model2.train()
for epoch in range(100):
    running_loss = 0.0
    running_corrects = 0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model2(inputs)
        loss = criterion(outputs, labels.float().unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        running_corrects += torch.sum(torch.round(torch.sigmoid(outputs)) == labels.unsqueeze(1).float())
    scheduler.step(running_loss)
    epoch_loss = running_loss / len(dataset)
    epoch_acc = running_corrects.double() / len(dataset)
    print('Epoch2.1 {}/{} - Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, 100, epoch_loss, epoch_acc))
Evaluating the Models and Outputting the Results
# Finally, run the stacked pipeline on the full dataset and print the results.
# X is already standardized above, so it is used directly
# (re-applying scaler.transform here would scale the data twice).
inputs = torch.tensor(X).float()
model1.eval()
model2.eval()
with torch.no_grad():
    outputs1 = model1(inputs)
    outputs2 = model2(outputs1)
    outputs = torch.round(torch.sigmoid(outputs2)).squeeze().numpy()
    loss = nn.BCEWithLogitsLoss()(outputs2, torch.tensor(y).float().unsqueeze(1)).item()
accuracy = np.sum(outputs == y) / len(y)
print('Accuracy: {:.4f} Loss: {:.4f}'.format(accuracy, loss))
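The original script does not persist the trained networks; if they need to be reused, a minimal sketch using PyTorch's standard state_dict mechanism (the file names are arbitrary choices here):

# Save the trained weights.
torch.save(model1.state_dict(), 'model1.pt')
torch.save(model2.state_dict(), 'model2.pt')

# To reload later, re-create each architecture with the hyperparameters it was trained with,
# then call load_state_dict, e.g.:
# model1_reloaded = Model1(input_size=X.shape[1], hidden_size=..., output_size=4, dropout=...)
# model1_reloaded.load_state_dict(torch.load('model1.pt'))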
Code Notes
The line print('Epoch1.0 {}/{} - Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, 100, epoch_loss, epoch_acc)) is executed 14 * 100 times because:
- It sits inside the first model's training loop, which runs for 100 epochs.
- At the end of every epoch it prints the current epoch number, the total number of epochs, the loss, and the accuracy.
- Bayesian optimization calls the objective function train_model1 once per evaluation (the initial random probes plus the n_iter optimization steps), 14 times in this run, so the training loop is executed 14 times and the line is printed 14 * 100 = 1400 times in total.