Building a DNN in Python to predict patient disease status from gene expression data
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt.plots import plot_convergence, plot_objective
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
Read the data

data = pd.read_excel(r'C:\Users\lenovo\Desktop\HIV\DNN神经网络测试\data1.xlsx')  # raw string: plain backslashes in a Windows path are treated as escapes ('\U...' is a SyntaxError)
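Before splitting, a quick optional check that the table loaded as expected; this assumes, as the split below does, that the first column holds the 0/1 disease label and the remaining columns hold expression values:

print(data.shape)                      # (n_samples, 1 + n_genes)
print(data.iloc[:, 0].value_counts())  # class balance of the label column
print(data.isna().sum().sum())         # total missing values; impute or drop rows if nonzero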
Split the data into training and test sets

feature_cols = data.columns[1:]
target_col = data.columns[0]
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols], data[target_col], test_size=0.2, random_state=42
)
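Gene expression cohorts are often class-imbalanced; if that is the case here, a stratified split keeps the case/control ratio consistent between the two sets. A sketch of that variant:

# Stratified variant: preserves the label ratio in both splits
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols], data[target_col],
    test_size=0.2, random_state=42, stratify=data[target_col]
)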
Define the dataset class

class GeneDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values).float()
        self.y = torch.tensor(y.values).float()

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
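A minimal smoke test of the Dataset/DataLoader plumbing (the batch size of 4 is arbitrary):

check_loader = DataLoader(GeneDataset(X_train, y_train), batch_size=4)
Xb, yb = next(iter(check_loader))
print(Xb.shape, yb.shape)  # expected: torch.Size([4, n_features]) torch.Size([4])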
Define the attention layer

class Attention(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.linear1 = nn.Linear(input_dim, hidden_dim, bias=False)
        # Produce one weight per input feature. (Mapping to a single scalar, as the
        # original did, makes the softmax over dim=1 return a constant 1.0, so the
        # attention would be a no-op.)
        self.linear2 = nn.Linear(hidden_dim, input_dim, bias=False)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # x: (batch, input_dim) -> weights of the same shape, summing to 1 per sample
        x = self.linear1(x)
        x = self.tanh(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x
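With per-feature weights, each sample's weights should sum to 1; a quick check on random input (the dimensions are illustrative):

attn = Attention(input_dim=8, hidden_dim=4)
w = attn(torch.randn(2, 8))
print(w.shape)       # torch.Size([2, 8])
print(w.sum(dim=1))  # tensor([1., 1.]) up to floating-point error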
Define the neural network

class DNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.attention = Attention(input_dim, hidden_dim)
        self.linears = nn.ModuleList(
            [nn.Linear(input_dim, hidden_dim) if i == 0 else nn.Linear(hidden_dim, hidden_dim)
             for i in range(num_layers)]
        )
        self.out = nn.Linear(hidden_dim, output_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        attn_weights = self.attention(x)
        x = x * attn_weights  # reweight the input features by their attention scores
        for linear in self.linears:
            x = linear(x)
            x = self.activation(x)
        x = self.out(x)  # raw logits; BCEWithLogitsLoss applies the sigmoid itself
        return x
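A forward-pass check that the network returns one logit per sample (dimensions are illustrative):

net = DNN(input_dim=8, hidden_dim=16, output_dim=1, num_layers=3)
logits = net(torch.randn(5, 8))
print(logits.shape)  # torch.Size([5, 1])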
Define the training function

def train(model, optimizer, criterion, dataloader):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    for X, y in dataloader:
        optimizer.zero_grad()
        y_pred = model(X)
        loss = criterion(y_pred, y.unsqueeze(1))
        # The model outputs logits, so threshold the sigmoid at 0.5
        # (thresholding the raw logits at 0.5 would skew the accuracy)
        acc = ((torch.sigmoid(y_pred) > 0.5).float() == y.unsqueeze(1)).float().mean().item()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc
    epoch_loss /= len(dataloader)
    epoch_acc /= len(dataloader)
    return epoch_loss, epoch_acc
Define the test function

def test(model, criterion, dataloader):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for X, y in dataloader:
            y_pred = model(X)
            loss = criterion(y_pred, y.unsqueeze(1))
            acc = ((torch.sigmoid(y_pred) > 0.5).float() == y.unsqueeze(1)).float().mean().item()
            epoch_loss += loss.item()
            epoch_acc += acc
    epoch_loss /= len(dataloader)
    epoch_acc /= len(dataloader)
    return epoch_loss, epoch_acc
Define the objective function for Bayesian optimization

space = [
    Real(1e-5, 1e-2, prior='log-uniform', name='lr'),
    Integer(16, 64, name='hidden_dim'),
    Integer(3, 5, name='num_layers'),
]

@use_named_args(space)
def objective(lr, hidden_dim, num_layers):
    # Build the model; the input dimension must match the number of feature columns
    dnn = DNN(X_train.shape[1], int(hidden_dim), 1, int(num_layers))
    # Define the loss function and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(dnn.parameters(), lr=lr)
    # Build the dataset and data loader
    train_dataset = GeneDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    # Train the model
    for epoch in range(50):
        train_loss, train_acc = train(dnn, optimizer, criterion, train_loader)
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
    # Evaluate on the held-out set (note: tuning against the test set biases the
    # final estimate; with enough samples, a separate validation split is preferable)
    test_dataset = GeneDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
    test_loss, test_acc = test(dnn, criterion, test_loader)
    print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
    # Return negative accuracy, since gp_minimize minimizes the objective
    return -test_acc
Run the Bayesian optimization

# Pass the same named space; the original passed plain tuples, which silently
# dropped the log-uniform prior on the learning rate
res = gp_minimize(objective, space, n_calls=20, n_random_starts=5)
Print the best result and parameters

print(f'Best Accuracy: {-res.fun:.4f}')
print(f'Best Parameters: {res.x}')
Plot the convergence curve

plot_convergence(res)
plt.show()
Plot the objective function (partial dependence on each hyperparameter)

plot_objective(res)
plt.show()
Train the final model with the best parameters

lr, hidden_dim, num_layers = res.x
dnn = DNN(X_train.shape[1], int(hidden_dim), 1, int(num_layers))
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(dnn.parameters(), lr=lr)
train_dataset = GeneDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
for epoch in range(50):
    train_loss, train_acc = train(dnn, optimizer, criterion, train_loader)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}')
test_dataset = GeneDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
test_loss, test_acc = test(dnn, criterion, test_loader)
print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}')
Plot the ROC curve

dnn.eval()  # switch to inference mode and disable gradient tracking before scoring
with torch.no_grad():
    y_prob = torch.sigmoid(dnn(torch.tensor(X_test.values).float())).numpy().ravel()
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
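If the tuned model will be reused, the standard PyTorch pattern is to save its state dict; the filename here is only a placeholder:

torch.save(dnn.state_dict(), 'dnn_best.pt')  # placeholder filename
# To reload later: rebuild DNN with the same hyperparameters, then
#   dnn.load_state_dict(torch.load('dnn_best.pt')); dnn.eval()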