A DNN Model for Predicting Patient Disease Status from Gene Expression Levels
This project uses Python and the PyTorch framework to build a deep neural network that predicts whether a patient is diseased from gene expression levels. The model consists of two sub-networks, one for multi-class classification and one for binary classification, and its hyperparameters are tuned with Bayesian optimisation to improve prediction accuracy.
Import the required libraries
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.utils.data import Dataset, DataLoader
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args
Define the dataset class
class GeneDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data.iloc[idx, 1:].values.astype(float)  # gene expression values
        y = self.data.iloc[idx, 0]                        # patient status label
        if self.transform:
            x = self.transform(x)
        return x, y
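As a quick sanity check (a hypothetical toy frame: the status label in column 0, expression values in the remaining columns), the dataset can be exercised like this:

# Hypothetical smoke test: 3 samples, 5 genes, status label in column 0.
toy = pd.DataFrame(np.hstack([np.array([[0.], [1.], [2.]]),
                              np.random.rand(3, 5)]))
toy_dataset = GeneDataset(toy)
x0, y0 = toy_dataset[0]
print(x0.shape, y0)   # (5,) 0.0 -- one expression vector plus its label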
Define the neural network models
Model 1: multi-class classifier
class Model1(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(Model1, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
Model 2: binary classifier
class Model2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(Model2, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
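Before wiring the two networks together, a minimal shape check (hypothetical sizes: 71 genes in, 4 intermediate classes) shows how they chain:

m1 = Model1(input_size=71, hidden_size=32, output_size=4, dropout_rate=0.2)
m2 = Model2(input_size=4, hidden_size=16, output_size=1, dropout_rate=0.2)
x = torch.randn(8, 71)            # a batch of 8 expression vectors
logits = m1(x)                    # -> (8, 4) class scores
prob = m2(logits)                 # -> (8, 1) disease probability in (0, 1)
print(logits.shape, prob.shape)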
Define the data preprocessing function
def transform(x):
    # Standardise one gene (one column) across all samples: reshape to
    # (n_samples, 1) so StandardScaler normalises along the sample axis.
    scaler = StandardScaler()
    x = scaler.fit_transform(np.asarray(x, dtype=float).reshape(-1, 1))
    return x.reshape(-1)
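A quick check on a hypothetical gene column confirms the per-gene standardisation (mean 0, unit variance across samples):

col = np.array([2.0, 4.0, 6.0, 8.0])      # one gene measured in 4 samples
z = transform(col)
print(z)                                   # approx. [-1.34 -0.45  0.45  1.34]
print(z.mean(), z.std())                   # approx. 0.0 1.0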
Define the model training and testing functions
Training and testing function for Model 1
def train_and_test_model1(model, dataloader, criterion, optimizer):
    total_loss = 0
    total_correct = 0
    total_samples = 0
    model.train()
    for i, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size
        total_loss += loss.item() * labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
    train_loss = total_loss / total_samples
    train_acc = total_correct / total_samples
    return train_loss, train_acc
Training and testing function for Model 2
def train_and_test_model2(model, dataloader, criterion, optimizer):
    total_loss = 0
    total_correct = 0
    total_samples = 0
    model.train()
    for i, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean, so weight it by the batch size
        total_loss += loss.item() * labels.size(0)
        predicted = torch.round(outputs.data)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
    train_loss = total_loss / total_samples
    train_acc = total_correct / total_samples
    return train_loss, train_acc
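Despite their names, both functions above only run a training pass. A held-out evaluation counterpart would disable dropout and gradient tracking; a sketch for Model 1, assuming the same dataloader format:

def evaluate_model1(model, dataloader, criterion):
    # Evaluation pass: no dropout, no gradient tracking.
    model.eval()
    total_loss, total_correct, total_samples = 0, 0, 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs.float())
            loss = criterion(outputs, labels.long())
            total_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(outputs, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)
    return total_loss / total_samples, total_correct / total_samples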
Define the joint training function
def train_models(model1, model2, trainloader, criterion1, criterion2, optimizer1, optimizer2):
    # Stage 1: train the multi-class model on the raw expression data.
    train_loss1, train_acc1 = train_and_test_model1(model1, trainloader, criterion1, optimizer1)
    # Stage 2: use Model 1's 4-dimensional outputs as Model 2's inputs.
    # Collect them batch by batch; no gradients are needed here.
    model1.eval()
    feature_batches, label_batches = [], []
    with torch.no_grad():
        for inputs, labels in trainloader:
            feature_batches.append(model1(inputs.float()))
            label_batches.append(labels.float().unsqueeze(1))
    train_inputs = torch.cat(feature_batches, dim=0)
    train_labels = torch.cat(label_batches, dim=0)
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_labels)
    trainloader2 = DataLoader(train_dataset, batch_size=32, shuffle=True)
    train_loss2, train_acc2 = train_and_test_model2(model2, trainloader2, criterion2, optimizer2)
    return train_loss1, train_acc1, train_loss2, train_acc2
Load and preprocess the data
filename = r'C:\Users\lenovo\Desktop\HIV\GSE6740GSE50011基因降低\output_data.xlsx'
data = pd.read_excel(filename, sheet_name='Sheet1')
# apply() runs column by column, so each gene is standardised across samples.
data.iloc[:, 1:] = data.iloc[:, 1:].apply(lambda x: transform(x))
train_data = data
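As written, train_data is the full dataset, so every accuracy reported below is a training accuracy. A held-out split would give a less optimistic estimate; a sketch using sklearn's train_test_split, assuming the status column is suitable for stratification:

from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=0,
    stratify=data.iloc[:, 0])   # preserve class proportions in both splits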
Define the hyperparameter search space
space = [Integer(10, 100, name='hidden_size'),
         Real(0.01, 0.5, name='dropout_rate'),
         Integer(10, 100, name='hidden_size2'),
         Real(0.01, 0.5, name='dropout_rate2'),
         Real(0.001, 0.01, name='lr'),
         Real(0.001, 0.01, name='lr2')]
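Each dimension carries a name so that @use_named_args (used below) can unpack a candidate point into keyword arguments; illustratively, with a made-up point:

# Illustration only: how one candidate point maps onto the named dimensions.
point = [32, 0.1, 16, 0.2, 0.005, 0.003]
print(dict(zip([dim.name for dim in space], point)))
# {'hidden_size': 32, 'dropout_rate': 0.1, 'hidden_size2': 16, ...}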
Define the objective function for Bayesian optimisation
@use_named_args(space)
def train_model(hidden_size, dropout_rate, hidden_size2, dropout_rate2, lr, lr2):
    # Define the models (71 input genes, 4 intermediate classes, 1 output).
    model1 = Model1(71, hidden_size, 4, dropout_rate)
    model2 = Model2(4, hidden_size2, 1, dropout_rate2)
    # Define the loss functions and optimizers.
    criterion1 = nn.CrossEntropyLoss()
    criterion2 = nn.BCELoss()
    optimizer1 = torch.optim.Adam(model1.parameters(), lr=lr)
    optimizer2 = torch.optim.Adam(model2.parameters(), lr=lr2)
    # Define the dataset and dataloader. The data was already standardised
    # column-wise above, so no per-sample transform is passed here.
    train_dataset = GeneDataset(train_data)
    trainloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    # Train both models.
    train_loss1, train_acc1, train_loss2, train_acc2 = train_models(
        model1, model2, trainloader, criterion1, criterion2, optimizer1, optimizer2)
    # Report this trial's configuration and results.
    print('hidden_size:', hidden_size)
    print('dropout_rate:', dropout_rate)
    print('hidden_size2:', hidden_size2)
    print('dropout_rate2:', dropout_rate2)
    print('lr:', lr)
    print('lr2:', lr2)
    print('train_loss1:', train_loss1)
    print('train_acc1:', train_acc1)
    print('train_loss2:', train_loss2)
    print('train_acc2:', train_acc2)
    # Return the value to minimise: Model 2's loss.
    return train_loss2
Run Bayesian optimisation
res_gp = gp_minimize(train_model, space, n_calls=50, random_state=0)
# Print the best hyperparameters
print("Best: %f" % res_gp.fun)
print("Best parameters: " + str(res_gp.x))
Code notes
- Data loading and preprocessing: read the data from an Excel file and standardise the gene expression values.
- Model definition: define two neural networks, one for multi-class and one for binary classification.
- Training functions: define the training and testing routines for Model 1 and Model 2 respectively.
- Joint training function: define a function that trains both models, feeding Model 1's outputs into Model 2 for cascaded training.
- Hyperparameter space: define the search space for Bayesian optimisation, covering hidden-layer sizes, dropout rates, and learning rates.
- Bayesian optimisation: use gp_minimize to search for the hyperparameter combination that minimises the model's loss.
- Output: print the best hyperparameter combination and its corresponding loss value.
Summary
This project builds a DNN model that predicts patient disease status from gene expression levels and tunes its hyperparameters with Bayesian optimisation to improve prediction accuracy. The model chains two sub-networks, a multi-class classifier and a binary classifier, to identify and predict patient disease status. It serves as a worked example of applying deep learning to gene expression data and may be a useful reference for related research.