基于基因表达量预测患者患病状态的DNN神经网络模型
基于基因表达量预测患者患病状态的DNN神经网络模型
本项目使用 Python 编写 DNN 神经网络模型,根据基因表达量预测患者是否患病。模型采用 PyTorch 框架,并使用贝叶斯优化方法对模型参数进行优化。代码支持在 JetBrains PyCharm 2018.3.7 x64 上运行,并提供详细注释。
数据来源: 'C:\Users\lenovo\Desktop\HIV\GSE6740GSE50011基因降低\output_data.xlsx'
数据划分: 全部数据用作训练集,不划分测试集。注意:缺少独立测试集时无法评估模型的泛化能力,训练准确率会高估真实性能。
模型架构: 模型由两个子模型组成:
- 第一个模型为 4 分类模型,输入为基因个数,输出为 4 个类别。
- 第二个模型为 2 分类模型,输入为第一个模型的输出,输出为患病或正常状态。
模型优化: 使用贝叶斯优化方法对模型参数进行优化,以获得最佳的模型性能。
代码实现:
第一部分: 数据预处理
# --- Part 1: data preprocessing ---
import pandas as pd

# BUG FIX: the path must be a raw string. In the original, '\U' inside a
# normal string literal is an invalid unicode escape and raises SyntaxError
# on Python 3. r'...' keeps every backslash literal.
data = pd.read_excel(r'C:\Users\lenovo\Desktop\HIV\GSE6740GSE50011基因降低\output_data.xlsx')

# Column 0 holds the patient disease-state label; the remaining columns hold
# gene expression values (assumption from the description above — TODO
# confirm against the actual spreadsheet layout).
X = data.iloc[:, 1:].values  # gene expression matrix -> model input
y = data.iloc[:, 0].values   # patient state label    -> model target

from sklearn.preprocessing import StandardScaler

# Standardize each feature to zero mean / unit variance so no single gene
# dominates training by scale alone.
scaler = StandardScaler()
X = scaler.fit_transform(X)
第二部分: 神经网络模型的构建和训练
!pip install torch
!pip install scikit-optimize
注意: `!pip` 语法仅在 Jupyter Notebook / IPython 中有效;若在 PyCharm 等普通脚本环境中运行,请先在终端执行 `pip install torch scikit-optimize`。
import torch
import torch.nn as nn
import torch.nn.functional as F
class DNN4(nn.Module):
    """Four-layer fully connected classifier (gene expression -> 4 classes).

    Each hidden layer applies ReLU followed by dropout (p=0.5); the output
    layer returns raw logits suitable for nn.CrossEntropyLoss.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, output_size):
        super(DNN4, self).__init__()
        # Attribute names (fc1..fc4, dropout) match the original definition
        # so saved state dicts remain loadable.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.fc4 = nn.Linear(hidden_size3, output_size)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        # ReLU + dropout after every hidden layer; no activation on the output.
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(F.relu(hidden(x)))
        return self.fc4(x)
class DNN2(nn.Module):
    """Three-layer fully connected binary classifier (diseased vs. normal).

    Hidden layers use ReLU followed by dropout (p=0.5); the final layer
    emits raw logits for nn.CrossEntropyLoss.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(DNN2, self).__init__()
        # Attribute names kept identical to the original for state-dict
        # compatibility.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        # ReLU + dropout on both hidden layers; output layer is linear.
        for hidden in (self.fc1, self.fc2):
            x = self.dropout(F.relu(hidden(x)))
        return self.fc3(x)
# Hyperparameter specification. Scalar entries are fixed values; 2-tuples are
# (low, high) search ranges intended for the Bayesian optimization step.
# NOTE(review): passing this dict straight to train_model() hands tuples where
# scalars are expected — the ranges must be resolved to concrete values first.
params = {
'input_size': X.shape[1],  # fixed: number of genes (input features)
'hidden_size1': (4, 64),   # search range: width of hidden layer 1
'hidden_size2': (4, 64),   # search range: width of hidden layer 2
'hidden_size3': (4, 64),   # search range: width of hidden layer 3
'output_size': 4,          # fixed: 4-class output of the first sub-model
'lr': (0.0001, 0.1),       # search range: Adam learning rate
'batch_size': (16, 128),   # search range: mini-batch size
'epochs': (50, 500)        # search range: number of training epochs
}
import numpy as np
from sklearn.metrics import accuracy_score
def train_model(params, features=None, labels=None):
    """Train a DNN4 classifier and report its training-set accuracy.

    Args:
        params: dict with scalar hyperparameters: 'input_size',
            'hidden_size1..3', 'output_size', 'lr', 'batch_size', 'epochs'.
            Values are cast to int/float so numpy scalars from skopt work.
        features: optional 2-D array of inputs; defaults to the module-level X.
        labels: optional 1-D array of integer class labels; defaults to the
            module-level y.

    Returns:
        {'loss': -train_accuracy, 'status': 'ok'} — negated so that a
        minimizer maximizes accuracy.
    """
    # Backward-compatible generalization: fall back to the script's globals.
    if features is None:
        features = X
    if labels is None:
        labels = y

    # Cast explicitly: skopt hands back numpy integer/float scalars.
    input_size = int(params['input_size'])
    hidden_size1 = int(params['hidden_size1'])
    hidden_size2 = int(params['hidden_size2'])
    hidden_size3 = int(params['hidden_size3'])
    output_size = int(params['output_size'])
    lr = float(params['lr'])
    batch_size = int(params['batch_size'])
    epochs = int(params['epochs'])

    model = DNN4(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Convert once, outside the loops, instead of per mini-batch.
    features_t = torch.FloatTensor(features)
    labels_t = torch.LongTensor(labels)
    n_samples = features_t.shape[0]

    train_acc = 0.0
    for epoch in range(epochs):
        model.train()  # dropout active while optimizing
        permutation = np.random.permutation(n_samples)
        for i in range(0, n_samples, batch_size):
            indices = permutation[i:i + batch_size]
            optimizer.zero_grad()
            output = model(features_t[indices])
            loss = criterion(output, labels_t[indices])
            loss.backward()
            optimizer.step()
        # BUG FIX: the original evaluated with dropout still enabled, making
        # the reported accuracy noisy. Switch to eval mode and skip autograd.
        model.eval()
        with torch.no_grad():
            y_pred = torch.argmax(model(features_t), dim=1).numpy()
        train_acc = accuracy_score(labels, y_pred)
        print(f'Epoch: {epoch+1}/{epochs}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

    # Negative accuracy: lower is better for the minimizer.
    return {'loss': -train_acc, 'status': 'ok'}
# --- Bayesian hyperparameter optimization ---
# Fixes over the original:
#  * train_model(params) was called with tuple-valued search ranges, which
#    crashes — a single sanity run now uses concrete mid-range values.
#  * gp_minimize passes a plain *list* of values to the objective (in
#    dimension order), not a dict, so a wrapper rebuilds the dict.
#  * The search space must contain only tunable (range) entries; fixed
#    scalars like input_size/output_size stay constant.
#  * gp_minimize minimizes a scalar, so the wrapper returns res['loss'].
from skopt import gp_minimize

# One sanity-check training run with concrete values before the search.
train_model({
    'input_size': params['input_size'],
    'hidden_size1': 32, 'hidden_size2': 32, 'hidden_size3': 32,
    'output_size': params['output_size'],
    'lr': 0.001, 'batch_size': 32, 'epochs': 100,
})

# (low, high) tuples become skopt Integer/Real dimensions automatically.
tunable = [k for k, v in params.items() if isinstance(v, tuple)]
space = [params[k] for k in tunable]

def objective(values):
    # Rebuild the full params dict from skopt's ordered value list.
    candidate = dict(params)
    candidate.update(zip(tunable, values))
    return train_model(candidate)['loss']

res = gp_minimize(objective, space, n_calls=50, random_state=0, verbose=1)
print(res.x)  # best values, in the same order as `tunable`
运行完整代码后,我们可以得到模型的最优参数,并得到每轮训练的准确率和损失值。
注意: 本项目仅供学习参考,实际应用中需要根据具体情况进行调整。
原文地址: https://www.cveoy.top/t/topic/m77t 著作权归作者所有。请勿转载和采集!