基于基因表达量预测患者患病状态的DNN神经网络模型
本文使用PyTorch框架编写DNN神经网络,根据基因的表达量预测患者是否患病。模型分两阶段调用:第一阶段模型以基因个数为输入维度,输出4分类;第二阶段为二分类模型,以第一阶段模型的输出作为输入,从而满足上述的所有要求。
首先,我们需要读入Excel表格,第一行为患者状态标志state(1为患病,0为正常)和基因名称,第0列为患者是否患病的真值,其余列为各基因及其表达量,代码如下:
import pandas as pd

# Load the Excel sheet: the first row is the header, column 0 is used as the
# row index. The path MUST be a raw string -- in a normal string literal the
# '\U' in 'C:\Users' is an invalid unicode escape and raises a SyntaxError.
data = pd.read_excel(
    r'C:\Users\lenovo\Desktop\HIV\GSE6740GSE50011基因降低\output_data.xlsx',
    index_col=0,
)
y = data.iloc[:, 0].values   # patient state labels (1 = diseased, 0 = healthy)
X = data.iloc[:, 1:].values  # gene expression matrix (samples x genes)
接下来我们需要进行数据标准化,将各个基因的表达量调整到相同的范围内,代码如下:
from sklearn.preprocessing import StandardScaler

# Rescale every gene's expression values to zero mean and unit variance so
# that no single gene dominates training purely because of its raw scale.
scaler = StandardScaler()
X = scaler.fit_transform(X)
接下来,我们需要定义两个模型。第一个模型是一个DNN神经网络,它的输入为基因的个数,输出为4分类,代码如下:
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model1(nn.Module):
    """Stage-1 DNN: maps a gene-expression vector to 4-class logits.

    Args:
        num_genes: number of input features (genes) per sample.
    """

    def __init__(self, num_genes):
        super(Model1, self).__init__()
        self.fc1 = nn.Linear(num_genes, 512)
        self.dropout1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(0.2)
        self.fc3 = nn.Linear(256, 128)
        self.dropout3 = nn.Dropout(0.2)
        self.fc4 = nn.Linear(128, 4)

    def forward(self, x):
        """Return raw 4-class logits of shape (batch, 4).

        No softmax here: the model is trained with nn.CrossEntropyLoss,
        which applies log-softmax internally. Applying softmax before
        CrossEntropyLoss (as the original code did) double-normalizes the
        output and squashes gradients, degrading training.
        """
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        return self.fc4(x)
第二个模型是一个二分类模型,它的输入为第一个模型的输出,代码如下:
class Model2(nn.Module):
    """Stage-2 classifier: maps Model1's 4-dim output to 2-class logits."""

    def __init__(self):
        super(Model2, self).__init__()
        self.fc1 = nn.Linear(4, 2)

    def forward(self, x):
        """Return raw 2-class logits of shape (batch, 2).

        No softmax: this model is trained with nn.CrossEntropyLoss, which
        expects unnormalized logits and applies log-softmax internally.
        """
        return self.fc1(x)
接下来,我们使用贝叶斯优化对神经网络模型进行优化,代码如下:
from bayes_opt import BayesianOptimization
# 定义贝叶斯优化函数
def optimize(num_genes, lr, batch_size, num_epochs):
    """Objective function for Bayesian hyper-parameter optimization.

    Trains Model1 (4-class) on the expression data, then Model2 (binary)
    on Model1's outputs, and returns the negated final Model2 epoch loss
    so that maximizing the objective minimizes the loss.

    Args:
        num_genes: number of genes (input features) for Model1.
        lr: Adam learning rate.
        batch_size: mini-batch size.
        num_epochs: training epochs per stage.

    Note: bayes_opt samples every parameter as a float, so the integral
    ones must be cast to int before use.
    """
    num_genes = int(num_genes)
    batch_size = int(batch_size)   # DataLoader requires an int batch size
    num_epochs = int(num_epochs)   # range() requires an int

    model1 = Model1(num_genes)
    model2 = Model2()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        list(model1.parameters()) + list(model2.parameters()), lr=lr
    )

    # Use only the first num_genes columns so the input width matches
    # Model1's first layer -- the original code fed all columns, which
    # crashes whenever num_genes != X.shape[1].
    features = torch.tensor(X[:, :num_genes], dtype=torch.float32)
    labels = torch.tensor(y, dtype=torch.long)
    dataset = torch.utils.data.TensorDataset(features, labels)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True
    )

    # --- Stage 1: train Model1 on the raw expression data ---
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            loss = criterion(model1(inputs), targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print('Model1 Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(dataloader)))

    # --- Stage 2: train Model2 on Model1's outputs ---
    # eval() disables dropout and no_grad() detaches the features so that
    # stage-2 training does not backpropagate into Model1. The labels are
    # stored inside the dataset so shuffling keeps them aligned with the
    # features (the original code compared shuffled batches against the
    # unshuffled full label vector).
    model1.eval()
    with torch.no_grad():
        stage2_inputs = model1(features)
    dataset2 = torch.utils.data.TensorDataset(stage2_inputs, labels)
    dataloader2 = torch.utils.data.DataLoader(
        dataset2, batch_size=batch_size, shuffle=True
    )
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, targets in dataloader2:
            optimizer.zero_grad()
            loss = criterion(model2(inputs), targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print('Model2 Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(dataloader2)))

    # bayes_opt maximizes the objective, so return the negated mean loss
    # of the last Model2 epoch.
    return -(running_loss / len(dataloader2))
# 定义优化器
# Hyper-parameter search space sampled by the Bayesian optimizer.
pbounds = {
    'num_genes': (2, X.shape[1]),
    'lr': (1e-5, 1e-2),
    'batch_size': (16, 256),
    'num_epochs': (10, 100),
}
optimizer = BayesianOptimization(optimize, pbounds)
# 10 random exploration probes followed by 20 guided iterations.
optimizer.maximize(init_points=10, n_iter=20)
最后,我们输出每个epoch的训练损失值,并使用搜索得到的最优参数配置重新训练模型,代码如下:
# Retrieve the best hyper-parameter configuration found by the search.
best_params = optimizer.max['params']
print('Best parameters: ', best_params)

# Retrain both models from scratch with the best configuration.
# bayes_opt stores every parameter as a float, so cast the integral ones.
num_genes = int(best_params['num_genes'])
lr = best_params['lr']
batch_size = int(best_params['batch_size'])
num_epochs = int(best_params['num_epochs'])

model1 = Model1(num_genes)
model2 = Model2()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(model1.parameters()) + list(model2.parameters()), lr=lr)

# Use only the first num_genes columns so the input width matches Model1's
# first layer (feeding all columns crashes when num_genes != X.shape[1]).
features = torch.tensor(X[:, :num_genes], dtype=torch.float32)
labels = torch.tensor(y, dtype=torch.long)
dataset = torch.utils.data.TensorDataset(features, labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# --- Stage 1: train Model1 on the raw expression data ---
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        loss = criterion(model1(inputs), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Model1 Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(dataloader)))

# --- Stage 2: train Model2 on Model1's outputs ---
# eval() disables dropout and no_grad() detaches the features so stage-2
# training does not backpropagate into Model1. Labels live inside the
# dataset so shuffling keeps them aligned with the features (the original
# code paired shuffled batches with the unshuffled full label vector).
model1.eval()
with torch.no_grad():
    stage2_inputs = model1(features)
dataset2 = torch.utils.data.TensorDataset(stage2_inputs, labels)
dataloader2 = torch.utils.data.DataLoader(dataset2, batch_size=batch_size, shuffle=True)
for epoch in range(num_epochs):
    running_loss = 0.0
    for inputs, targets in dataloader2:
        optimizer.zero_grad()
        loss = criterion(model2(inputs), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Model2 Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(dataloader2)))
以上就是使用Python编写DNN神经网络根据基因的表达量来预测患者是否患病的完整代码。
原文地址: https://www.cveoy.top/t/topic/mOq0 著作权归作者所有。请勿转载和采集!