基于16个基因表达量的DNN神经网络患者疾病预测模型
本项目使用Python和PyTorch框架构建深度神经网络模型,根据16个基因的表达量预测患者是否患病。模型采用三层结构,分别进行8分类、4分类和最终的二分类预测。代码包含详细注释、损失函数变化曲线、准确率变化曲线、ROC曲线和AUC值、特征重要性图、热图和t-SNE图等可视化结果。
1. 数据准备
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.manifold import TSNE
# Load the gene-expression table; column 0 of the sheet becomes the row index
# (presumably sample IDs — confirm against data.xlsx). GeneDataset below
# treats the first remaining column as the label and the rest as features.
data = pd.read_excel('data.xlsx', index_col=0)
# Hold out 20% of the samples for testing; fixed random_state makes the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
# Dataset wrapper: DataFrame column 0 is the class label, remaining columns are features.
class GeneDataset(Dataset):
    """Torch Dataset over a DataFrame laid out as [label, feature_1, ..., feature_k]."""

    def __init__(self, data):
        # Cast features to float32: pandas delivers float64 by default, and the
        # resulting double tensors from the DataLoader would not match the
        # models' float32 Linear layers at runtime.
        self.X = data.iloc[:, 1:].values.astype(np.float32)
        # CrossEntropyLoss requires int64 class indices.
        self.y = data.iloc[:, 0].values.astype(np.int64)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
2. 模型定义
# Stage-1 network: 16 gene features -> 8-way logits.
class Model1(nn.Module):
    """Two hidden ReLU layers with dropout, followed by a linear logit layer.

    Attribute names (fc1/fc2/fc3) are part of the public surface: the
    feature-importance plot reads ``model.fc1.weight``.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        h = self.dropout(F.relu(self.fc1(x)))
        h = self.dropout(F.relu(self.fc2(h)))
        return self.fc3(h)
# Stage-2 network: stage-1 outputs -> 4-way logits (same topology as Model1).
class Model2(nn.Module):
    """MLP with two dropout-regularized hidden ReLU layers and a linear head."""

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        # Both hidden layers share the same ReLU + dropout treatment.
        for hidden_layer in (self.fc1, self.fc2):
            x = self.dropout(F.relu(hidden_layer(x)))
        return self.fc3(x)
# Stage-3 network: stage-2 outputs -> binary logits (same topology as Model1/2).
class Model3(nn.Module):
    """Final binary classifier: two hidden ReLU+dropout layers, linear output."""

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        first = self.dropout(torch.relu(self.fc1(x)))
        second = self.dropout(torch.relu(self.fc2(first)))
        return self.fc3(second)
3. 训练和测试函数
# One optimization pass over the training loader.
def train(model, dataloader, optimizer, criterion, device):
    """Run a single training epoch.

    Returns (mean batch loss, accuracy over the whole dataset).
    """
    model.train()
    running_loss = 0.0
    n_correct = 0
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
    return running_loss / len(dataloader), n_correct / len(dataloader.dataset)
# Gradient-free evaluation pass over a loader.
def test(model, dataloader, criterion, device):
    """Evaluate the model; returns (mean batch loss, dataset accuracy)."""
    model.eval()
    running_loss = 0.0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            running_loss += criterion(logits, labels).item()
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
    return running_loss / len(dataloader), n_correct / len(dataloader.dataset)
# Full training loop: train + evaluate each epoch, collecting curves for plotting.
def train_model(model, train_dataloader, test_dataloader, optimizer, criterion, device, epochs):
    """Train for `epochs` epochs with per-epoch evaluation.

    Returns four parallel lists: train losses, train accuracies,
    test losses, test accuracies (one entry per epoch).
    """
    history = {'train_loss': [], 'train_acc': [], 'test_loss': [], 'test_acc': []}
    for epoch in range(1, epochs + 1):
        tr_loss, tr_acc = train(model, train_dataloader, optimizer, criterion, device)
        te_loss, te_acc = test(model, test_dataloader, criterion, device)
        history['train_loss'].append(tr_loss)
        history['train_acc'].append(tr_acc)
        history['test_loss'].append(te_loss)
        history['test_acc'].append(te_acc)
        print('Epoch {}: Train Loss: {:.4f}, Train Acc: {:.4f}, Test Loss: {:.4f}, Test Acc: {:.4f}'.format(
            epoch, tr_loss, tr_acc, te_loss, te_acc))
    return history['train_loss'], history['train_acc'], history['test_loss'], history['test_acc']
# Hard-label inference over a loader; targets in the loader are ignored.
def predict(model, dataloader, device):
    """Return the argmax class index for every sample, as one numpy array."""
    model.eval()
    batches = []
    with torch.no_grad():
        for features, _ in dataloader:
            logits = model(features.to(device))
            batches.append(logits.argmax(dim=1).cpu().numpy())
    return np.concatenate(batches)
4. 可视化函数
# Loss curves over epochs, one line per split.
def plot_loss(train_losses, test_losses):
    """Plot per-epoch train/test loss curves on a shared axis."""
    for series, tag in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
        plt.plot(series, label=tag)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
# Accuracy curves over epochs, one line per split.
def plot_acc(train_accs, test_accs):
    """Plot per-epoch train/test accuracy curves on a shared axis."""
    for series, tag in ((train_accs, 'Train Acc'), (test_accs, 'Test Acc')):
        plt.plot(series, label=tag)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
# ROC curve plus printed AUC for binary labels/scores.
def plot_roc(y_true, y_pred):
    """Draw the ROC curve with its AUC in the legend, then print the AUC."""
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()
    print(f'AUC: {roc_auc:.2f}')
# Horizontal bar chart of first-layer weight magnitudes per input feature.
def plot_feature_importance(model, dataloader, device, feature_names=None):
    """Plot mean absolute first-layer weight for each input feature.

    Fixes two defects in the original: it looped over the dataloader
    appending ``model.fc1.weight`` on every batch even though the weights do
    not depend on the data, and it then read ``.columns`` off the loop's
    batch *tensor*, which has no such attribute and raised at runtime.

    Parameters
    ----------
    model : module exposing an ``fc1`` Linear layer (weight shape: (hidden, n_features)).
    dataloader, device : kept for backward compatibility with existing callers; unused.
    feature_names : optional sequence of n_features labels for the y-axis.
        Defaults to generic "Feature i" names.
    """
    model.eval()
    with torch.no_grad():
        # Average |w| over the hidden units for each input feature.
        importance = model.fc1.weight.abs().mean(dim=0).cpu().numpy()
    order = np.argsort(importance)[::-1]  # most important first
    if feature_names is None:
        feature_names = ['Feature {}'.format(i) for i in range(len(importance))]
    sorted_names = [feature_names[i] for i in order]
    plt.barh(range(len(order)), importance[order])
    plt.yticks(range(len(order)), sorted_names)
    plt.xlabel('Feature Importance')
    plt.show()
# Two-color correlation heatmap of the feature columns.
def plot_heatmap(data):
    """Show a white/red heatmap of pairwise column correlations."""
    corr = data.corr()
    ticks = range(len(corr.columns))
    plt.imshow(corr, cmap=ListedColormap(['white', 'red']))
    plt.xticks(ticks, corr.columns, rotation=90)
    plt.yticks(ticks, corr.columns)
    plt.colorbar()
    plt.show()
# 2-D t-SNE embedding, colored by class label.
def plot_tsne(data, labels):
    """Project samples to 2-D with t-SNE and scatter-plot them by label."""
    embedding = TSNE(n_components=2).fit_transform(data)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='Set1')
    plt.show()
5. 模型训练和预测
# ---- Hyperparameters ----
input_size = 16        # number of gene-expression features
hidden_size1 = 64
hidden_size2 = 32
hidden_size3 = 16
output_size1 = 8       # stage 1: 8-way classification
output_size2 = 4       # stage 2: 4-way classification
output_size3 = 2       # stage 3: final binary prediction
dropout_rate = 0.2
learning_rate = 0.01
batch_size = 32
epochs = 50


def _stage_frame(preds, num_classes, targets):
    """Build a DataFrame in GeneDataset layout (target column FIRST) from the
    previous stage's hard class predictions.

    Fixes three defects in the original pipeline:
    - the prediction column was placed first, so GeneDataset trained on the
      target as the feature and the prediction as the label;
    - a single class-index column could not match the next model's
      input_size (8 resp. 4) and raised a shape error — predictions are
      one-hot encoded here instead;
    - mixing RangeIndex predictions with the original DataFrame index caused
      pandas alignment NaNs — targets are converted to a plain array.
    """
    one_hot = np.eye(num_classes, dtype=np.float32)[np.asarray(preds)]
    frame = pd.DataFrame(one_hot, columns=['p{}'.format(i) for i in range(num_classes)])
    frame.insert(0, 'Target', np.asarray(targets))
    return frame


# ---- Data ----
train_dataset = GeneDataset(train_data)
test_dataset = GeneDataset(test_data)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# ---- Models and optimizers ----
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model1 = Model1(input_size, hidden_size1, output_size1, dropout_rate).to(device)
model2 = Model2(output_size1, hidden_size2, output_size2, dropout_rate).to(device)
model3 = Model3(output_size2, hidden_size3, output_size3, dropout_rate).to(device)
optimizer1 = torch.optim.Adam(model1.parameters(), lr=learning_rate)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=learning_rate)
optimizer3 = torch.optim.Adam(model3.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# ---- Stage 1: train on the raw gene features ----
train_losses1, train_accs1, test_losses1, test_accs1 = train_model(
    model1, train_dataloader, test_dataloader, optimizer1, criterion, device, epochs)
plot_loss(train_losses1, test_losses1)
plot_acc(train_accs1, test_accs1)

# Unshuffled loaders so predictions stay row-aligned with the DataFrames.
train_dataloader1 = DataLoader(GeneDataset(train_data), batch_size=batch_size, shuffle=False)
test_dataloader1 = DataLoader(GeneDataset(test_data), batch_size=batch_size, shuffle=False)
train_pred1 = predict(model1, train_dataloader1, device)
test_pred1 = predict(model1, test_dataloader1, device)

# ---- Stage 2: train on one-hot encoded stage-1 predictions ----
train_frame2 = _stage_frame(train_pred1, output_size1, train_data.iloc[:, 0])
test_frame2 = _stage_frame(test_pred1, output_size1, test_data.iloc[:, 0])
train_dataloader2 = DataLoader(GeneDataset(train_frame2), batch_size=batch_size, shuffle=True)
test_dataloader2 = DataLoader(GeneDataset(test_frame2), batch_size=batch_size, shuffle=False)
train_losses2, train_accs2, test_losses2, test_accs2 = train_model(
    model2, train_dataloader2, test_dataloader2, optimizer2, criterion, device, epochs)
plot_loss(train_losses2, test_losses2)
plot_acc(train_accs2, test_accs2)

# Predict through UNSHUFFLED loaders (the original predicted through the
# shuffled training loader, scrambling row order against the targets).
train_eval2 = DataLoader(GeneDataset(train_frame2), batch_size=batch_size, shuffle=False)
train_pred2 = predict(model2, train_eval2, device)
test_pred2 = predict(model2, test_dataloader2, device)

# ---- Stage 3: final binary classifier on one-hot stage-2 predictions ----
train_frame3 = _stage_frame(train_pred2, output_size2, train_data.iloc[:, 0])
test_frame3 = _stage_frame(test_pred2, output_size2, test_data.iloc[:, 0])
train_dataloader3 = DataLoader(GeneDataset(train_frame3), batch_size=batch_size, shuffle=True)
test_dataloader3 = DataLoader(GeneDataset(test_frame3), batch_size=batch_size, shuffle=False)
train_losses3, train_accs3, test_losses3, test_accs3 = train_model(
    model3, train_dataloader3, test_dataloader3, optimizer3, criterion, device, epochs)
plot_loss(train_losses3, test_losses3)
plot_acc(train_accs3, test_accs3)

# ---- Evaluation and visualization ----
test_pred3 = predict(model3, test_dataloader3, device)
# NOTE(review): hard 0/1 labels give a single-threshold ROC; feed class-1
# probabilities here for a smooth curve if needed.
plot_roc(test_data.iloc[:, 0], test_pred3)
plot_feature_importance(model1, train_dataloader1, device)
plot_heatmap(data.iloc[:, 1:])
plot_tsne(data.iloc[:, 1:].values, data.iloc[:, 0].values)
6. 总结
本项目使用DNN神经网络模型成功预测了患者是否患病,并绘制了模型训练过程中的损失函数曲线、准确率变化曲线、ROC曲线和AUC值、特征重要性图、热图和t-SNE图等可视化结果。这些结果可以帮助研究人员更好地理解模型的预测结果,并探索各个变量之间的关系。
注意: 以上代码仅供参考,实现过程中可能需要根据具体情况进行调整。
原文地址: https://www.cveoy.top/t/topic/mmAH 著作权归作者所有。请勿转载和采集!