基于16个基因表达量的DNN神经网络患者疾病预测模型

本项目使用 Python 和 PyTorch 框架构建深度神经网络模型，根据16个基因的表达量预测患者是否患病。模型由三个级联的全连接子网络组成，输出维度依次为8、4和2，最后一级给出患病/未患病的二分类结果。代码包含详细注释，并给出损失函数变化曲线、准确率变化曲线、ROC曲线和AUC值、特征重要性图、热图和t-SNE图等可视化结果。

1. 数据准备

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.manifold import TSNE

# Load the expression table; the first spreadsheet column is used as the row index.
data = pd.read_excel('data.xlsx', index_col=0)

# Hold out 20% of the rows as the test set, with a fixed seed so the split is repeatable.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Dataset wrapper around a label-first DataFrame
class GeneDataset(Dataset):
    """Expose a DataFrame as a PyTorch dataset.

    The first column of ``data`` is read as the integer class label and
    every remaining column as a feature.

    Args:
        data: DataFrame whose column 0 holds the label and whose columns
            1.. hold the numeric features.
    """

    def __init__(self, data):
        # Cast features to float32: ``.values`` on float columns yields
        # float64, which DataLoader collates into double tensors, and those
        # cannot be fed to nn.Linear layers with default (float32) weights.
        self.X = data.iloc[:, 1:].values.astype(np.float32)
        # CrossEntropyLoss expects int64 class indices.
        self.y = data.iloc[:, 0].values.astype(np.int64)

    def __len__(self):
        """Return the number of samples (rows)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

2. 模型定义

# First-stage network
class Model1(nn.Module):
    """Fully connected network: two hidden ReLU layers with dropout, then a linear head.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        output_size: number of output units (raw logits, no softmax applied).
        dropout_rate: dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(Model1, self).__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=output_size)
        # One dropout module is shared by both hidden layers.
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        hidden = self.dropout(F.relu(self.fc1(x)))
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Second-stage network
class Model2(nn.Module):
    """Fully connected network with the same layout as the first stage.

    Two hidden layers (ReLU followed by dropout) feed a linear output layer
    that returns raw logits.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        output_size: number of output units.
        dropout_rate: dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(Model2, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        # Both hidden layers share the same activation + dropout treatment.
        for hidden_layer in (self.fc1, self.fc2):
            x = self.dropout(F.relu(hidden_layer(x)))
        return self.fc3(x)

# Third-stage network
class Model3(nn.Module):
    """Fully connected network: fc1 -> ReLU -> dropout -> fc2 -> ReLU -> dropout -> fc3.

    Args:
        input_size: number of input features.
        hidden_size: width of both hidden layers.
        output_size: number of output units (raw logits).
        dropout_rate: dropout probability applied after each hidden layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super(Model3, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        first_hidden = torch.relu(self.fc1(x))
        first_hidden = self.dropout(first_hidden)
        second_hidden = torch.relu(self.fc2(first_hidden))
        second_hidden = self.dropout(second_hidden)
        logits = self.fc3(second_hidden)
        return logits

3. 训练和测试函数

# One training epoch
def train(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: network to optimise; switched to training mode.
        dataloader: yields ``(inputs, labels)`` batches.
        optimizer: optimiser stepping the model's parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batches are moved to.

    Returns:
        Tuple ``(loss, accuracy)``: the per-batch loss values averaged over
        the number of batches, and the number of correct argmax predictions
        divided by ``len(dataloader.dataset)``.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        n_correct += logits.argmax(dim=1).eq(labels).sum().item()

    mean_loss = loss_sum / len(dataloader)
    accuracy = n_correct / len(dataloader.dataset)
    return mean_loss, accuracy

# One evaluation pass
def test(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: network to evaluate; switched to evaluation mode.
        dataloader: yields ``(inputs, labels)`` batches.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the batches are moved to.

    Returns:
        Tuple ``(loss, accuracy)``: the per-batch loss values averaged over
        the number of batches, and the number of correct argmax predictions
        divided by ``len(dataloader.dataset)``.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            logits = model(inputs)
            loss_sum += criterion(logits, labels).item()
            n_correct += logits.argmax(dim=1).eq(labels).sum().item()

    mean_loss = loss_sum / len(dataloader)
    accuracy = n_correct / len(dataloader.dataset)
    return mean_loss, accuracy

# Full training loop
def train_model(model, train_dataloader, test_dataloader, optimizer, criterion, device, epochs):
    """Train for ``epochs`` epochs, evaluating on the test loader after each one.

    A summary line with the four metrics is printed at the end of every epoch.

    Returns:
        Tuple of four lists, one entry per epoch:
        ``(train_losses, train_accs, test_losses, test_accs)``.
    """
    # Order matches both the return value and the print placeholders:
    # train loss, train accuracy, test loss, test accuracy.
    history = ([], [], [], [])
    for epoch_number in range(1, epochs + 1):
        epoch_metrics = (
            *train(model, train_dataloader, optimizer, criterion, device),
            *test(model, test_dataloader, criterion, device),
        )
        for series, value in zip((history[0], history[2], history[1], history[3]),
                                 (epoch_metrics[0], epoch_metrics[2], epoch_metrics[1], epoch_metrics[3])):
            series.append(value)
        print('Epoch {}: Train Loss: {:.4f}, Train Acc: {:.4f}, Test Loss: {:.4f}, Test Acc: {:.4f}'.format(
            epoch_number, *epoch_metrics))
    train_losses, train_accs, test_losses, test_accs = history[0], history[1], history[2], history[3]
    return train_losses, train_accs, test_losses, test_accs

# Hard class predictions
def predict(model, dataloader, device):
    """Return the argmax class index for every sample served by ``dataloader``.

    The model is switched to evaluation mode and run without gradient
    tracking. Labels yielded by the loader are ignored.

    Returns:
        1-D NumPy array of predicted class indices, in the order the loader
        yields the samples.
    """
    model.eval()
    with torch.no_grad():
        per_batch = [
            model(inputs.to(device)).argmax(dim=1).cpu().numpy()
            for inputs, _ in dataloader
        ]
    return np.concatenate(per_batch)

4. 可视化函数

# Loss curves
def plot_loss(train_losses, test_losses):
    """Plot the train and test loss series against the epoch index and show the figure."""
    for series, legend_label in ((train_losses, 'Train Loss'), (test_losses, 'Test Loss')):
        plt.plot(series, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Accuracy curves
def plot_acc(train_accs, test_accs):
    """Plot the train and test accuracy series against the epoch index and show the figure."""
    curves = {'Train Acc': train_accs, 'Test Acc': test_accs}
    for legend_label, series in curves.items():
        plt.plot(series, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

# ROC curve and AUC
def plot_roc(y_true, y_pred):
    """Draw the ROC curve for ``y_pred`` against ``y_true`` and print the AUC.

    Args:
        y_true: true binary labels.
        y_pred: values passed to ``roc_curve`` as the score argument.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred)
    area = auc(false_pos_rate, true_pos_rate)

    curve_label = 'ROC curve (area = {:.2f})'.format(area)
    plt.plot(false_pos_rate, true_pos_rate, label=curve_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

    print('AUC: {:.2f}'.format(area))

# Feature-importance bar chart
def plot_feature_importance(model, dataloader, device, feature_names=None):
    """Plot a per-feature importance score taken from the first layer's weights.

    The score of an input feature is the mean absolute value of the weights
    connecting it to the units of ``model.fc1``.

    Args:
        model: network exposing its first linear layer as ``fc1``; switched
            to evaluation mode.
        dataloader: unused; kept so existing calls keep working.
        device: unused; kept so existing calls keep working.
        feature_names: optional sequence with one label per input feature.
            When omitted, the labels are taken from the module-level ``data``
            DataFrame, skipping its first (label) column.

    Raises:
        ValueError: if the number of names differs from the number of inputs
            of ``model.fc1``.
    """
    model.eval()
    # The weights do not depend on the data, so no pass over the loader is
    # needed. The previous per-batch loop also rebound the name ``data`` to a
    # tensor, which made the ``data.columns`` lookup below fail.
    first_layer = model.fc1.weight.detach().cpu().numpy()
    importance = np.abs(first_layer).mean(axis=0)

    if feature_names is None:
        feature_names = data.columns.tolist()[1:]
    feature_names = list(feature_names)
    if len(feature_names) != importance.shape[0]:
        raise ValueError(
            'expected {} feature names, got {}'.format(importance.shape[0], len(feature_names)))

    order = np.argsort(importance)[::-1]
    positions = range(len(order))
    plt.barh(positions, importance[order])
    plt.yticks(positions, [feature_names[i] for i in order])
    # barh places position 0 at the bottom; flip so the top-ranked feature is on top.
    plt.gca().invert_yaxis()
    plt.xlabel('Feature Importance')
    plt.show()

# Correlation heatmap
def plot_heatmap(data):
    """Show the pairwise correlation matrix of the columns of ``data`` as a heatmap.

    Args:
        data: DataFrame whose columns are correlated with ``DataFrame.corr``.
    """
    corr = data.corr()
    tick_positions = range(len(corr.columns))
    # A continuous diverging colormap pinned to [-1, 1] keeps the sign and the
    # magnitude of each coefficient visible; a two-colour map would collapse
    # the whole matrix into two bins.
    plt.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
    plt.xticks(tick_positions, corr.columns, rotation=90)
    plt.yticks(tick_positions, corr.columns)
    plt.colorbar()
    plt.show()

# t-SNE scatter plot
def plot_tsne(data, labels):
    """Embed ``data`` in two dimensions with t-SNE and scatter-plot it coloured by ``labels``.

    Args:
        data: 2-D array-like of samples (one row per sample).
        labels: one value per sample, used to colour the points.
    """
    n_samples = len(data)
    # scikit-learn requires perplexity < n_samples; keep the default of 30
    # when there are enough samples and shrink it for small inputs.
    perplexity = min(30.0, max(n_samples - 1, 1))
    # Fixed seed so the embedding is reproducible between runs.
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    embedding = tsne.fit_transform(data)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='Set1')
    plt.show()

5. 模型训练和预测

# Training hyperparameters
input_size = 16
hidden_size1 = 64
hidden_size2 = 32
hidden_size3 = 16
output_size1 = 8
output_size2 = 4
output_size3 = 2
dropout_rate = 0.2
learning_rate = 0.01
batch_size = 32
epochs = 50


def make_stage_frame(features, labels):
    """Return a DataFrame in the layout GeneDataset reads: label column first, features after.

    Features are stored as float32 so that the tensors produced by the
    DataLoader match the dtype of the nn.Linear weights.
    """
    frame = pd.DataFrame(np.asarray(features, dtype=np.float32))
    frame.insert(0, 'Target', np.asarray(labels))
    return frame


def extract_logits(model, dataloader, device):
    """Return the raw model outputs for every sample, in loader order, as a 2-D array.

    The loader must not shuffle, otherwise the rows cannot be paired with the labels.
    """
    model.eval()
    outputs = []
    with torch.no_grad():
        for features, _ in dataloader:
            outputs.append(model(features.to(device)).cpu().numpy())
    return np.concatenate(outputs)


# Labels are shared by all three stages.
# NOTE(review): every stage is trained against this same label column; stages
# 1 and 2 have 8 and 4 output units respectively — confirm that is intended
# for the label values present in the data.
train_labels = train_data.iloc[:, 0]
test_labels = test_data.iloc[:, 0]

# Stage-1 data: the gene-expression columns
train_dataset = GeneDataset(make_stage_frame(train_data.iloc[:, 1:], train_labels))
test_dataset = GeneDataset(make_stage_frame(test_data.iloc[:, 1:], test_labels))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Models, optimisers and loss
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model1 = Model1(input_size, hidden_size1, output_size1, dropout_rate).to(device)
model2 = Model2(output_size1, hidden_size2, output_size2, dropout_rate).to(device)
model3 = Model3(output_size2, hidden_size3, output_size3, dropout_rate).to(device)
optimizer1 = torch.optim.Adam(model1.parameters(), lr=learning_rate)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=learning_rate)
optimizer3 = torch.optim.Adam(model3.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Train stage 1
train_losses1, train_accs1, test_losses1, test_accs1 = train_model(model1, train_dataloader, test_dataloader,
                                                                  optimizer1, criterion, device, epochs)
plot_loss(train_losses1, test_losses1)
plot_acc(train_accs1, test_accs1)

# Stage-1 outputs become the stage-2 features. They are read through
# unshuffled loaders so that row i of the output belongs to label i, and the
# full logit vector (output_size1 values) is kept because model2 expects
# output_size1 inputs.
train_dataloader1 = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
test_dataloader1 = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
train_logits1 = extract_logits(model1, train_dataloader1, device)
test_logits1 = extract_logits(model1, test_dataloader1, device)

# Train stage 2
train_dataset2 = GeneDataset(make_stage_frame(train_logits1, train_labels))
test_dataset2 = GeneDataset(make_stage_frame(test_logits1, test_labels))
train_dataloader2 = DataLoader(train_dataset2, batch_size=batch_size, shuffle=True)
test_dataloader2 = DataLoader(test_dataset2, batch_size=batch_size, shuffle=False)
train_losses2, train_accs2, test_losses2, test_accs2 = train_model(model2, train_dataloader2, test_dataloader2,
                                                                  optimizer2, criterion, device, epochs)
plot_loss(train_losses2, test_losses2)
plot_acc(train_accs2, test_accs2)

# Stage-2 outputs become the stage-3 features (again read in a fixed order).
train_eval_dataloader2 = DataLoader(train_dataset2, batch_size=batch_size, shuffle=False)
train_logits2 = extract_logits(model2, train_eval_dataloader2, device)
test_logits2 = extract_logits(model2, test_dataloader2, device)

# Train stage 3
train_dataset3 = GeneDataset(make_stage_frame(train_logits2, train_labels))
test_dataset3 = GeneDataset(make_stage_frame(test_logits2, test_labels))
train_dataloader3 = DataLoader(train_dataset3, batch_size=batch_size, shuffle=True)
test_dataloader3 = DataLoader(test_dataset3, batch_size=batch_size, shuffle=False)
train_losses3, train_accs3, test_losses3, test_accs3 = train_model(model3, train_dataloader3, test_dataloader3,
                                                                  optimizer3, criterion, device, epochs)
plot_loss(train_losses3, test_losses3)
plot_acc(train_accs3, test_accs3)

# Final test-set evaluation
test_pred3 = predict(model3, test_dataloader3, device)
print('Final Test Acc: {:.4f}'.format(accuracy_score(test_labels, test_pred3)))

# The ROC curve needs a continuous score, so use the softmax probability of
# class 1 rather than the hard argmax prediction.
test_logits3 = extract_logits(model3, test_dataloader3, device)
test_prob3 = F.softmax(torch.from_numpy(test_logits3), dim=1)[:, 1].numpy()
plot_roc(test_labels, test_prob3)

plot_feature_importance(model1, train_dataloader1, device)
plot_heatmap(data.iloc[:, 1:])
plot_tsne(data.iloc[:, 1:].values, data.iloc[:, 0].values)

6. 总结

本项目使用DNN神经网络模型成功预测了患者是否患病,并绘制了模型训练过程中的损失函数曲线、准确率变化曲线、ROC曲线和AUC值、特征重要性图、热图和t-SNE图等可视化结果。这些结果可以帮助研究人员更好地理解模型的预测结果,并探索各个变量之间的关系。

注意：以上代码仅供参考，实现过程中可能需要根据具体情况进行调整。代码假设 data.xlsx 的第一列为样本索引，第二列为患病标签，其后16列为基因表达量。

基于16个基因表达量的DNN神经网络患者疾病预测模型

原文地址: https://www.cveoy.top/t/topic/mmAH 著作权归作者所有。请勿转载和采集!

免费AI点我,无需注册和登录