基于PyTorch的DNN神经网络模型,使用16个基因表达量预测患者疾病状态
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score
# 读取Excel表格
data = pd.read_excel('gene_expression_data.xlsx')
genes = list(data.columns[1:])
# 定义第一个模型,输入为16个基因的表达量,输出为8分类
class Model1(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(Model1, self).__init__()
self.fc1 = nn.Linear(input_size, hidden_size)
self.fc2 = nn.Linear(hidden_size, output_size)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
def forward(self, x):
out = self.fc1(x)
out = self.relu(out)
out = self.fc2(out)
out = self.softmax(out)
return out
# 定义第二个模型,输入为第一个模型的8分类输出,输出为4分类
class Model2(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(Model2, self).__init__()
self.fc1 = nn.Linear(input_size, hidden_size)
self.fc2 = nn.Linear(hidden_size, output_size)
self.relu = nn.ReLU()
self.softmax = nn.Softmax(dim=1)
def forward(self, x):
out = self.fc1(x)
out = self.relu(out)
out = self.fc2(out)
out = self.softmax(out)
return out
# 定义第三个模型,输入为第二个模型的4分类输出,输出为二分类,即患者是否患病
class Model3(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(Model3, self).__init__()
self.fc1 = nn.Linear(input_size, hidden_size)
self.fc2 = nn.Linear(hidden_size, output_size)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def forward(self, x):
out = self.fc1(x)
out = self.relu(out)
out = self.fc2(out)
out = self.sigmoid(out)
return out
# 定义训练函数
def train(model, criterion, optimizer, num_epochs, X_train, y_train, X_test, y_test):
train_loss = []
test_loss = []
train_acc = []
test_acc = []
for epoch in range(num_epochs):
# 训练模型
model.train()
optimizer.zero_grad()
outputs = model(X_train)
loss = criterion(outputs, y_train)
loss.backward()
optimizer.step()
train_loss.append(loss.item())
# 计算训练集准确率
_, predicted = torch.max(outputs.data, 1)
total_train = y_train.size(0)
correct_train = (predicted == y_train).sum().item()
train_acc.append(correct_train / total_train)
# 在测试集上测试模型
model.eval()
with torch.no_grad():
outputs = model(X_test)
loss = criterion(outputs, y_test)
test_loss.append(loss.item())
# 计算测试集准确率
_, predicted = torch.max(outputs.data, 1)
total_test = y_test.size(0)
correct_test = (predicted == y_test).sum().item()
test_acc.append(correct_test / total_test)
return train_loss, test_loss, train_acc, test_acc
# 数据预处理
X = data[genes].values
y = data['state'].values
y = np.where(y == 'disease', 1, 0)
X = torch.from_numpy(X).float()
y = torch.from_numpy(y).long()
# 划分训练集和测试集
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
# 定义模型参数
input_size = 16
hidden_size = 10
output_size1 = 8
output_size2 = 4
output_size3 = 1
learning_rate = 0.01
num_epochs = 100
# 初始化模型和优化器
model1 = Model1(input_size, hidden_size, output_size1)
model2 = Model2(output_size1, hidden_size, output_size2)
model3 = Model3(output_size2, hidden_size, output_size3)
criterion = nn.CrossEntropyLoss()
optimizer1 = optim.Adam(model1.parameters(), lr=learning_rate)
optimizer2 = optim.Adam(model2.parameters(), lr=learning_rate)
optimizer3 = optim.Adam(model3.parameters(), lr=learning_rate)
# 训练第一个模型
train_loss1, test_loss1, train_acc1, test_acc1 = train(model1, criterion, optimizer1, num_epochs, X_train, y_train, X_test, y_test)
# 绘制损失函数变化曲线和准确率变化曲线
plt.plot(train_loss1, label='train')
plt.plot(test_loss1, label='test')
plt.legend()
plt.title('Model 1 Loss')
plt.show()
plt.plot(train_acc1, label='train')
plt.plot(test_acc1, label='test')
plt.legend()
plt.title('Model 1 Accuracy')
plt.show()
# 计算第一个模型在测试集上的准确率
model1.eval()
with torch.no_grad():
outputs = model1(X_test)
_, predicted = torch.max(outputs.data, 1)
total_test = y_test.size(0)
correct_test = (predicted == y_test).sum().item()
print('Model 1 Test Accuracy: {:.2f}%'.format(correct_test / total_test * 100))
# 训练第二个模型
train_loss2, test_loss2, train_acc2, test_acc2 = train(model2, criterion, optimizer2, num_epochs, outputs, y_train, outputs_test, y_test)
# 绘制损失函数变化曲线和准确率变化曲线
plt.plot(train_loss2, label='train')
plt.plot(test_loss2, label='test')
plt.legend()
plt.title('Model 2 Loss')
plt.show()
plt.plot(train_acc2, label='train')
plt.plot(test_acc2, label='test')
plt.legend()
plt.title('Model 2 Accuracy')
plt.show()
# 计算第二个模型在测试集上的准确率
model2.eval()
with torch.no_grad():
outputs = model1(X_test)
outputs = model2(outputs)
_, predicted = torch.max(outputs.data, 1)
total_test = y_test.size(0)
correct_test = (predicted == y_test).sum().item()
print('Model 2 Test Accuracy: {:.2f}%'.format(correct_test / total_test * 100))
# 训练第三个模型
train_loss3, test_loss3, train_acc3, test_acc3 = train(model3, criterion, optimizer3, num_epochs, outputs, y_train, outputs_test, y_test)
# 绘制损失函数变化曲线和准确率变化曲线
plt.plot(train_loss3, label='train')
plt.plot(test_loss3, label='test')
plt.legend()
plt.title('Model 3 Loss')
plt.show()
plt.plot(train_acc3, label='train')
plt.plot(test_acc3, label='test')
plt.legend()
plt.title('Model 3 Accuracy')
plt.show()
# 计算第三个模型在测试集上的准确率和AUC值
model3.eval()
with torch.no_grad():
outputs = model1(X_test)
outputs = model2(outputs)
outputs = model3(outputs)
_, predicted = torch.max(outputs.data, 1)
total_test = y_test.size(0)
correct_test = (predicted == y_test).sum().item()
print('Model 3 Test Accuracy: {:.2f}%'.format(correct_test / total_test * 100))
fpr, tpr, _ = roc_curve(y_test, outputs[:, 0])
auc = roc_auc_score(y_test, outputs[:, 0])
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.2f})'.format(auc))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Model 3 ROC Curve')
plt.legend()
plt.show()
# 计算每个基因对于模型预测的重要性
model1.eval()
model2.eval()
importance = []
for gene in genes:
with torch.no_grad():
x = data[gene].values
x = torch.from_numpy(x).float()
x = x.unsqueeze(0)
outputs = model1(x)
outputs = model2(outputs)
importance.append(outputs[0].detach().numpy())
importance = np.array(importance)
importance = np.mean(importance, axis=1)
plt.bar(genes, importance)
plt.xticks(rotation=90)
plt.title('Gene Importance')
plt.show()
# 绘制基因表达量和临床数据之间的相关性热图
import seaborn as sns
sns.heatmap(data.corr(), cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
# 使用t-SNE算法将高维基因表达数据转换为二维空间
from sklearn.manifold import TSNE
X_tsne = TSNE(n_components=2).fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
plt.title('t-SNE Plot')
plt.show()
原文地址: https://www.cveoy.top/t/topic/mmyI 著作权归作者所有。请勿转载和采集!