from Bio import SeqIO import numpy as np import matplotlib.pyplot as plt

读取fasta文件，计算状态转移矩阵

def calc_transition_matrix(fasta_file): sequences = list(SeqIO.parse(fasta_file, 'fasta')) alphabet = list(set(''.join([str(seq.seq) for seq in sequences]))) # 使用单引号 states = len(alphabet) matrix = np.zeros((states, states)) for seq in sequences: seq_str = str(seq.seq) for i in range(len(seq_str) - 1): from_state = alphabet.index(seq_str[i]) to_state = alphabet.index(seq_str[i+1]) matrix[from_state, to_state] += 1 return matrix

对矩阵进行z-score标准化

def standardize(matrix): standardized_matrix = (matrix - np.mean(matrix, axis=0)) / np.std(matrix, axis=0) return standardized_matrix

对标准化后的矩阵进行PCA分析

def pca(matrix): cov = np.cov(matrix.T) eig_vals, eig_vecs = np.linalg.eig(cov) idx = np.argsort(eig_vals)[::-1] eig_vecs = eig_vecs[:, idx] projection = np.dot(matrix, eig_vecs) return projection

测试代码

if name == 'main': # 读取fasta文件，计算状态转移矩阵 matrix1 = calc_transition_matrix('WBKG00000000.fasta') # 使用单引号 matrix2 = calc_transition_matrix('WHPN00000000.fasta') # 使用单引号 matrix3 = calc_transition_matrix('WSRO00000000.fasta') # 使用单引号 # 对矩阵进行标准化 standardized_matrix1 = standardize(matrix1) standardized_matrix2 = standardize(matrix2) standardized_matrix3 = standardize(matrix3) # 对标准化后的矩阵进行PCA分析 pca_result1 = pca(standardized_matrix1) pca_result2 = pca(standardized_matrix2) pca_result3 = pca(standardized_matrix3) # 绘制PCA散点图 plt.scatter(pca_result1[:, 0], pca_result1[:, 1], label='WBKG00000000') # 使用单引号 plt.scatter(pca_result2[:, 0], pca_result2[:, 1], label='WHPN00000000') # 使用单引号 plt.scatter(pca_result3[:, 0], pca_result3[:, 1], label='WSRO00000000') # 使用单引号 plt.legend() plt.show()

使用Python进行DNA序列状态转移矩阵的PCA分析与可视化

读取fasta文件，计算状态转移矩阵

对矩阵进行z-score标准化

对标准化后的矩阵进行PCA分析

测试代码