from Bio import SeqIO import numpy as np import matplotlib.pyplot as plt

读取fasta文件，计算状态转移矩阵

def calc_transition_matrix(fasta_file): sequences = list(SeqIO.parse(fasta_file, 'fasta')) alphabet = list(set(''.join([str(seq.seq) for seq in sequences]))) states = len(alphabet) matrix = np.zeros((states, states)) for seq in sequences: seq_str = str(seq.seq) for i in range(len(seq_str) - 1): from_state = alphabet.index(seq_str[i]) to_state = alphabet.index(seq_str[i+1]) matrix[from_state, to_state] += 1 return matrix

对矩阵进行z-score标准化

def standardize(matrix): standardized_matrix = (matrix - np.mean(matrix, axis=0)) / np.std(matrix, axis=0) return standardized_matrix

对标准化后的矩阵进行PCA分析

def pca(matrix): cov = np.cov(matrix.T) eig_vals, eig_vecs = np.linalg.eig(cov) idx = np.argsort(eig_vals)[::-1] eig_vecs = eig_vecs[:, idx] projection = np.dot(matrix, eig_vecs) return projection

画出PCA散点图

def plot_pca(projection, label): plt.scatter(projection[:, 0], projection[:, 1], c=label) plt.xlabel('PC1') plt.ylabel('PC2') plt.show()

测试代码

if name == 'main': # 读取fasta文件，计算状态转移矩阵 matrix1 = calc_transition_matrix('WBKG00000000.fasta') matrix2 = calc_transition_matrix('WHPN00000000.fasta') matrix3 = calc_transition_matrix('WSRO00000000.fasta') # 对矩阵进行标准化 standardized_matrix1 = standardize(matrix1) standardized_matrix2 = standardize(matrix2) standardized_matrix3 = standardize(matrix3) # 对标准化后的矩阵进行PCA分析 pca_result1 = pca(standardized_matrix1) pca_result2 = pca(standardized_matrix2) pca_result3 = pca(standardized_matrix3) # 画出PCA散点图 plot_pca(pca_result1, 0) plot_pca(pca_result2, 1) plot_pca(pca_result3, 2)

使用PCA分析法绘制fasta序列状态转移矩阵的pAC散点图

读取fasta文件，计算状态转移矩阵

对矩阵进行z-score标准化

对标准化后的矩阵进行PCA分析

画出PCA散点图

测试代码