from Bio import SeqIO import numpy as np import matplotlib.pyplot as plt import os

读取fasta文件，计算状态转移矩阵

def calc_transition_matrix(fasta_file): sequences = list(SeqIO.parse(fasta_file, 'fasta')) alphabet = list(set(''.join([str(seq.seq) for seq in sequences]))) states = len(alphabet) matrix = np.zeros((states, states)) for seq in sequences: seq_str = str(seq.seq) for i in range(len(seq_str) - 1): from_state = alphabet.index(seq_str[i]) to_state = alphabet.index(seq_str[i+1]) matrix[from_state, to_state] += 1 return matrix

对矩阵进行z-score标准化

def standardize(matrix): standardized_matrix = (matrix - np.mean(matrix, axis=0)) / np.std(matrix, axis=0) return standardized_matrix

对标准化后的矩阵进行PCA分析

def pca(matrix): cov = np.cov(matrix.T) eig_vals, eig_vecs = np.linalg.eig(cov) idx = np.argsort(eig_vals)[::-1] eig_vecs = eig_vecs[:, idx] projection = np.dot(matrix, eig_vecs) return projection

测试代码

if name == 'main': # 获取文件夹中的所有fasta文件 fasta_folder = 'fasta_files' fasta_files = [os.path.join(fasta_folder, f) for f in os.listdir(fasta_folder) if f.endswith('.fasta')]

# 对每个fasta文件进行状态转移矩阵的计算、标准化和PCA分析
for fasta_file in fasta_files:
    # 读取fasta文件，计算状态转移矩阵
    matrix = calc_transition_matrix(fasta_file)
    print(matrix)

    # 对矩阵进行标准化
    standardized_matrix = standardize(matrix)
    print(standardized_matrix)

    # 对标准化后的矩阵进行PCA分析
    pca_result = pca(standardized_matrix)

    # 绘制PCA散点图
    plt.scatter(pca_result[:, 0], pca_result[:, 1], label=fasta_file)

plt.legend()
plt.show()

使用Python分析多个fasta文件的状态转移矩阵

读取fasta文件，计算状态转移矩阵

对矩阵进行z-score标准化

对标准化后的矩阵进行PCA分析

测试代码