DNA 序列状态转移分析：使用 Python 计算状态转移矩阵、绘制 PAC 图

from Bio import SeqIO
import numpy as np
import matplotlib.pyplot as plt
import os

# 读取 FASTA 文件
def read_fasta_file(file_path):
    sequences = {}
    with open(file_path) as f:
        sequence_name = ''
        sequence = ''
        for line in f:
            if line.startswith('>'):
                if sequence_name:
                    sequences[sequence_name] = sequence
                sequence_name = line.strip()[1:]
                sequence = ''
            else:
                sequence += line.strip()
        if sequence_name:
            sequences[sequence_name] = sequence
    return sequences

# 获取状态转移矩阵
def get_transition_matrix(sequences):
    # 初始化状态转移频率矩阵
    transition_matrix = np.zeros((4, 4))
    # 遍历所有序列
    for sequence in sequences.values():
        # 遍历序列中的所有碱基对
        for i in range(len(sequence)-1):
            # 获取当前碱基和下一个碱基的状态
            current_state = get_base_state(sequence[i])
            next_state = get_base_state(sequence[i+1])
            # 更新状态转移频率矩阵
            transition_matrix[current_state, next_state] += 1
    # 将状态转移频率矩阵转换为状态转移概率矩阵
    transition_matrix = transition_matrix / np.sum(transition_matrix, axis=1, keepdims=True)
    return transition_matrix

def get_base_state(base):
    if base == 'A':
        return 0
    elif base == 'C':
        return 1
    elif base == 'G':
        return 2
    elif base == 'T':
        return 3
    else:
        return -1

# 计算相邻状态转移频率
def count_adjacent_transitions(sequences):
    # 初始化相邻状态转移计数器
    transition_count = {
        'AA': 0,
        'AC': 0,
        'AG': 0,
        'AT': 0,
        'CA': 0,
        'CC': 0,
        'CG': 0,
        'CT': 0,
        'GA': 0,
        'GC': 0,
        'GG': 0,
        'GT': 0,
        'TA': 0,
        'TC': 0,
        'TG': 0,
        'TT': 0,
    }
    # 遍历所有序列
    for sequence in sequences.values():
        # 遍历序列中的所有碱基对
        for i in range(len(sequence)-1):
            # 获取当前碱基和下一个碱基的状态
            current_base = sequence[i]
            next_base = sequence[i+1]
            # 更新相邻状态转移计数器
            transition = current_base + next_base
            transition_count[transition] += 1
    return transition_count

# 绘制 PAC 图
def plot_pac(matrix, fig_title):
    cov = np.cov(matrix.T)
    pac = np.zeros_like(cov)
    for i in range(cov.shape[0]):
        for j in range(cov.shape[1]):
            pac[i, j] = cov[i, j] / np.sqrt(cov[i, i] * cov[j, j])
    plt.imshow(pac, cmap='coolwarm')
    plt.colorbar()
    plt.title(fig_title)
    plt.show()

# 测试代码
if __name__ == "__main__":
    # 读取 FASTA 文件
    fasta_file = "example.fasta"
    sequences = read_fasta_file(fasta_file)

    # 计算状态转移概率矩阵
    transition_matrix = get_transition_matrix(sequences)
    print(transition_matrix)

    # 绘制 PAC 图
    fig_title = os.path.splitext(os.path.basename(fasta_file))[0]
    plot_pac(transition_matrix, fig_title)

    # 计算相邻状态转移频率
    transition_count = count_adjacent_transitions(sequences)
    print(transition_count)