DNA 序列状态转移分析:使用 Python 计算状态转移矩阵、绘制 PAC 图
from Bio import SeqIO
import numpy as np
import matplotlib.pyplot as plt
import os
# 读取 FASTA 文件
def read_fasta_file(file_path):
sequences = {}
with open(file_path) as f:
sequence_name = ''
sequence = ''
for line in f:
if line.startswith('>'):
if sequence_name:
sequences[sequence_name] = sequence
sequence_name = line.strip()[1:]
sequence = ''
else:
sequence += line.strip()
if sequence_name:
sequences[sequence_name] = sequence
return sequences
# 获取状态转移矩阵
def get_transition_matrix(sequences):
# 初始化状态转移频率矩阵
transition_matrix = np.zeros((4, 4))
# 遍历所有序列
for sequence in sequences.values():
# 遍历序列中的所有碱基对
for i in range(len(sequence)-1):
# 获取当前碱基和下一个碱基的状态
current_state = get_base_state(sequence[i])
next_state = get_base_state(sequence[i+1])
# 更新状态转移频率矩阵
transition_matrix[current_state, next_state] += 1
# 将状态转移频率矩阵转换为状态转移概率矩阵
transition_matrix = transition_matrix / np.sum(transition_matrix, axis=1, keepdims=True)
return transition_matrix
def get_base_state(base):
if base == 'A':
return 0
elif base == 'C':
return 1
elif base == 'G':
return 2
elif base == 'T':
return 3
else:
return -1
# 计算相邻状态转移频率
def count_adjacent_transitions(sequences):
# 初始化相邻状态转移计数器
transition_count = {
'AA': 0,
'AC': 0,
'AG': 0,
'AT': 0,
'CA': 0,
'CC': 0,
'CG': 0,
'CT': 0,
'GA': 0,
'GC': 0,
'GG': 0,
'GT': 0,
'TA': 0,
'TC': 0,
'TG': 0,
'TT': 0,
}
# 遍历所有序列
for sequence in sequences.values():
# 遍历序列中的所有碱基对
for i in range(len(sequence)-1):
# 获取当前碱基和下一个碱基的状态
current_base = sequence[i]
next_base = sequence[i+1]
# 更新相邻状态转移计数器
transition = current_base + next_base
transition_count[transition] += 1
return transition_count
# 绘制 PAC 图
def plot_pac(matrix, fig_title):
cov = np.cov(matrix.T)
pac = np.zeros_like(cov)
for i in range(cov.shape[0]):
for j in range(cov.shape[1]):
pac[i, j] = cov[i, j] / np.sqrt(cov[i, i] * cov[j, j])
plt.imshow(pac, cmap='coolwarm')
plt.colorbar()
plt.title(fig_title)
plt.show()
# 测试代码
if __name__ == "__main__":
# 读取 FASTA 文件
fasta_file = "example.fasta"
sequences = read_fasta_file(fasta_file)
# 计算状态转移概率矩阵
transition_matrix = get_transition_matrix(sequences)
print(transition_matrix)
# 绘制 PAC 图
fig_title = os.path.splitext(os.path.basename(fasta_file))[0]
plot_pac(transition_matrix, fig_title)
# 计算相邻状态转移频率
transition_count = count_adjacent_transitions(sequences)
print(transition_count)
原文地址: https://www.cveoy.top/t/topic/lM2j 著作权归作者所有。请勿转载和采集!