Python 计算 DNA 序列状态转移频率矩阵和相邻状态转移计数
import numpy as np
def read_fasta_file(file_path):
sequences = {}
with open(file_path) as f:
sequence_name = ''
sequence = ''
for line in f:
if line.startswith('>'):
if sequence_name:
sequences[sequence_name] = sequence
sequence_name = line.strip()[1:]
sequence = ''
else:
sequence += line.strip()
if sequence_name:
sequences[sequence_name] = sequence
return sequences
def get_transition_matrix(sequences):
# 初始化状态转移频率矩阵
transition_matrix = np.zeros((4, 4))
# 遍历所有序列
for sequence in sequences.values():
# 遍历序列中的所有碱基对
for i in range(len(sequence)-1):
# 获取当前碱基和下一个碱基的状态
current_state = get_base_state(sequence[i])
next_state = get_base_state(sequence[i+1])
# 更新状态转移频率矩阵
transition_matrix[current_state, next_state] += 1
# 将状态转移频率矩阵转换为状态转移概率矩阵
transition_matrix = transition_matrix / np.sum(transition_matrix, axis=1, keepdims=True)
return transition_matrix
def get_base_state(base):
if base == 'A':
return 0
elif base == 'C':
return 1
elif base == 'G':
return 2
elif base == 'T':
return 3
else:
return -1
def count_adjacent_transitions(sequences):
# 初始化相邻状态转移计数器
transition_count = {
'AA': 0,
'AC': 0,
'AG': 0,
'AT': 0,
'CA': 0,
'CC': 0,
'CG': 0,
'CT': 0,
'GA': 0,
'GC': 0,
'GG': 0,
'GT': 0,
'TA': 0,
'TC': 0,
'TG': 0,
'TT': 0,
}
# 遍历所有序列
for sequence in sequences.values():
# 遍历序列中的所有碱基对
for i in range(len(sequence)-1):
# 获取当前碱基和下一个碱基的状态
current_state = get_base_state(sequence[i])
next_state = get_base_state(sequence[i+1])
# 判断当前和下一个碱基是否都合法
if current_state != -1 and next_state != -1:
# 统计相邻状态转移
transition_key = '{}{}'.format(get_state_base(current_state), get_state_base(next_state))
transition_count[transition_key] += 1
return transition_count
def get_state_base(state):
if state == 0:
return 'A'
elif state == 1:
return 'C'
elif state == 2:
return 'G'
elif state == 3:
return 'T'
else:
raise ValueError('Invalid state: {}'.format(state))
# 读取FASTA文件
sequences = read_fasta_file('CP085300.fasta')
# 计算状态转移频率矩阵
transition_matrix = get_transition_matrix(sequences)
print('Transition Matrix:')
print(transition_matrix)
# 统计两个相邻状态转移的次数
transition_count = count_adjacent_transitions(sequences)
print('Adjacent Transition Count:')
print(transition_count)
# 将结果输出到文件
with open('CP085300.txt', 'w') as f:
# 输出状态转移矩阵
f.write('Transition Matrix:
')
f.write(np.array2string(transition_matrix, separator=', ') + '
')
# 输出相邻状态转移计数
f.write('Adjacent Transition Count:
')
for key, value in transition_count.items():
f.write('{}: {}
'.format(key, value))
原文地址: https://www.cveoy.top/t/topic/lLNM 著作权归作者所有。请勿转载和采集!