# Python Biopython 提取 FASTA 文件特征
from collections import Counter

import pandas as pd
from Bio import SeqIO
# Input FASTA file and the column order of the resulting feature table.
FASTA_PATH = 'CP015849.fasta'
FEATURE_COLUMNS = [
    'id',
    'gc_content',
    'dinucleotide_freq',
    'reverse_complement',
    'protein_seq',
]


def gc_content(sequence):
    """Return the fraction of G and C bases in *sequence*.

    The sequence is upper-cased first so lowercase (soft-masked) bases are
    counted too. An empty sequence yields 0.0 instead of raising
    ZeroDivisionError.
    """
    bases = str(sequence).upper()
    if not bases:
        return 0.0
    return (bases.count('G') + bases.count('C')) / len(bases)


def dinucleotide_counts(sequence):
    """Return a dict mapping each overlapping dinucleotide to its count.

    Keys are plain upper-case two-character strings. Sequences shorter than
    two bases produce an empty dict.
    """
    bases = str(sequence).upper()
    # zip(bases, bases[1:]) walks every overlapping pair of adjacent bases.
    return dict(Counter(first + second for first, second in zip(bases, bases[1:])))


def extract_record_features(seq_record):
    """Build one feature row (a dict keyed by FEATURE_COLUMNS) for a record.

    *seq_record* is expected to expose ``.id`` and a Biopython ``Seq`` in
    ``.seq``. Sequence-valued features are stored as plain strings.
    """
    seq = seq_record.seq
    bases = str(seq)
    # Drop a trailing partial codon before translating; Biopython would
    # otherwise emit a partial-codon warning for lengths not divisible by 3.
    translatable_length = len(seq) - len(seq) % 3
    return {
        'id': seq_record.id,
        'gc_content': gc_content(bases),
        'dinucleotide_freq': dinucleotide_counts(bases),
        'reverse_complement': str(seq.reverse_complement()),
        'protein_seq': str(seq[:translatable_length].translate()),
    }


def extract_features(fasta_path):
    """Parse *fasta_path* once and return a DataFrame with one row per record.

    All features of a record share a single row. The frame is built from a
    list of dicts in one step because ``DataFrame.append`` no longer exists
    in pandas 2.x. An input without records gives an empty frame that still
    has the FEATURE_COLUMNS columns.
    """
    rows = [
        extract_record_features(seq_record)
        for seq_record in SeqIO.parse(fasta_path, 'fasta')
    ]
    return pd.DataFrame(rows, columns=FEATURE_COLUMNS)


if __name__ == '__main__':
    # Extract every feature in a single pass and print the resulting table.
    df = extract_features(FASTA_PATH)
    print(df)
# 原文地址: https://www.cveoy.top/t/topic/lLMo 著作权归作者所有。请勿转载和采集!