from collections import Counter

import pandas as pd
from Bio import SeqIO

# Read the FASTA file.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the GC-content feature: the fraction of bases in each record that
# are G or C.
#
# Rows are collected in a list and the data frame is built once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
rows = []
for seq_record in sequences:
    seq_length = len(seq_record.seq)
    # NOTE: count() is case-sensitive, so only upper-case G/C are counted.
    gc_count = seq_record.seq.count('G') + seq_record.seq.count('C')
    # An empty record has no defined GC fraction; report 0.0 instead of
    # raising ZeroDivisionError.
    gc_content = gc_count / seq_length if seq_length else 0.0
    rows.append({'id': seq_record.id, 'gc_content': gc_content})

# Naming the columns keeps 'id' and 'gc_content' present even when the file
# holds no records.
df = pd.DataFrame(rows, columns=['id', 'gc_content'])

# Re-read the FASTA file: SeqIO.parse returns a one-shot iterator, so a fresh
# one is needed for every pass over the records.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the dinucleotide-frequency feature: the number of occurrences of
# every overlapping two-base window in each record.
for seq_record in sequences:
    # Work on a plain str. Slicing a Seq yields Seq objects, which would end
    # up as "Seq('AT')" keys in the stored text and is slow on long records.
    bases = str(seq_record.seq)
    dinucleotide_freq = Counter(
        bases[i:i + 2] for i in range(len(bases) - 1)
    )
    # Stored as the text form of a plain dict, e.g. "{'AT': 3, 'TG': 1}",
    # with keys in order of first appearance.
    df.loc[df['id'] == seq_record.id, 'dinucleotide_freq'] = str(
        dict(dinucleotide_freq)
    )

# Re-read the FASTA file: SeqIO.parse returns a one-shot iterator, so a fresh
# one is needed for every pass over the records.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the reverse-complement feature: the full reverse-complement strand
# of each record, stored as text in the row whose 'id' matches the record.
for seq_record in sequences:
    reverse_complement = seq_record.seq.reverse_complement()
    df.loc[df['id'] == seq_record.id, 'reverse_complement'] = str(
        reverse_complement
    )

# Re-read the FASTA file: SeqIO.parse returns a one-shot iterator, so a fresh
# one is needed for every pass over the records.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the amino-acid feature: each record translated from its first base
# with translate()'s default settings, stored as text in the matching row.
for seq_record in sequences:
    # Drop the trailing one or two bases that do not form a complete codon.
    # Biopython warns on partial codons and documents that this may become
    # an error, so the sequence is trimmed explicitly before translating.
    usable_length = len(seq_record.seq) - len(seq_record.seq) % 3
    protein_seq = seq_record.seq[:usable_length].translate()
    df.loc[df['id'] == seq_record.id, 'protein_seq'] = str(protein_seq)

# Write the collected features to 'features.csv'; index=False leaves the
# data frame's row index out of the file.
df.to_csv('features.csv', index=False)

# 从FASTA文件中提取序列特征并输出到CSV文件
#
# 原文地址: https://www.cveoy.top/t/topic/lLMz 著作权归作者所有。请勿转载和采集!
#
# 免费AI点我,无需注册和登录