# Extract sequence features from a FASTA file and write them to a CSV file.
from collections import Counter

import pandas as pd
from Bio import SeqIO
# Read the FASTA file.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the GC-content feature: one row per FASTA record.
# Rows are collected in a list and the data frame is built once at the end,
# because DataFrame.append was removed in pandas 2.0 (and it copied the whole
# frame on every call).
rows = []
for seq_record in sequences:
    # Upper-case once so that lower-case (soft-masked) bases are counted too.
    bases = str(seq_record.seq).upper()
    gc_total = bases.count('G') + bases.count('C')
    # A zero-length record has no defined GC fraction; it is reported as 0.0
    # instead of raising ZeroDivisionError.
    gc_content = gc_total / len(bases) if bases else 0.0
    rows.append({'id': seq_record.id, 'gc_content': gc_content})

# Naming the columns explicitly keeps the 'id' column present even when the
# FASTA file contains no records, so later lookups on df['id'] still work.
df = pd.DataFrame(rows, columns=['id', 'gc_content'])
# Re-read the FASTA file; the parser used by the previous loop is exhausted.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the dinucleotide-frequency feature.
for seq_record in sequences:
    # Work on a plain string: slicing a Seq yields Seq objects, which would
    # show up as "Seq('AT')" keys in the stringified dictionary and make every
    # window slice far more expensive than a str slice.
    bases = str(seq_record.seq)
    # Count every overlapping window of two consecutive bases. Keys appear in
    # order of first occurrence, exactly as in a hand-filled dict.
    dinucleotide_freq = Counter(bases[i:i + 2] for i in range(len(bases) - 1))
    # The counts are stored as the text form of a plain dict, one cell per record.
    df.loc[df['id'] == seq_record.id, 'dinucleotide_freq'] = str(dict(dinucleotide_freq))
# Re-read the FASTA file; the parser used by the previous loop is exhausted.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the reverse-complement feature.
for record in sequences:
    # Select the row(s) whose id matches this record, then store the
    # reverse-complemented sequence there as plain text.
    matches_record = df['id'] == record.id
    revcomp_text = str(record.seq.reverse_complement())
    df.loc[matches_record, 'reverse_complement'] = revcomp_text
# Re-read the FASTA file; the parser used by the previous loop is exhausted.
sequences = SeqIO.parse('CP015849.fasta', 'fasta')

# Extract the amino-acid sequence feature: each record is translated from its
# first base onward, with translate() called without arguments.
for seq_record in sequences:
    nucleotides = seq_record.seq
    # Drop a trailing partial codon explicitly. translate() warns when the
    # length is not a multiple of three and Biopython states this may become
    # an error; trimming first yields the same protein without the warning.
    usable_length = len(nucleotides) - len(nucleotides) % 3
    protein_seq = nucleotides[:usable_length].translate()
    df.loc[df['id'] == seq_record.id, 'protein_seq'] = str(protein_seq)
# Write the assembled feature table to a CSV file, leaving out the row index.
df.to_csv(path_or_buf='features.csv', index=False)
# Original source: https://www.cveoy.top/t/topic/lLMz — copyright belongs to the author; do not repost or scrape.