from Bio import Entrez from Bio import SeqIO def fetch_gb_by_id(rec_id): handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=rec_id) gb_record = SeqIO.read(handle, "gb") handle.close() return gb_record mito = fetch_gb_by_id('NC_012920') assert mito.description == 'Homo sapiens mitochondrion, complete genome.' mito_features = mito.features print(type(mito_features)) print(type(mito_features[0])) print('************') print(mito_features[0]) print('************') print(mito_features[1]) print('************') print(mito_features[2]) gene = mito.features[2] print(gene.type) print(gene.location) print(gene.qualifiers) print(type(gene.qualifiers)) def extract_genes(gb_record): genes = [] for feat in gb_record.features: # if feature is gene if feat.type == 'gene': gene_dict = {} try: # get start and end positions location = feat.location start = location.start end = location.end gene_name = feat.qualifiers['gene'][0] gene_dict['start'] = start gene_dict['end'] = end gene_dict['name'] = gene_name except: print('Failed to obtain stats for feature') print(feat.qualifiers.keys()) if gene_dict != {}: genes.append(gene_dict) return genes mito_genes = extract_genes(mito) assert len(mito_genes) == 37 assert type(mito_genes[0]) == dict def genes_records(gb_record,gene_list): gene_records = [] for gene in gene_list: start = gene['start'] end = gene['end'] name = gene['name'] rec = gb_record[start:end] rec.description = name rec.id = str(len(gene_records)+1) gene_records.append(rec) return gene_records mito_gene_records = genes_records(mito,mito_genes) assert len(mito_gene_records) == 37 out_file = "mito_genes.fasta" # print to output file SeqIO.write(mito_gene_records,out_file,'fasta')