#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().system('rm -rf gambiae.g?f.gz ag.db 2>/dev/null') #!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gtfgz -O gambiae.gtf.gz get_ipython().system('wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz') # In[1]: import gffutils import sqlite3 # In[3]: get_ipython().system('rm -f ag.db') # In[2]: try: db = gffutils.create_db('gambiae.gff.gz', 'ag.db') except sqlite3.OperationalError: db = gffutils.FeatureDB('ag.db') # In[3]: print(list(db.featuretypes())) for feat_type in db.featuretypes(): print(feat_type, db.count_features_of_type(feat_type)) # In[4]: for contig in db.features_of_type('contig'): print(contig) # In[5]: from collections import defaultdict num_mRNAs = defaultdict(int) num_exons = defaultdict(int) max_exons = 0 max_span = 0 for contig in db.features_of_type('contig'): cnt = 0 for gene in db.region((contig.seqid, contig.start, contig.end), featuretype='gene'): cnt += 1 span = abs(gene.start - gene.end) # strand if span > max_span: max_span = span max_span_gene = gene my_mRNAs = list(db.children(gene, featuretype='mRNA')) num_mRNAs[len(my_mRNAs)] += 1 if len(my_mRNAs) == 0: exon_check = [gene] else: exon_check = my_mRNAs for check in exon_check: num_exons[len(my_exons)] += 1 if len(my_exons) > max_exons: max_exons = len(my_exons) max_exons_gene = gene print('contig %s, number of genes %d' % (contig.seqid, cnt)) print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons)) print('Max span: %s (%d)' % (max_span_gene.id, max_span)) print(num_mRNAs) print(num_exons) # In[ ]: