!rm -rf gambiae.g?f.gz ag.db 2>/dev/null
#!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gtfgz -O gambiae.gtf.gz
!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz
--2015-02-18 14:00:45-- http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz Resolving www.vectorbase.org (www.vectorbase.org)... 129.74.255.228 Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:80... connected. HTTP request sent, awaiting response... 302 Found Location: https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz [following] --2015-02-18 14:00:45-- https://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz [following] --2015-02-18 14:00:47-- https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.gff3.gz Reusing existing connection to www.vectorbase.org:443. HTTP request sent, awaiting response... 200 OK Length: 2702601 (2.6M) [application/x-gzip] Saving to: ‘gambiae.gff.gz’ 100%[======================================>] 2,702,601 1.46MB/s in 1.8s 2015-02-18 14:00:49 (1.46 MB/s) - ‘gambiae.gff.gz’ saved [2702601/2702601]
import gffutils
import sqlite3
!rm -f ag.db
try:
db = gffutils.create_db('gambiae.gff.gz', 'ag.db')
except sqlite3.OperationalError:
db = gffutils.FeatureDB('ag.db')
print(list(db.featuretypes()))
for feat_type in db.featuretypes():
print(feat_type, db.count_features_of_type(feat_type))
['CDS', 'RNase_P_RNA', 'SRP_RNA', 'contig', 'exon', 'five_prime_UTR', 'gene', 'mRNA', 'miRNA', 'misc_RNA', 'pseudogene', 'rRNA', 'snRNA', 'snoRNA', 'tRNA', 'tRNA_pseudogene', 'three_prime_UTR'] ('CDS', 62408) ('RNase_P_RNA', 1) ('SRP_RNA', 3) ('contig', 8) ('exon', 66485) ('five_prime_UTR', 10520) ('gene', 13624) ('mRNA', 14697) ('miRNA', 187) ('misc_RNA', 10) ('pseudogene', 5) ('rRNA', 53) ('snRNA', 38) ('snoRNA', 12) ('tRNA', 463) ('tRNA_pseudogene', 9) ('three_prime_UTR', 7281)
for contig in db.features_of_type('contig'):
print(contig)
2L VectorBase contig 1 49364325 . . . translation_table=1;topology=linear;ID=2L;molecule_type=dsDNA;localization=chromosomal 3R VectorBase contig 1 53200684 . . . translation_table=1;topology=linear;ID=3R;molecule_type=dsDNA;localization=chromosomal UNKN VectorBase contig 1 42389979 . . . translation_table=1;topology=linear;ID=UNKN;molecule_type=dsDNA;localization=chromosomal X VectorBase contig 1 24393108 . . . translation_table=1;topology=linear;ID=X;molecule_type=dsDNA;localization=chromosomal Y_unplaced VectorBase contig 1 237045 . . . translation_table=1;topology=linear;ID=Y_unplaced;molecule_type=dsDNA;localization=chromosomal Mt VectorBase contig 1 15363 . . . translation_table=1;topology=linear;ID=Mt;molecule_type=dsDNA;localization=chromosomal 2R VectorBase contig 1 61545105 . . . translation_table=1;topology=linear;ID=2R;molecule_type=dsDNA;localization=chromosomal 3L VectorBase contig 1 41963435 . . . translation_table=1;topology=linear;ID=3L;molecule_type=dsDNA;localization=chromosomal
from collections import defaultdict
num_mRNAs = defaultdict(int)
num_exons = defaultdict(int)
max_exons = 0
max_span = 0
for contig in db.features_of_type('contig'):
cnt = 0
for gene in db.region((contig.seqid, contig.start, contig.end), featuretype='gene'):
cnt += 1
span = abs(gene.start - gene.end) # strand
if span > max_span:
max_span = span
max_span_gene = gene
my_mRNAs = list(db.children(gene, featuretype='mRNA'))
num_mRNAs[len(my_mRNAs)] += 1
if len(my_mRNAs) == 0:
exon_check = [gene]
else:
exon_check = my_mRNAs
for check in exon_check:
num_exons[len(my_exons)] += 1
if len(my_exons) > max_exons:
max_exons = len(my_exons)
max_exons_gene = gene
print('contig %s, number of genes %d' % (contig.seqid, cnt))
print('Max number of exons: %s (%d)' % (max_exons_gene.id, max_exons))
print('Max span: %s (%d)' % (max_span_gene.id, max_span))
print(num_mRNAs)
print(num_exons)
contig 2L, number of genes 3105 contig 3R, number of genes 2763 contig UNKN, number of genes 567 contig X, number of genes 1097 contig Y_unplaced, number of genes 0 contig Mt, number of genes 37 contig 2R, number of genes 3834 contig 3L, number of genes 2221 Max number of exons: AGAP001660 (67) Max span: AGAP006656 (365621) defaultdict(<type 'int'>, {0: 781, 1: 11595, 2: 910, 3: 211, 4: 74, 5: 27, 6: 9, 7: 5, 8: 4, 9: 1, 10: 1, 11: 3, 12: 1, 13: 1, 20: 1}) defaultdict(<type 'int'>, {1: 2019, 2: 3359, 3: 2838, 4: 2091, 5: 1411, 6: 1039, 7: 718, 8: 454, 9: 419, 10: 298, 11: 202, 12: 159, 13: 106, 14: 65, 15: 65, 16: 45, 17: 53, 18: 22, 19: 28, 20: 19, 21: 9, 22: 7, 23: 6, 24: 6, 25: 9, 26: 3, 27: 2, 28: 5, 29: 4, 30: 5, 31: 5, 32: 1, 33: 1, 34: 1, 42: 1, 49: 1, 50: 1, 67: 1})