import gffutils
import gzip
from Bio import Alphabet, Seq, SeqIO
!rm -rf gambiae.gff.gz ag.db 2>/dev/null
!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz
!wget http://www.vectorbase.org/download/anopheles-gambiae-pestbasefeaturesagamp42gff3gz -O gambiae.gff.gz
--2015-06-26 14:46:59-- ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz => ‘gambiae.fa.gz’ Resolving ftp.vectorbase.org (ftp.vectorbase.org)... 129.74.255.228 Connecting to ftp.vectorbase.org (ftp.vectorbase.org)|129.74.255.228|:21... connected. Logging in as anonymous ... Logged in! ==> SYST ... done. ==> PWD ... done. ==> TYPE I ... done. ==> CWD (1) /public_data/organism_data/agambiae/Genome ... done. ==> SIZE agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... 81591806 ==> PASV ... done. ==> RETR agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... done. Length: 81591806 (78M) (unauthoritative) HROMOSOMES-PEST.Aga 67%[=============> ] 52.57M 3.08MB/s eta 12s
!rm -f ag.db
db = gffutils.create_db('gambiae.gff.gz', 'ag.db')
gene_id = 'AGAP004707'
gene = db[gene_id]
print(gene)
print(gene.seqid, gene.strand)
2L VectorBase gene 2358158 2431617 . + . ID=AGAP004707;biotype=protein_coding ('2L', '+')
recs = SeqIO.parse(gzip.open('gambiae.fa.gz'), 'fasta')
for rec in recs:
print(rec.description)
if rec.description.split(':')[2] == gene.seqid:
my_seq = rec.seq
break
print(my_seq.alphabet)
chromosome:AgamP3:2L:1:49364325:1 chromosome 2L SingleLetterAlphabet()
def get_sequence(chrom_seq, CDSs, strand):
seq = Seq.Seq('', alphabet=Alphabet.IUPAC.unambiguous_dna)
for CDS in CDSs:
#FRAME???
my_cds = Seq.Seq(str(my_seq[CDS.start - 1: CDS.end]), alphabet=Alphabet.IUPAC.unambiguous_dna)
seq += my_cds
return seq if strand == '+' else seq.reverse_complement()
mRNAs = db.children(gene, featuretype='mRNA')
for mRNA in mRNAs:
print(mRNA.id)
if mRNA.id.endswith('RA'):
break
CDSs = db.children(mRNA, featuretype='CDS', order_by='start')
gene_seq = get_sequence(my_seq, CDSs, gene.strand)
print(len(gene_seq), gene_seq)
prot = gene_seq.translate()
print(len(prot), prot)
AGAP004707-RA (6357, Seq('ATGACCGAAGACTCCGATTCGATATCTGAGGAAGAACGTAGTTTGTTCCGTCCT...TGA', IUPACUnambiguousDNA())) (2119, Seq('MTEDSDSISEEERSLFRPFTRESLQAIEARIADEEAKQRELERKRAEGEDEDEG...DV*', HasStopCodon(IUPACProtein(), '*')))
reverse_transcript_id = 'AGAP004708-RA'
reverse_CDSs = db.children(reverse_transcript_id, featuretype='CDS', order_by='start')
reverse_seq = get_sequence(my_seq, reverse_CDSs, '-')
print(len(reverse_seq), reverse_seq)
reverse_prot = reverse_seq.translate()
print(len(reverse_prot), reverse_prot)
(1992, Seq('ATGGCTGACTTCGATAGTGCCACTAAATGTATCAGAAACATTGAAAAAGAAATT...TAA', IUPACUnambiguousDNA())) (664, Seq('MADFDSATKCIRNIEKEILLLQSEVLKTREGLGLEDDNVELKKLMEENTRLKHR...KI*', HasStopCodon(IUPACProtein(), '*')))