#!/usr/bin/env python
# coding: utf-8

# ## GenomeTools Scripts
#
# ```
# Author: Florian Wagner
# Email: florian.wagner@duke.edu
# ```
#
# This notebook demonstrates the use of the scripts
# `extract_protein_coding_genes.py` and `extract_entrez2gene.py` from the
# GenomeTools package.
#
# NOTE: this file is a notebook export. It relies on `get_ipython()`, so it
# must be run inside IPython/Jupyter; the `$name` placeholders in the shell
# commands are expanded by IPython from the Python variables defined below.

# In[1]:

# get genometools version
from pkg_resources import require

# Single-argument print(...) calls produce the same output under Python 2
# and Python 3 (the bare `print x` statement is a SyntaxError on Python 3).
print('Package versions')
print('----------------')
print(require('genometools')[0])


# ### Running `extract_protein_coding_genes.py`

# In[2]:

# Input: Ensembl gene annotation (GTF, gzip-compressed).
gene_annotation_file = 'Homo_sapiens.GRCh38.79.gtf.gz'
# Output: gzip-compressed file written by the pipelines below.
protein_coding_gene_file = 'protein_coding_genes_human.tsv.gz'


# In[3]:

# Download the annotation file from the Ensembl FTP server.
get_ipython().system('curl -o "$gene_annotation_file" "ftp://ftp.ensembl.org/pub/release-79/gtf/homo_sapiens/Homo_sapiens.GRCh38.79.gtf.gz"')


# In[4]:

# reading annotation file from stdin
# (decompress -> extract -> recompress, all through pipes)
get_ipython().system('gunzip -c "$gene_annotation_file" | extract_protein_coding_genes.py -q -a - -o - | gzip > "$protein_coding_gene_file"')
# Preview the first 10 lines of the result.
get_ipython().system('gunzip -c "$protein_coding_gene_file" | head -n 10')


# In[5]:

# alternatively: reading the annotation file directly
# (this overwrites the output file produced in the previous cell)
get_ipython().system('extract_protein_coding_genes.py -a "$gene_annotation_file" -o - | gzip > "$protein_coding_gene_file"')


# ### Running `extract_entrez2gene.py`

# In[6]:

# Input: gene2accession table (gzip-compressed); output: plain-text file.
gene2accession_file = 'gene2accession_2015-05-26_human.tsv.gz'
entrez2gene_file = 'entrez2gene_human.tsv'


# In[7]:

# -L makes curl follow the redirect issued by the download link.
get_ipython().system('curl -L -o "$gene2accession_file" "https://www.dropbox.com/s/ggjrvnigtrfue3x/gene2accession_human_2015-05-26.tsv.gz?dl=1"')


# In[8]:

get_ipython().system('extract_entrez2gene.py -f "$gene2accession_file" -o "$entrez2gene_file"')
# Preview the first 10 lines of the result.
get_ipython().system('head -n 10 "$entrez2gene_file"')