#variables to set
fa="../data/fa/Hkamtschatkana_v1.fasta"
db="/Users/sr320/data-genomic/blast/db/uniprot_sprot_r2015_01"
sqls="/Users/sr320/sqlshare-pythonclient/tools/"
usr="sr320@washington.edu"
!head -2 {fa}
>Haliotis_kam_contig1 TAGAATAGGTCCATCGCATAGTATTTGGATTAATTTAATTACAGGAAATCTGTAGATAGGGAGATTGTGGCTAGGACTAAGAAAGCTCCGGCAAAATTAATAATGAAAATTGTTGGGGTTAATAGTGTGTTTACGGATAGTGGGGGATTGATGGGATTAGGTTGATTTAGGAAGTTAGTAGAGATAAGTGTTATGCTAGAAGAGAATGTTATTCTAAATAGTAGGGTTAAGTAATAAAATAAGCTTATTAAGCTTCCTAGAAGTAATGGGAAAATTAGTGCTGGGGAGGAGATGGCGCAAGAGAAAGAGATAGCTGTTCATTTTGCGGCAAACCCGAGGAGAGGGGGTATTCCGCCGAGAGATAGAAGAATAAAAATTAAAGTTAGTTGGTTTATTTTAGGATTTTCGCTTTTTAATGCTCCAATATGACGAAAGGATTTTGTATCAGAAATTCAAAGGTTTATAAAGAGACAGATTGAGATTAGGGCATAGATAAGAAAGTAGATTTTTAACGCCCTTTCTCTTGTTAGTGCACAAAACGCTATCCAGCCTACGTGGCCAATTGATGAGTAAGCGAGGAGTGTTCGAATTTGGGTTTGGTTGATTCCACCTAGTCCGCCGATAAGGCTCGATATTCCTGCTATAATGAATAGCGTAGAGATGATGGATGGGTAATTTCATCTTTGTGTTATGGCAGTTAGCAGGAATACTGGAGCTAATTTCTGTCAGGTGGTGAGAATGAGGCAGTTTAATCAGGAAATGTTTATTATTACTTCTGGGAGCCAAAAATGGAATGGGAATGCACCAAGTTTTAGGAATAGGCTAAGGATAATTAGGGTTAGTCCGAGGACGCTTGTGTTTTGTTGATAAACTTCCCATGAGAGAGACATGTTAAAGGAGAGAAGGCTACCAAATATTAGTATTCTTGACCCTAGTGCTTGTATGATAAAATATTTGATCCTAGATTCGGTTTCTATCGTTAGTCCCCGGTAGATTAGTAGGGGGATAAATCCTATTATATTAATTTCTAGTCCTATCCAGATTCTCAGTCAGTGGATGGATGACAGGGAGAGAAGGGAGCCAAGGAAGGTAATGAAAGAAAATAAGAATCCGAACGGCATTATTGATAGCATTTT
#needs attention for blastoutput
sp= !head -1 {fa} | rev | cut -c 9- | rev | tail -c +2 | tr '' 'j'
sp2='\n'.join(sp)
sp2
'Haliotis_kam'
!fgrep -c ">" {fa}
8641
!blastx \
-query {fa} \
-db {db} \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
-evalue 1E-05 \
-num_threads 6 \
-out ../analyses/{sp}_sprot.tab
!ls ../analyses/
[Haliotis_kam]GOSlim.png [Haliotis_kam]_sprot.tab [Haliotis_kam]_sprot_GOdescriptions-stress.tab [Haliotis_kam]_sprot_GOdescriptions.tab [Haliotis_kam]_sprot_sql.tab name
!wc -l ../analyses/{sp}_sprot.tab
1262 ../analyses/[Haliotis_kam]_sprot.tab
!tr '|' "\t" <../analyses/{sp}_sprot.tab> ../analyses/{sp}_sprot_sql.tab
!head -1 ../analyses/{sp}_sprot.tab
!echo SQLShare ready version has Pipes converted to Tabs ....
!head -1 ../analyses/{sp}_sprot_sql.tab
Haliotis_kam_contig1 sp|O79428|NU2M_RABIT 34.55 220 137 3 1055 399 25 238 3e-21 97.1 SQLShare ready version has Pipes converted to Tabs .... Haliotis_kam_contig1 sp O79428 NU2M_RABIT 34.55 220 137 3 1055 399 25 238 3e-21 97.1
!python {sqls}singleupload.py \
-d _blast_sprot \
../analyses/{sp}_sprot_sql.tab
processing chunk line 0 to 1262 (0.000244140625 s elapsed) pushing ../analyses/[Haliotis_kam]_sprot_sql.tab... parsing B7408C56... finished _blast_sprot
!python {sqls}fetchdata.py \
-s "SELECT Column1 as ContigID, term, GOSlim_bin, aspect, ProteinName \
FROM [{usr}].[_blast_sprot]md \
left join \
[samwhite@washington.edu].[UniprotProtNamesReviewed_yes20130610]sp \
on md.Column3=sp.SPID \
left join \
[sr320@washington.edu].[SPID and GO Numbers]go \
on md.Column3=go.SPID \
left join \
[sr320@washington.edu].[GO_to_GOslim]slim on go.GOID=slim.GO_id \
where aspect like 'P'" \
-f tsv \
-o ../analyses/{sp2}_sprot_GOdescriptions.tab
!head ../analyses/{sp2}_sprot_GOdescriptions.tab
!grep -c "stress response" ../analyses/{sp2}_sprot_GOdescriptions.tab
175
!grep "stress response" ../analyses/{sp2}_sprot_GOdescriptions.tab \
> ../analyses/{sp2}_sprot_GOdescriptions-stress.tab
!cut -f2 ../analyses/{sp2}_sprot_GOdescriptions-stress.tab | sort | uniq -c
4 "DNA damage response, signal transduction by p53 class mediator resulting in cell cycle arrest" 1 "DNA damage response, signal transduction by p53 class mediator resulting in induction of apoptosis" 1 "DNA damage response, signal transduction by p53 class mediator" 1 "base-excision repair, DNA ligation" 1 "base-excision repair, gap-filling" 1 "defense response to fungus, incompatible interaction" 1 "negative regulation of DNA damage response, signal transduction by p53 class mediator" 1 "positive regulation of DNA damage response, signal transduction by p53 class mediator" 1 "regulation of complement activation, lectin pathway" 1 DNA damage checkpoint 1 DNA double-strand break processing 10 DNA repair 1 JNK cascade 1 active induction of host immune response by virus 1 acute inflammatory response 1 age-dependent response to oxidative stress 1 age-dependent response to reactive oxygen species during chronological cell aging 1 autophagic vacuole formation 1 blood coagulation 2 bypass DNA synthesis 1 cellular response to amino acid starvation 2 cellular response to heat 2 cellular response to oxidative stress 1 cellular response to reactive oxygen species 1 cellular response to starvation 1 cellular response to stress 2 defense response 2 defense response to Gram-negative bacterium 5 defense response to bacterium 1 defense response to virus 2 double-strand break repair 1 double-strand break repair via homologous recombination 1 endoplasmic reticulum unfolded protein response 1 heat acclimation 5 hydrogen peroxide catabolic process 1 hyperosmotic response 2 inflammatory response 2 innate immune response 5 macroautophagy 1 melanization defense response 1 mismatch repair 2 mitotic cell cycle G2/M transition DNA damage checkpoint 1 natural killer cell mediated cytotoxicity 2 negative regulation of inflammatory response to antigenic stimulus 1 negative regulation of translational initiation in response to stress 3 platelet activation 1 positive regulation of DNA repair 2 positive regulation of biosynthetic process of antibacterial peptides active against Gram-negative bacteria 1 positive regulation of blood coagulation 1 positive regulation of innate immune response 1 regulation of cellular defense response 1 regulation of inflammatory response 1 regulation of innate immune response 1 regulation of translational initiation by eIF2 alpha phosphorylation 1 regulation of translational initiation in response to stress 1 respiratory burst during acute inflammatory response 16 response to DNA damage stimulus 1 response to axon injury 1 response to cold 1 response to endoplasmic reticulum stress 3 response to heat 15 response to oxidative stress 1 response to reactive oxygen species 6 response to salt stress 1 response to starvation 28 response to stress 2 response to water deprivation 1 response to wounding 2 sorocarp development 4 wound healing
pylab inline
Populating the interactive namespace from numpy and matplotlib
#FIXME - below I added '[Haliotis_kam]'
from pandas import *
gs = read_table('../analyses/{sp2}_sprot_GOdescriptions.tab')
--------------------------------------------------------------------------- IOError Traceback (most recent call last) <ipython-input-68-286f72e2a23c> in <module>() 1 from pandas import * 2 ----> 3 gs = read_table('../analyses/{sp2}_sprot_GOdescriptions.tab') /Users/sr320/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, nrows, iterator, chunksize, verbose, encoding, squeeze) 399 buffer_lines=buffer_lines) 400 --> 401 return _read(filepath_or_buffer, kwds) 402 403 parser_f.__name__ = name /Users/sr320/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds) 207 208 # Create the parser. --> 209 parser = TextFileReader(filepath_or_buffer, **kwds) 210 211 if nrows is not None: /Users/sr320/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds) 507 self.options['has_index_names'] = kwds['has_index_names'] 508 --> 509 self._make_engine(self.engine) 510 511 def _get_options_with_defaults(self, engine): /Users/sr320/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _make_engine(self, engine) 609 def _make_engine(self, engine='c'): 610 if engine == 'c': --> 611 self._engine = CParserWrapper(self.f, **self.options) 612 else: 613 if engine == 'python': /Users/sr320/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds) 891 # #2442 892 kwds['allow_leading_cols'] = self.index_col is not False --> 893 self._reader = _parser.TextReader(src, **kwds) 894 895 # XXX /Users/sr320/anaconda/lib/python2.7/site-packages/pandas/_parser.so in pandas._parser.TextReader.__cinit__ (pandas/src/parser.c:2771)() /Users/sr320/anaconda/lib/python2.7/site-packages/pandas/_parser.so in pandas._parser.TextReader._setup_parser_source (pandas/src/parser.c:4803)() IOError: File ../analyses/{sp2}_sprot_GOdescriptions.tab does not exist
gs
<class 'pandas.core.frame.DataFrame'> Int64Index: 3983 entries, 0 to 3982 Data columns (total 5 columns): ContigID 3983 non-null values term 3983 non-null values GOSlim_bin 3983 non-null values aspect 3983 non-null values ProteinName 3960 non-null values dtypes: object(5)
gs.groupby('GOSlim_bin').ContigID.count().plot(kind='barh', color=list('y'))
savefig('../analyses/[Haliotis_kam]GOSlim.png', bbox_inches='tight')