# Peek at the first 10 lines of the transcriptome FASTA to sanity-check its format.
!head -n 10 /Volumes/web/scaphapoda/Grace/Transcriptomes/mdonacium/M_donacium_fasta.fa
>GQ18NFG04H441D CTCAGACGTAGATTAAGCGTAAAGATCCAGGAACTCGAAGATATGGTTGAACAACAGAGA ACAAGAGCTGCCAACTTGGAGAAGGCCAAGAACAGGCTTACTATTGAACTCCGTGAAGTC ACCATTGAACTCGAAAATACTCAGATCATTGTTCAAG >GQ18NFG04J02W3 ACTAGAGATGTAATACATAGATGAGGAGTGGCTGAGTTGGCGT >GQ18NFG04JNKQJ AATAGAATAGATTCGGTATAAGTGAAGACGCCATATTTATATGACATGGACGGCCATATT GATTTAA >GQ18NFG04H4OBE
# Count the lines containing ">" (FASTA header lines), i.e. the number of sequences.
!grep -F -c ">" /Volumes/web/scaphapoda/Grace/Transcriptomes/mdonacium/M_donacium_fasta.fa
180159
# Working directory for this species, and the short prefix ("md") that is
# interpolated into every derived file / dataset name below via {dircode}.
wd = '/Volumes/web/scaphapoda/Grace/Transcriptomes/mdonacium'
dircode = 'md'
# Change into the working directory so the relative paths used later resolve there.
%cd {wd}
/Volumes/web/scaphapoda/Grace/Transcriptomes/mdonacium
# blastx: search the nucleotide sequences in query.fa (translated) against the
# local uniprot_sprot protein database, keeping at most one subject and one
# HSP per query, and write tabular (-outfmt 6) results to blast_sprot.tab
# using 8 threads.
# NOTE(review): the query here is `query.fa` in the working directory, while
# the file inspected at the top of the notebook is `M_donacium_fasta.fa` --
# confirm that query.fa is the intended input (copy/symlink of that file).
!blastx \
-query query.fa \
-db /Volumes/Data/blast_db/uniprot_sprot \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6 \
-num_threads 8 \
-out blast_sprot.tab
# Count the hit rows written by blastx.
!wc -l blast_sprot.tab
123986 blast_sprot.tab
# Turn every "|" into a tab so the pipe-delimited subject id (sp|ACCESSION|NAME)
# becomes three separate tab-delimited columns; read blast_sprot.tab, write
# blast_sprot_sql.tab.
!tr '|' '\t' < blast_sprot.tab > blast_sprot_sql.tab
# Preview the reformatted table.
!head -n 10 blast_sprot_sql.tab
GQ18NFG04H441D sp O96064 MYSP_MYTGA 75.00 52 13 0 1 156 314 365 1e-16 77.0 GQ18NFG04JZK7Y sp Q61KR9 CALR_CAEBR 64.00 25 9 0 2 76 289 313 0.002 37.4 GQ18NFG04JAS9D sp P34825 EF1A_HYPJE 76.19 21 4 1 9 68 21 41 0.047 33.1 GQ18NFG04J1R4X sp P63245 GBLP_RAT 89.47 57 6 0 223 53 230 286 5e-31 115 GQ18NFG04JUAT9 sp A4J7L5 ENGB_DESRM 50.00 20 10 0 71 12 61 80 2.2 27.7 GQ18NFG04I64X7 sp Q6LZL5 ATGT_METMP 46.43 28 15 0 131 48 462 489 6.6 27.7 GQ18NFG04JQZZX sp A7S641 ST7_NEMVE 52.17 23 9 1 81 13 265 285 7.0 27.3 GQ18NFG04JN0H1 sp Q94890 H15_DROME 38.89 36 22 0 10 117 320 355 0.44 31.6 GQ18NFG04J111H sp A8GAD1 Y965_SERP5 59.09 22 8 1 73 11 191 212 8.4 26.6 GQ18NFG04JWI3T sp P48867 COX1_CYACA 71.43 21 6 0 63 125 114 134 0.011 35.8
# Upload the tab-delimited BLAST table with the sqlshare client's
# singleupload.py, using "<dircode>_uniprot" as the -d dataset name.
!python /Applications/sqlshare-pythonclient-master/tools/singleupload.py -d {dircode}_uniprot blast_sprot_sql.tab
processing chunk line 0 to 123986 (0.0490660667419 s elapsed) pushing blast_sprot_sql.tab... parsing BC15E3EB... finished md_uniprot
# Run a query through the sqlshare client's fetchdata.py and save the result
# as TSV in <dircode>_descriptions.txt.
# The query left-joins the uploaded BLAST table (md_uniprot) on Column3
# (the accession column produced by the "|" -> tab split) to:
#   - a reviewed-protein-name table (ProteinName),
#   - a SPID -> GO number table,
#   - a GO -> GOslim table (term, GOSlim_bin, aspect),
# and keeps only rows where aspect is 'P'.
# NOTE(review): 'P' is presumably the GO "biological process" aspect -- confirm.
# NOTE(review): the dataset name md_uniprot is hard-coded inside the SQL, so it
# does not follow {dircode} if that variable is changed.
!python /Applications/sqlshare-pythonclient-master/tools/fetchdata.py \
-s "SELECT Column1, term, GOSlim_bin, aspect, ProteinName FROM [graceac9@washington.edu].[md_uniprot]md left join [samwhite@washington.edu].[UniprotProtNamesReviewed_yes20130610]sp on md.Column3=sp.SPID left join [sr320@washington.edu].[SPID and GO Numbers]go on md.Column3=go.SPID left join [sr320@washington.edu].[GO_to_GOslim]slim on go.GOID=slim.GO_id where aspect like 'P'" \
-f tsv \
-o {dircode}_descriptions.txt
# Preview the downloaded annotation table.
!head {dircode}_descriptions.txt
# Enable inline matplotlib plots (also pulls numpy/matplotlib names into the namespace).
%pylab inline
Populating the interactive namespace from numpy and matplotlib
from pandas import *
# Load the annotation table written by the fetchdata step. The file name is
# built from `dircode` (as every other cell does) instead of hard-coding
# "md_", so changing the species prefix cannot silently plot stale data.
gs = read_table(dircode + '_descriptions.txt')
# Horizontal bar chart: number of non-null Column1 entries per GOSlim_bin.
gs.groupby('GOSlim_bin').Column1.count().plot(kind='barh', color=['y'])
<matplotlib.axes.AxesSubplot at 0x10ea6a650>
# Keep every annotation row that mentions a reproduction-related keyword and
# save the matches to <dircode>_reprot.txt.
# Fix: the output redirect must be ">"; the previous " / " made grep treat the
# root directory and the (not yet existing) output file as extra inputs, so the
# file read by the following cells was never written.
# `grep -E` replaces the deprecated `egrep`; --color is dropped because the
# output goes to a file, not the terminal.
!grep -E "male|female|genitalia|gonad|ovarian|reproduction|estrogen|testosterone|gametogenesis|germination|ovulation|penile|prostate|vulval" {dircode}_descriptions.txt > {dircode}_reprot.txt
# Show the first two matching rows.
!head -2 {dircode}_reprot.txt
# Tally how many matching rows carry each GO term (column 2 of the table).
!cut -f2 {dircode}_reprot.txt | sort | uniq -c
# Total number of reproduction-related rows.
!wc -l {dircode}_reprot.txt
4415 md_reprot.txt