#from 00 notebook
!head ../data/Piura_v1_GOslim.csv
!perl -e '$count=0; $len=0; while(<>) {s/\r?\n//; s/\t/ /g; if (s/^>//) { if ($. != 1) {print "\n"} s/ |$/\t/; $count++; $_ .= "\t";} else {s/ //g; $len += length($_)} print $_;} print "\n"; warn "\nConverted $count FASTA records in $. lines to tabular format\nTotal sequence length: $len\n\n";' \
../data/Piura_v1_contigs.fa > ../data/Piura_v1_contigs.tab
Converted 15022 FASTA records in 384556 lines to tabular format Total sequence length: 21729367
!head -1 ../data/Piura_v1_contigs.tab
PiuraChilensis_v1_contig_1 ATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC
#temp replace name so c or g will not confound
!sed 's/PiuraChilensis_v1_contig/999999/g' <../data/Piura_v1_contigs.tab> ../data/Piura_v1-99_contigs.tab
!head -1 ../data/Piura_v1-99_contigs.tab
999999_1 ATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC
#add column with length of sequence
!perl -e '$col = 2;' -e 'while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" } warn "\nAdded column with length of column $col for $. lines.\n\n";' \
../data/Piura_v1-99_contigs.tab > ../data/Piura_v1-99-l_contigs.tab
Added column with length of column 2 for 15022 lines.
!head -1 ../data/Piura_v1-99-l_contigs.tab
999999_1 ATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC 1168
!awk -F\CG '{print NF-1}' ../data/Piura_v1-99-l_contigs.tab > ../data/Piura_v1-99-l_contigs__CG.tab
!awk -F\C '{print NF-1}' ../data/Piura_v1-99-l_contigs.tab > ../data/Piura_v1-99-l_contigs__C.tab
!awk -F\G '{print NF-1}' ../data/Piura_v1-99-l_contigs.tab > ../data/Piura_v1-99-l_contigs__G.tab
!paste ../data/Piura_v1-99-l_contigs.tab \
../data/Piura_v1-99-l_contigs__CG.tab \
../data/Piura_v1-99-l_contigs__C.tab \
../data/Piura_v1-99-l_contigs__G.tab \
> ../data/Piura_v1-99-l_contigs__C-G.tab
!head -1 ../data/Piura_v1-99-l_contigs__C-G.tab
999999_1 ATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC 1168 15 203 202
!awk '{print $1, "\t", (($4)/($5*$6))*(($3**2)/($3-1))}' \
../data/Piura_v1-99-l_contigs__C-G.tab \
| sed 's/999999/PiuraChilensis_v1_contig/g' > ../data/Piura_v1_CpG.tab
!head ../data/Piura_v1_CpG.tab
PiuraChilensis_v1_contig_1 0.427621 PiuraChilensis_v1_contig_2 0.600881 PiuraChilensis_v1_contig_3 0.750945 PiuraChilensis_v1_contig_4 1.22757 PiuraChilensis_v1_contig_5 0.848172 PiuraChilensis_v1_contig_6 0.86292 PiuraChilensis_v1_contig_7 0.718647 PiuraChilensis_v1_contig_8 1.03897 PiuraChilensis_v1_contig_9 0.97759 PiuraChilensis_v1_contig_10 0.817866
!ls ../data
Cgigas-HS-count.txt
PiuraC_Coq_Trinity.fasta
PiuraC_Val_Trinity.fasta
PiuraC_Val_Trinity_2ndhalf.fasta
PiuraC_Val_Trinity_uniprot_sprot.tab
PiuraC_Val_Trinity_uniprot_sprot_2ndhalf.tab
Piura_counts.txt
Piura_v1-99-l_contigs.tab
Piura_v1-99-l_contigs__C-G.tab
Piura_v1-99-l_contigs__C.tab
Piura_v1-99-l_contigs__CG.tab
Piura_v1-99-l_contigs__G.tab
Piura_v1-99_contigs.tab
Piura_v1_CpG.tab
Piura_v1_GOslim.csv
Piura_v1_contigs.fa
Piura_v1_contigs.tab
Piura_v1_uniprot_sprot.tab
Piura_v1_uniprot_sprot_sql.tab
wd
!rm ../data/Piura_v1-99*
!head ../data/Piura_v1_GOslim.csv
!tr ',' "\t" <../data/Piura_v1_GOslim.csv> ../data/Piura_v1_GOslim.tab
!sort ../data/Piura_v1_GOslim.tab | tail -n +2 > ../data/Piura_v1_GOslim.sorted
!awk -F $'\t' '{print $1, "\t", $2}' ../data/Piura_v1_GOslim.sorted > ../data/Piura_v1_GOslim.sortedtab
!head ../data/Piura_v1_GOslim.sortedtab
!sort ../data/Piura_v1_CpG.tab > ../data/Piura_v1_CpG.sorted
!awk -F $'\t' '{print $1, "\t", $2}' ../data/Piura_v1_CpG.sorted > ../data/Piura_v1_CpG.sortedtab
!head ../data/Piura_v1_CpG.sortedtab
PiuraChilensis_v1_contig_1 0.427621 PiuraChilensis_v1_contig_10 0.817866 PiuraChilensis_v1_contig_100 0.914473 PiuraChilensis_v1_contig_1000 0.792597 PiuraChilensis_v1_contig_10000 0.947282 PiuraChilensis_v1_contig_10001 0.691634 PiuraChilensis_v1_contig_10002 0.936689 PiuraChilensis_v1_contig_10003 0.371111 PiuraChilensis_v1_contig_10004 0.89432 PiuraChilensis_v1_contig_10005 0.873687
SELECT * FROM [sr320@washington.edu].[Piura_v1_CpG.sorted]cpg
left join
[sr320@washington.edu].[Piura_v1_GOslim.sorted]go
on
cpg.Column1=go.Column1
!head ../data/Piura_v1_CpG-GOslim.csv
!tr ',' "\t" <../data/Piura_v1_CpG-GOslim.csv> ../data/Piura_v1_CpG-GOslim.tab
!head ../data/Piura_v1_CpG-GOslim.tab
!awk -F $'\t' '{print $1, "\t", $2," \t", $4}' ../data/Piura_v1_CpG-GOslim.tab | tail -n +2 > ../data/Piura_v1_CpG-slim.tab
!head ../data/Piura_v1_CpG-slim.tab
import pandas as pd
Piura = pd.read_table('../data/Piura_v1_CpG-slim.tab', header=None)
Piura
<class 'pandas.core.frame.DataFrame'> Int64Index: 27267 entries, 0 to 27266 Data columns (total 3 columns): 0 27267 non-null values 1 27267 non-null values 2 27267 non-null values dtypes: float64(1), object(2)
%matplotlib inline
import matplotlib.pyplot as plt
Piura.groupby(2)[1].mean().plot(kind='barh', color=list('myb'))
plt.axis([0.7, 0.9, 0, 15])
[0.7, 0.9, 0, 15]
# pandas density plot
Piura[1].plot(kind='kde', linewidth=3);
plt.axis([0.2, 1.5, 0, 1.9])
[0.2, 1.5, 0, 1.9]