# From the 00 notebook.
# Preview the first lines of the GOslim table.
!head ../analyses/Ahyacinthus_GOslim.tab
# Convert the FASTA file to one tab-separated record per line.
# On every line the trailing newline is removed and tabs become spaces. For a
# header line the leading ">" is dropped, the first space (or the end of the
# line when there is none) becomes a tab, and one more tab is appended; sequence
# lines have their spaces stripped and are concatenated after it. A header
# without a description therefore yields "name<TAB><TAB>sequence".
# The record count, line count and total sequence length go to stderr via warn.
!perl -e '$count=0; $len=0; while(<>) {s/\r?\n//; s/\t/ /g; if (s/^>//) { if ($. != 1) {print "\n"} s/ |$/\t/; $count++; $_ .= "\t";} else {s/ //g; $len += length($_)} print $_;} print "\n"; warn "\nConverted $count FASTA records in $. lines to tabular format\nTotal sequence length: $len\n\n";' \
../data/fa/Ahyacinthus_v1.fasta > ../analyses/Ahyacinthus_v1.tab
Converted 33496 FASTA records in 66992 lines to tabular format Total sequence length: 17056543
# Show the first record of the tabular file.
!head -1 ../analyses/Ahyacinthus_v1.tab
contig27 CAAAATTCCAGCACTCCGTTTTGCATGGTAAACTTGTCTTAGTAGGACACTGTGGAAGATGTACAGCGCAAGACATCACAGTTGCAAGCGCCGACGAACAGCTGTTAAACTCTCCTCTCATATTCTCGAACAAACCAAATATTTCTTCCTCTCTGTTGTTGCTAACCTTTGAATATATGAAGCTGGCATTAGCACAGGACTCAAAGTTTCCGCCGAGCAGTTT
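# Temporarily replace the contig-name prefix with digits so that a C or G in the name does not confound the C/G counts (the commands below are commented out here).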
#!sed 's/PiuraChilensis_v1_contig/999999/g' <../data/Piura_v1_contigs.tab> ../data/Piura_v1-99_contigs.tab
#!head -1 ../data/Piura_v1-99_contigs.tab
999999_1 ATTTACAATACGAAGTAAAATAGATAACGTGAAAATAATCTTGGTGCTGGATGATCGATCAAGTTCACCAATATTTTATTGTAAAAAATCATTCTAAACAGCATGAAATCGTGTACAATGTATAAACAAGCAAATATATAACACTAAAGCAAGAGGGCGTAAGTGGGGGGGTGGGTGAGAGTAAAAAATTCAAACATGTCAAATACCCCGGCGTTAGCCTTAAAAGCACCATGGACTTCTGCCTTCAATAAGCATAAAATTAAAACACCTAATACACAATGAATATACAGATAAAACAGATTTATGAATAGTTGGTGTTACATCTTTTACAGCCATAAGCCTTCATTTTGCTTCCAAACGTATAAAATCTGACTTGGAACAATATACAGCCATGAGATATGACACAGCGAGCACTACAATATATATTTATCTTGTACTATACAGCCTGTACAAGAAAATTCTGGAATTGTCTTCACAAGAGACAGAAAAATAGTTGCAATGTGAATGCTAGTCTACTATTTGATCACAATTGGATAGAAAAGTACAGCACATAAATGTTGGTGATACCTTAAAGAAAAGTGCAACAATATCAAAGGAATTAGTACCAGCATGCATTAGAAAAGTAAAAGTCTTGCTTATTACACAAAGCTGACTATATGATGTTCACCGCTTCTGGTGTGCAAAGAATTAAAAACAATGCAATTTCGGTCAGTTTTAACAAGGAATTAACAATTCATAGGAAAAATACAAGCATATGGTCTCAGGCCAATTGCTAGGACATAAAAAAAGCCTGCATATCACGAAAAGCCAAGTGCATGCATCGTATCCTGAAGACACCTTGATATTAACATGTAAGAAATTTAGCTTGCCACATTTCCATATTCCATAATTTCATTTTGAACACCGTGCCAGCAAATTCATCTGATATAAACACACAGGCAACTAATTTGGACACTTTCTAACTAGGTAGTTCAGAAAATACAGCTTTCAACAGGTACACATTTCTATAATAATAATAATAGCAAATGTCAGTGTGGCAGTTTTTGGACAAGTCCCTTTCAGAGGCCAAAATATCTATTTTGTATTAATTAATTATCCATTTTTTGGACTATACGGCTGTATCAAAACCATGGGTAACTGGGACTTGCCTAGCTTTGGGGGTGGAGTC
# Add a column with the length of the sequence.
# $col is a 0-based index into the tab-split fields, so $col = 2 selects the
# third field (the sequence in the name<TAB>description<TAB>sequence layout).
# Each input line is printed again with that field's length appended after a
# tab; the number of lines processed is reported on stderr via warn.
!perl -e '$col = 2;' -e 'while (<>) { s/\r?\n//; @F = split /\t/, $_; $len = length($F[$col]); print "$_\t$len\n" } warn "\nAdded column with length of column $col for $. lines.\n\n";' \
../analyses/Ahyacinthus_v1.tab > ../analyses/Ahyacinthus_v1_len.tab
Added column with length of column 2 for 33496 lines.
# Show the first record with its appended length column.
!head -1 ../analyses/Ahyacinthus_v1_len.tab
contig27 CAAAATTCCAGCACTCCGTTTTGCATGGTAAACTTGTCTTAGTAGGACACTGTGGAAGATGTACAGCGCAAGACATCACAGTTGCAAGCGCCGACGAACAGCTGTTAAACTCTCCTCTCATATTCTCGAACAAACCAAATATTTCTTCCTCTCTGTTGTTGCTAACCTTTGAATATATGAAGCTGGCATTAGCACAGGACTCAAAGTTTCCGCCGAGCAGTTT 223
# Count CG dinucleotides, C's and G's in each sequence, one count per input line.
# Only the third tab-separated field is scanned (the sequence, i.e. the same
# field the length step measures), so an uppercase C or G in a contig name or
# header description cannot inflate the counts.
# gsub() returns the number of non-overlapping matches it replaced; it is used
# here purely as a counter (the modified field is never printed).
!awk -F'\t' '{print gsub(/CG/, "", $3)}' ../analyses/Ahyacinthus_v1_len.tab > ../analyses/Ahyacinthus_v1_CG.tab
!awk -F'\t' '{print gsub(/C/, "", $3)}' ../analyses/Ahyacinthus_v1_len.tab > ../analyses/Ahyacinthus_v1_C.tab
!awk -F'\t' '{print gsub(/G/, "", $3)}' ../analyses/Ahyacinthus_v1_len.tab > ../analyses/Ahyacinthus_v1_G.tab
# Put the per-line counts next to the length table: the columns of
# Ahyacinthus_v1_len.tab followed by the CG, C and G counts, in that order.
# paste combines the files line by line, so all four must list the contigs in
# the same order.
!paste ../analyses/Ahyacinthus_v1_len.tab \
../analyses/Ahyacinthus_v1_CG.tab \
../analyses/Ahyacinthus_v1_C.tab \
../analyses/Ahyacinthus_v1_G.tab \
> ../analyses/Ahyacinthus_v1_C-G.tab
# Show the first combined record.
!head -1 ../analyses/Ahyacinthus_v1_C-G.tab
contig27 CAAAATTCCAGCACTCCGTTTTGCATGGTAAACTTGTCTTAGTAGGACACTGTGGAAGATGTACAGCGCAAGACATCACAGTTGCAAGCGCCGACGAACAGCTGTTAAACTCTCCTCTCATATTCTCGAACAAACCAAATATTTCTTCCTCTCTGTTGTTGCTAACCTTTGAATATATGAAGCTGGCATTAGCACAGGACTCAAAGTTTCCGCCGAGCAGTTT 223 8 55 42
# Compute a CpG ratio per contig:
#   (CG count / (C count * G count)) * (length^2 / (length - 1))
# With awk's default whitespace splitting, and records laid out like the one
# previewed above, $1 is the contig name, $3 the length, and $4, $5, $6 the CG,
# C and G counts.
# Rows with no C, no G, or a length below 2 are skipped: the ratio is undefined
# for them and awk would otherwise abort with a division-by-zero error.
# The comma-separated print keeps the "name<space><TAB><space>value" layout.
!awk '$3 > 1 && $5 > 0 && $6 > 0 {print $1, "\t", (($4)/($5*$6))*(($3**2)/($3-1))}' \
../analyses/Ahyacinthus_v1_C-G.tab > ../analyses/Ahyacinthus_v1_CpG.tab
# Preview the first lines of the per-contig CpG table.
!head ../analyses/Ahyacinthus_v1_CpG.tab
contig27 0.775773 contig88 0.459903 contig100 0.254614 contig211 0.885658 contig405 0.689373 contig443 1.34126 contig470 0.323368 contig503 0.941889 contig583 0.625727 contig590 1.21135
# List the files in ../analyses whose names start with "A".
!ls ../analyses/A*
../analyses/Ahyacinthus_sprot.tab ../analyses/Ahyacinthus_v1_CG.tab ../analyses/Ahyacinthus_v1.tab ../analyses/Ahyacinthus_v1_CpG.tab ../analyses/Ahyacinthus_v1_C-G.tab ../analyses/Ahyacinthus_v1_G.tab ../analyses/Ahyacinthus_v1_C.tab ../analyses/Ahyacinthus_v1_len.tab
# Preview the GOslim table before sorting.
!head ../analyses/Ahyacinthus_GOslim.tab
#!tr ',' "\t" <../data/Piura_v1_GOslim.csv> ../data/Piura_v1_GOslim.tab
# Sort the GOslim table (join needs sorted input), then drop the first line of
# the sorted output.
# NOTE(review): tail runs after sort, so this removes whichever line sorts
# first. That is the header row only if the header happens to sort ahead of
# every data line - confirm against the input file.
!sort ../analyses/Ahyacinthus_GOslim.tab | tail -n +2 > ../analyses/Ahyacinthus_GOslim.sorted
# Preview the sorted table.
!head ../analyses/Ahyacinthus_GOslim.sorted
# Keep only the first two tab-separated columns of the sorted GOslim table.
# print separates its arguments with awk's default output separator (a single
# space), so each line is written as "col1<space><TAB><space>col2". The CpG
# table is written the same way, which keeps the join keys comparable.
!awk -F $'\t' '{print $1, "\t", $2}' ../analyses/Ahyacinthus_GOslim.sorted > ../analyses/Ahyacinthus_GOslim.sortedtab
# Preview the two-column table.
!head ../analyses/Ahyacinthus_GOslim.sortedtab
# Sort the CpG table so it can be joined with the sorted GOslim table.
!sort ../analyses/Ahyacinthus_v1_CpG.tab > ../analyses/Ahyacinthus_v1_CpG.sorted
# Preview the sorted CpG table.
!head ../analyses/Ahyacinthus_v1_CpG.sorted
#!awk -F $'\t' '{print $1, "\t", $2}' ../data/Piura_v1_CpG.sorted > ../data/Piura_v1_CpG.sortedtab
#!head ../data/Piura_v1_CpG.sortedtab
contig100 0.254614 contig100001 0.431531 contig100008 0.276093 contig100010 0.476931 contig100021_110093_105915 2.0758 contig100025 0.299187 contig100026 1.0599 contig100030 0.854552 contig100031 0.64616 contig100038_111047 1.60515
# Join the sorted CpG table with the sorted GOslim table on their first
# tab-separated field. Both inputs were written with awk's
# print $1, "\t", ... form, so the key field carries the same trailing space in
# each file and the keys still match.
!join -t $'\t' ../analyses/Ahyacinthus_v1_CpG.sorted ../analyses/Ahyacinthus_GOslim.sortedtab \
> ../analyses/Ahyacinthus_v1_CpG_GOslim
import pandas as pd

# Load the joined CpG/GOslim table. The file has no header row, so pandas
# labels the columns with integers starting at 0.
Ahya = pd.read_table(
    filepath_or_buffer='../analyses/Ahyacinthus_v1_CpG_GOslim',
    header=None,
)
# Display the frame (or its summary) as the cell output.
Ahya
<class 'pandas.core.frame.DataFrame'> Int64Index: 25258 entries, 0 to 25257 Data columns (total 3 columns): 0 25258 non-null values 1 25258 non-null values 2 25258 non-null values dtypes: float64(1), object(2)
%matplotlib inline
import matplotlib.pyplot as plt

# Horizontal bar chart of the mean of column 1 for every distinct value of
# column 2 in the joined table, using the colour codes 'm', 'y' and 'b'.
(Ahya
 .groupby(2)[1]
 .mean()
 .plot(kind='barh', color=['m', 'y', 'b']))
# Restrict the view to x in [0.7, 0.9] and y in [0, 15].
plt.axis([0.7, 0.9, 0, 15])
[0.7, 0.9, 0, 15]
# Kernel density estimate (pandas 'kde' plot) of column 1 of the joined table,
# drawn with a line width of 3.
Ahya.loc[:, 1].plot(linewidth=3, kind='kde');
# Fix the axes to x in [0, 1.5] and y in [0, 1.9].
plt.axis([0, 1.5, 0, 1.9])
[0, 1.5, 0, 1.9]
# Repeat the density plot using the whole-transcriptome CpG table (not only the rows joined to GOslim).
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

# Load the full CpG table. The file has no header row, so pandas labels the
# columns with integers starting at 0.
Ahya_all = pd.read_table(
    filepath_or_buffer='../analyses/Ahyacinthus_v1_CpG.tab',
    header=None,
)
# Kernel density estimate (pandas 'kde' plot) of column 1, line width 3.
Ahya_all.loc[:, 1].plot(linewidth=3, kind='kde');
# Same axis limits as the density plot of the GOslim-joined table.
plt.axis([0, 1.5, 0, 1.9])
[0, 1.5, 0, 1.9]