# configure paths etc.
import sys
import os
os.environ['PYTHONPATH'] = '/usr/local/share/khmer/python'
sys.path.insert(0, '/usr/local/share/khmer/python')
import khmer
import screed
cd /mnt
/mnt
# check out simulation code for making reads, from the diginorm paper repo on github
!git clone git://github.com/ged-lab/2012-paper-diginorm.git
!mkdir kmer
fatal: destination path '2012-paper-diginorm' already exists and is not an empty directory. mkdir: cannot create directory `kmer': File exists
cd kmer
/mnt/kmer
# build a random genome (8kb) with equal mixes of A/C/G/T
import random
random.seed(1)
x = ["A"] + ["G"] + ["C"] + ["T"]
x = x*2000
random.shuffle(x)
x = "".join(x)
x
'CCACCCTTGGGTCCTTCCTGATGTGGCCACCCCAACCACCACATGTATTTACGAAGCTATCACCCTGGTTTAGGTAGCAATTTCCAACGGCCTAATAACCTTTCCTCGAGCATCGCCTTGGCTGTTACACACAGCGAGCAGCCTAGTCGTCCAGGTTTGGACGGTAGCGCGTTTGAATTTACCACAAAGAATCTGGTGAGCGATTTACTTAACATCATAGCGCTATGGACGCCTCATTCAGCCACAACCGAGTTTGACCAGGTCCACAGCTAGACCACCATTATCACATCGTTGTCTCACAAGTCTTGCAGCAACGTCAGTTGGGAAGTTGTAAGGTGAAGAAGACAAGCCGCCATGTGACACGGAGCTACTCTGTCAATCGTTGCGTCTATATAGACGAATTAGTAAGGAGTTCTTGTGGCGGGCTGTCTAGTTGCGGGGGACTGCTACCGGCGCGGCCTTCCCTCGAATGGGGATACTCTCCTCCTTTCTTGGTTACCCCTAACCCAAGACTGCGAGGCAGGCAATTCCACCTGCCCTCGTTGGAAGGTTAATTGGTAGGTATACCTGAGACAGATTGCACCGTATAGCACTTGTCATAGCAGGAGCGAGTAGGTAATATTCTTAAACCGTACGTACCAGAATGACATAACCCAATGTAGTGTGGCAGATTAAGCATGACACATCAGTGAGGGGCAATACGCACTTCATACTTGATTGTAGGGTTTTATTTGCCGGATTGGGATTAACTGAGTGTTACCACAACGCTAGTCCCGAAAAAAGGGAAGTGCACGTAGCCTCGTGTGCACCGATATTTTATAACCGAGGGAGGCAGCCTGTACATTGAGTCCAGCCTTGAAAAGGCTAGTTGAGTCCTTTCTACCGAATTTGTCTGACGGCGAAAAGGGCTCCTCTAACGCTCGTACGTATAGGAGTGCTGGTGATTCCAAAAACCAATCATTGTCACAACCGCCGTTGGCCTGCGCTGCGTTAGGCGTCATAACTCACTTCGCGGCGTGCTGTGTCATAGCCGAATCCTCCAATCTTTAAGAATGCGCGCTAATTTCCATCCCAGAGGGTGTTGGAGATGCACGAATAATAAAAGGTCGCGAGTCCAAACAAAGTTTACCGTTCGGTCGAGACGGTCAAGACTGCCACTAAAGAAGTCAATATTTCCTCGCCCTTCGACAATGGTTCGTGACAGGGCTATTGAGCAGTGGTGACCGCAGATTGATACAAGAAAACGCCACTCTAGCCAGACACGACCCGGTGCCCACCGTATGCAGGGCTGGGGGGAATGTTGAGGGATCGTGAAAGAAAATCACCTTGCCCCAGAACCGCAGTCAATTGTGCTCATTGAAACGTATCTGTCCAAAACTTAACGCCGACAGGGGTCGTCGCATAACGCCGCTAATAATCCGGGGCCACGGAGAGGCTGGGTCTGTATCTACCACCGACTAGAGAAGCGGGCATATGTGTCTTGTTTCGATCTTTCGTCGGTTATGTACTGTGAATTAAGCAAAAGCAAGCAGTAAGTGCGCGGAAGTGTATCCCCCGCCCCTGCTGTTACAACTCCTGATCTCGGTTCAGCATACGTATTCGCGGAAGTAAGCTACTAGGTTTACCATACGGGTTAGTGAAGGTACGTTTAATATATCCTTAATCGCTTGCAGCGAGTGGACACAGAAGCAAGGTCCGAGCATAACGGAGGATCAAGTATTAGAAGCTCAGGAAAAAAAGTACGCATCGATATATAGGGAAGGTGTCTTGCTTGCCGTGTTTCAGATTGCCAAGGAACCAGGAGTGCCCCATTTTCACCACTCTCTTGGAACTGTCAAGTCACTTATGCCGCCTTGCTCCCCGAAACGCGCATAAAAGGTGACAAAAGACGCGTGTAAAGGGCGCCAATGCACCAACGTTTTCTCCGGAGCTTGTCACCCATCGCCGAGGGCCAGTCGCCAACTCACCATCATCTTCATGAGGATTCACGCCCCGTACGGGGAGAGCGCACGGGAGCGGCCTGTTGCCGGTCTTACTCTTCACGATGTCGATTATAAGAAGGAGCTTGGAAGCTTGTGGCTTTGAGGTATGACAGACAATTTTGGATGCAAGTGTTAGCCATACAGGCTTTCAAGTGTTAATTCGTGTAGCGGGCAGTGAAAACGAAAGGTACGAGGGGTTATCTAGAATGAGTAGGGAAATTTGTATAAAGAGCAGTTGAAAGTTTGTGATGCTAATGTGACTGGGATCCGACCGAGCGGCTCATGGCAGAGAACGAATAAGTAAACTAAAGGGACCTGCATGACAGGAATGAGCAACTGACCAATCTTATGTTTAGCGTAATATTCTTTTGCCCCTGCATGAGATTCTGGTGTACTAGCACCGCTTATAGCCCGGTAGGACATCATCCATGGTCTGGCATCCAGGGAGGAGTCAACTACTCTAAGACCACCGCTCGGGCACCATCAGTATACTAATGTTGCACGAACAACAGTCGATCCCGGAACATGCACAGGGGTCTCCCTCGGTCCGTATCGGACATATTGAACGACTATCGGGGATGACCATTGCGTCCCAAGAGCACTATACCAGTTCCAATGTTGTCGGGGCACAGCCGCAGAACGGACTAGTAGCTTTCGGCAAAGAATCCCTAGACCTCACGACGCCTCCGACTGGGACCATTTGGCCTATAAAAAACATGTCAGCCACAATGGCTGCGACATGAATGGGTCGTAGAACTTTGCGTAGTAGCACATGGTAGTAGATCACACAGCCGGGTTCAGTGGCCCTATTTGACTGTTAACCCTCTGTTAAGTGTTCCCGAACACTAGTTCGGCTGTATTGACACTTGGGAATACGTTTGTAGGTTGAGCATTACGGTAGTTGTCCTTTACTGCCGGAGCAGAGGATCCTTCTATGTGTATCGTTCTGAATACTTGCTAAAAGATATAGACTTTTTCCTTTCACCGGGAGTGGTGCCTTGCAGCTAGCAGTGTGTCTTGGTCCTATGTGCAACTCTGCAAAGCCTCACGTGGCAGCAGGTATAGTCAGGAAAAGAGACAAGCACTGTCGGCACACCTACCTCGATATAAATGTACGCCAGTTGATGTGTGTCGCAGTGTAACAGGCAAAAGTACGTAAACGGCGTCGAACGGAGAAGCTGAGACCGCCTGTGTTTGATAGTACCTGCCGGCCTCTACCTGCTGAACTAGAAGGTAGCAACTACCTGTTCGTAAGCCCTCAGAGGTCCCAAACCGCCAAATCGTTAGTTTACAGCCCTATGCGCGGTTAGCCAGACGGGGACAGTGTGCTGTTTCCTTCCGCCATCTTTTTGCTTGATCGCCACGGGACCAAATCTTGGGGGCCTGGATCATATATACTTTACGGTACACAAACACTTGCATTCGGAGCATTAAACCAGAGTTGATCTGAAGTACAGGTGTACACTGTTTCCCGCTATTGCACCGGCGAGCGAGGCTGGCTCTCGGCCCGCATATTCATAAGTTTGGTCGAGGGCCCGTCGGTCATTCATCTTATTCTAGGTCCAAGAACGGGTAGCCACGGGGATTTCGCATATGTCTAGCATCGTTCCGTCGGCAAGCTCAAGGACCGCCTATAGTTCCCAAACATAACCTGAGTCGTTGCTCATAACATAAAGTAGAGATTGCGGTCTTACCATCTAATAACTTGTTTATCTTGACCTGGGGGACATGCCCAAAGTCCTTCAGGGTTTCGCCGACTGCTGATTACAAGCTATCCCATCCCAGTAAGACTAGATGCAGGTAGGAGTTTAAATCGGTGTCTTTTTCAACGAGAAGTCTGGGTCGTGCCTGCCGCCCCCGGATTTGATAAGCTCCTGTGCCTCAAAGGGGTTACTTGAGGCTACCTTATTTTGCACCGACCTTGAGCGTCCTAACGTGGCGGGTCAATATACTGAACGTTCTAATTCGCGACATAGTGTCAACCGACTGTAGTGCGGGAGTTTAGGACTTATACTCTAGCGCCATCGTGCATTGAAGCCAACTTAGCCATCTTCAAAAACTACCCCAAAGAGTTCGAGAGAAACGCCACGCCCTTAACCACTTCTAACGCAAAGGAGTCATTATCCCTCCCCTGTTCCGGCACTGGCTTCATTTTAAGCGCCTGTCTCTATCTGGGGTGCTCGTCCGGAGCAACATAAGGGTGTGTGACGAGAGTTAAGCCCCCTAACCTTGAGACAATGTATTTCAGCTCTACCGGTCCGGTAACCCTGTAGGATGGCACGCTGTGTGTCCGGGTGCCTTTTAAGAACTCGCGAGATTTCCTCTCACAGCTAGACTTGCAGATAATTGGTTGGCCCCGAGTCTGCGCATTTTGATAGGGTATGCTACTCTTAACTGCAAGGATTTACGAGTTCCGATACGTGCCCCTCTTTACTTGTTCAGGGGGCATACCCGTTGCTACGCCGCTAGAATTGCCCCTCAAGGAGAAGAATCCGCCAGTGCATATCGAGTTAGATTAAACGTAATCAGGGCACCGGAGGAACTATACGTGCTGCTCACACGGGCCGTCCGAGCGTATGCCGTTCAATGGTCCATCTTCTGATAACATGGCGCAATCTAGGATGGCGGCATCTGTCCTTCGGTTTACTAGCCAGGAGGCGCTGGAGACGTGTCGATCGAAAAAGCCTAGTATTCCCGCGGGGCCCTGGAATAAGAGGCTCCCTTGAGCCGTTAGGACGGTAAAACGGACGCCCCTTTCGACTCATAGTTGTAGCGGGGAGGCACGCTGGGCCGGAGTGCACGACTACTAAGGACAATAAGCTACCTGTTGGCGCAGTTTGAGCGTCGCTCTGCAAGGCTGATGAGCACCGTCCAGCGGTACCTCCGATTCGTCCAGTTTTGACCGAAATAACACCGATTCACTCAAGTACTCGGATTAAATTAGACTCTAACTTCCGAACATGCCCTCGGAGTTGCAAAACATCTTTGGATCCAATAAGGATTTGTAGGACCCGACCACTAGGACCATCTGAAAAACGAAATTGAGGAAGACCGCTCAGTCGGTAATCGTACCCACTAAAAATGTGGGCGACAGTTCAGCCCTCCGGTTTCCCTATCCTATGATCTGTTGCAAGTTGGTGTAGATTAATCCAGCAGGGACAGATTGCGTACCTCACGGTAGGCATGGAAAAGTGCTTCACTAAGGGGAAGTTTCCTGGATATTGAAGGCCGTATGGCAATAGTCGAAGTGAAGAGAAACCTCCCCGTCCACCTTAGCCACACCCGAAAGTGAGGGTCCTGAACTTGTCGTTTGCGGATTGGCTAGCAGACGACATAGGGCTGCAGTGACGGTCGTGTCGGGGCCAGCCGCGGGTTTTTTGGCTTGCCTCGGATAGTAGTGTCGTAGAAAGCGCACGAGCGGCTGTGGCTCCAAAGTAAGAAGGCAAAGCGGTATAACACAATCACACTATCACCATTAACAGGGACTTAAACCGAGAAAGTTCATTTGTCATGAACCCCAGTTTCAACGTTTCCTACCGTCTCCTATACCTCTTCTTGAGGCATGTCAGTGAGCGATTGACCTTTACCCTGGTGATTATTGTCACACGGTTTGGGGTACTTAACTTGCCACATATTACTTCGGAACTCTTCCAATCAGAAGGAAAGGTCACTTGGGTTCTAGATAGTAAGTCAACCAGTATTGGATGATGAGTCAAGATCTGGTAAGTCCGACCGAGCGGGTCGACCGCTGTAGGTATCCCTTACTTCCCGACAGTTTGGCACATGGTCCTCCCGATCTAACTCGGACTGCCCAGGGTTAAGCAAAATATCTGGATGGACAGAACATAGACGTACTGACACTTCGGGTCTAATTAACGTAGCGTATGTTGCTACTGTCGATACAGCAATCTGGCTAGTCTGGACTAGCGGACAGGGTACCAATCCACGGGTCGTAGCAAAACATTCTAACACAGAAAGATGATAGTCTTTGCTCGGGCTTTCAAGATGAGCGGATGCCATGCGAGCAAAAGAGGTAATAGCCAGAGTAGCCGTGAGAAAGAAAGGTTCTGCCGACGTGCATAGTGCATAGAAAGTTAGTACAACCGTATCCTTCGCCGCTCTGACCATGCTCCCAAAGGTAAGCTTACTCGGGCTACGCCGTAATAAGACACGCTTATTGTAAGCGTTTGTGGCAACTGTATCGGATAGCCGTTGTCTAGAGGGCTGCTACGCGGCTATTGACCGACCGCTAGGTATGGATTCCGGTATGGGGGCGAACAATAGATCCGTCCATAACCCTACCGCTACAATGGACACTTCGGATCGGTGGTCATGTTTCTGGCAGGTACATTGATCTGCGGAGGACAGTTGGGATTCGTAAAATACGAACCGTCGTCGCAGATCCTACATTATTCTCGAGCACGCGGAGACGACTAGCACCTAGATCCCACGCAGTCGGGCTGACCGATATTGCTCTTCAAGAGGGCCTGCGATCGTGTCCCTATCCTTCGGAGAGCGCTACGGGACAAGTGCCACAAGGTGAATCATTGGCCCGGTCTCACCACCGATGAAAATGGCACTCCCCCTATCCGGTATATTCATAGTACTTTTAGGCGTGGTAAGCAACACATCTCTAGAAGAAAGGTAGAACAGTGCTTTAGACCCTAGATTCTAGACCCCTGTATTGGGCACCCCGTGTGTAGAAAAGCGGTTACGTCAAAAACCCTTCTTTGACCATAATCCCCCGCTCCGACTCGATGCGCACCGCTACGGGAATGTCCCCTGCAAGTTGAGGCGACGGGGCAAATAAAGAGCTGCTTCGAGTAGTGTGAGCTGGTCGCCAGTATTGCCTCTGTTGGATATAGAAACCGGTCCCCTTCTAAATAATGTTCGCGATATCAGGGCGGGCCCTATAATTTAGCAACCACATCTGTACAGCCATCTACGTTAGTTTAAATAGTCAGAGCCATCTTGGAGTCTATAACATAGCACTCCGAGCGGTCACTATAACCTTACTGACAACATCGGACAAACCTGTCCACCGATTCAATGACGTTCGCTGTAGATTTAGAGGGATGTAAATGCGCAGTAATTGAGTGTCTTACAGCCCTTCTGTTGTCAAGGTGCCATAGACTTTATTTCGGCTGTTCTGTGCAACTCGTTCGCTGTACGAACGATCCACACCGGCCGGTCTATCGGACGGGACTCTGATGGTTCACCGTTATAAACATGATCTGGACCCGTTCCTCCCCGACTTACCCAGTGGTCGGTTATAACCAGCGGTTCTTTAACTAGTCTGACACCCCTGATTATATGGTCAAGCTCCCAGGAGGCGTTAATTCGTTTAATAGTCAATTCACACAACCCTGTTACAATATCCTCCGTAAACAACAGCAGTTTCTCAAATGTGCAATAGTGAGAGAATTGCTTGGCGCGCAAAGCATGAAACGAGGCGCATTCACGCCAGACTTCCCGTCTCAACGGTCCACAACTCGGACCGGAGGCTCTTCAAGTCTTACTCGCATCTCGACTTAAGAGGCCTTACTTCACCGCCGGTTCCTGGTATAATCGGTACTGGCTATCATTACGGCTCACTGGCAAATTGTTCCCCGTCCTGATGTAGGTCTGAAACTCCCCGGGTCGGCTTTTGTGTATGGGCCGTACGTTGCTAACCCCGGGTTCTCACTTATTAATTTACGTGGTTTTGTTACACAGCACCTCGTTGCCGCAGACTATGGCGTGTAGTTCTCCGAACGGCGCTCTTCTGTTGCGTATTATCATGTTTGATTAGCCCAGACAGCTGCGTACTCTGCGGTGGCGACTTTCGTCTAAAGGCATCGTTGATCCCCACATCCGGCGGATTGAGATCTGCACGTGGATTTCCATTCGGAGCTTCATCCCAAGCACATGTAAGCGATCTTTGCTGGCTCGACTCTGCTCGCAATACTCATGTGGAAGCAACTCAGCTGAGAGATCAATGTCTAGTTGACTTCTAATTATCCGAAGGGCCAAGGTACC'
# write the genome to disk
fp = open('genome.fa', 'w')
fp.write('>genome\n')
fp.write(x)
fp.close()
# make reads at 200x coverage, 1% error. note, errors are indicated by lower case.
!python /mnt/2012-paper-diginorm/pipeline/make-reads.py genome.fa > reads.fa
!head reads.fa
10073 of 16000 reads mutated; 16150 total mutations >read0 AATTTCCATCCCAGAGGGTGTTGGAGATGCACGAATAATAAAAGgTCGCGAGTCCAAACAAAGTTTACCGTTCGGTCGtGACGGTCAAGACTGCCACTAA >read1 GGATGGCACGCTGTGTGTCCGGGTGCCTTTTAAGAACTCGCGAGATTTCCTCTCACAGCTAGACTTGCAGATAATTGGTTGGCCCCGaGTCTGCGCATTT >read2 GCGACATGAATGGGTCGTAGAACTTTGCGTAGTAGCACATGGTAGTAGATCACACAGCCGGGTTCAGTGGCCCTATTTGACTGTTAACCCTCTGTTAAGT >read3 GTTGTTCGTGCAACATTAGTATACTGATGGTGCCCGAGCGGTGGTCTTAGAGgAGTTGACTCCTCCCTGGATGCCAGACCATGGATGATGTCCTACCGGG >read4 TGTTCCCCGTCCTGATGTcGGTCTGAAgCTCCCCGGGTCGGCTTcTGTGTATGGGCCGTACGTTGCTAACCCCGGGTTCTCACTTATTAATTTACGTGGT
# build a database of all k-mer counts. See http://khmer.readthedocs.org/en/latest/ for documentation
!python /usr/local/share/khmer/scripts/load-into-counting.py -x 1e8 -N 4 -k 20 counts.kh reads.fa
PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- Saving hashtable to counts.kh Loading kmers from sequences in ['reads.fa'] making hashtable consuming input reads.fa saving counts.kh fp rate estimated to be 0.000 DONE.
# count all k-mers in the database and extract a k-mer distribution
!python /usr/local/share/khmer/scripts/abundance-dist.py -s counts.kh reads.fa reads.dist
hashtable from counts.kh K: 20 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to reads.dist ** squashing existing file reads.dist preparing hist...
# in the dist file,
# column 1 is the k-mer count
# column 2 is the number of k-mers with that count
# column 3 is the number of k-mers with less than or equal to that count
# column 4 is the fraction of total k-mers with less than or equal to that count
!head reads.dist
0 0 0 0.0 1 131282 131282 0.805 2 21148 152430 0.934 3 2552 154982 0.95 4 210 155192 0.951 5 14 155206 0.951 6 1 155207 0.951 7 1 155208 0.951 8 1 155209 0.951 9 3 155212 0.951
# read in the numbers
dist = numpy.loadtxt('reads.dist')
# plot the first two columns
plot(dist[:,0], dist[:,1])
xlabel('k-mer abundance')
ylabel('# of k-mers with that abundance')
<matplotlib.text.Text at 0x412fb50>
# this plot is biased towards 1, due to the large number of errors.
# what does it look like up close?
plot(dist[:,0], dist[:,1])
axis(xmax=10)
(0.0, 10, 0.0, 140000.0)
# but out by ~200 you'll see a higher coverage peak.
plot(dist[:,0], dist[:,1])
axis(ymax=1000)
(0.0, 200.0, 0.0, 1000)
# what happens if we trim out all the unique k-mers? => reads.fa.abundfilt
!python /usr/local/share/khmer/scripts/filter-abund.py -C 2 counts.kh reads.fa
file with ht: counts.kh loading hashtable K: 20 filtering reads.fa starting threads starting writer loading... ... filtering 0 done loading in sequences DONE writing. processed 16000 / wrote 14368 / removed 1632 processed 1600000 bp / wrote 1199434 bp / removed 400566 bp discarded 25.0% output in reads.fa.abundfilt
# now let's look at the k-mer abundance distribution of the filtered reads
!python /usr/local/share/khmer/scripts/load-into-counting.py -x 1e8 -N 4 -k 20 counts-filt.kh reads.fa.abundfilt
!python /usr/local/share/khmer/scripts/abundance-dist.py -s counts-filt.kh reads.fa.abundfilt reads-filt.dist
PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- Saving hashtable to counts-filt.kh Loading kmers from sequences in ['reads.fa.abundfilt'] making hashtable consuming input reads.fa.abundfilt saving counts-filt.kh fp rate estimated to be 0.000 DONE. hashtable from counts-filt.kh K: 20 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to reads-filt.dist ** squashing existing file reads-filt.dist preparing hist...
# load
dist_filt = numpy.loadtxt('reads-filt.dist')
# plot reads v filtered reads k-mer abundance, overall.
plot(dist[:,0], dist[:,1], label='reads')
plot(dist_filt[:,0], dist_filt[:,1], label='filtered reads')
xlabel('k-mer abundance')
ylabel('# of k-mers with that abundance')
legend()
<matplotlib.legend.Legend at 0x428fbd0>
# now plot reads v filtered reads k-mer abundance
plot(dist[:,0], dist[:,1], label='reads')
plot(dist_filt[:,0], dist_filt[:,1], label='filtered reads')
xlabel('k-mer abundance')
ylabel('# of k-mers with that abundance')
axis(ymax=1000)
legend()
<matplotlib.legend.Legend at 0x445c190>
Why is the green abundance distribution shifted left?? didn't we just eliminate abundance-1 k-mers?
## DIGRESSION - back to presentation to talk about digital normalization
# run digital normalization
!python /usr/local/share/khmer/scripts/normalize-by-median.py -x 2e8 -N 4 -k 20 -C 20 reads.fa
PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 2e+08 (-x) - paired = False (-p) Estimated memory usage is 8e+08 bytes (n_hashes x min_hashsize) -------- making hashtable DONE with reads.fa ; kept 3044 of 16000 or 19 % output in reads.fa.keep fp rate estimated to be 0.000
# let's look at the abundance distribution post-diginorm
!python /usr/local/share/khmer/scripts/load-into-counting.py -x 1e8 -N 4 -k 20 counts-dn.kh reads.fa.keep
!python /usr/local/share/khmer/scripts/abundance-dist.py -s counts-dn.kh reads.fa.keep reads-dn.dist
PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- Saving hashtable to counts-dn.kh Loading kmers from sequences in ['reads.fa.keep'] making hashtable consuming input reads.fa.keep saving counts-dn.kh fp rate estimated to be 0.000 DONE. hashtable from counts-dn.kh K: 20 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to reads-dn.dist ** squashing existing file reads-dn.dist preparing hist...
dist_dn = numpy.loadtxt('reads-dn.dist')
plot(dist[:,0], dist[:,1], label='reads')
plot(dist_filt[:,0], dist_filt[:,1], label='filtered reads')
plot(dist_dn[:,0], dist_dn[:,1], label='diginorm reads')
xlabel('k-mer abundance')
ylabel('# of k-mers with that abundance')
axis(ymax=1000)
legend()
<matplotlib.legend.Legend at 0x46c8c10>
!ls -la reads.fa*
-rw-r--r-- 1 root root 1780890 2013-06-19 13:56 reads.fa -rw-r--r-- 1 root root 1361833 2013-06-19 13:56 reads.fa.abundfilt -rw-r--r-- 1 root root 336948 2013-06-19 13:56 reads.fa.keep