# Set this to your MBL STAMPS server username (any unique string will do).
# Tip: shift-ENTER executes the current cell and moves on to the next one.
username = "CHANGEME"
# Make the khmer/screed libraries importable by extending the module search
# path (both for this process and, via PYTHONPATH, for any child processes).
import sys
import os

KHMER_PYLIB = '/usr/local/share/khmer/python'
os.environ['PYTHONPATH'] = KHMER_PYLIB
sys.path.insert(0, KHMER_PYLIB)

import khmer
import screed
cd /mnt
/mnt
mkdir -p $username
cd $username
/mnt/CHANGEME
You should now be in '/mnt/USERNAME', where USERNAME is the username you set above (not the default 'CHANGEME').
import random

# Seed the RNG so the "random" genomes are reproducible across runs.
random.seed(1)

# Build two independent 4 kb sequences, each containing exactly 1000 copies
# of every base, in a random order.
bases = ["A", "G", "C", "T"]

x = bases * 1000
random.shuffle(x)
x = "".join(x)

y = bases * 1000
random.shuffle(y)
y = "".join(y)
# Show the first 100 bases of each 4 kb shuffled sequence as a sanity check.
# (Python 2 print statements -- this transcript comes from a Python 2 notebook.)
print 'x is', x[:100]
print 'y is', y[:100]
x is CAGACTTGGAAGCTGAGAGTCCGACGTCACTGCCTCAACTCGCGCAAATGTTCCCGCCAAATTGTATCCTAGGGATCTTCCATAAGCTTATATACGGGGG y is ATTAGACCACATGTTGCTAGCTATTCGGAGGAAGGCAGTTTAGCTAAATATGTGAGGGCTGTGGGACGAATATCGGGGGATGACGGTCCCATAGCTAGTT
# Write the two shuffled sequences as a two-record FASTA file -- our toy
# "metagenome" for the rest of the tutorial.
# Improvements over the original cell:
#  - a context manager guarantees the file is closed even if a write fails
#    (the original open()/close() pair leaked the handle on error);
#  - write() replaces the Python 2-only 'print >>outfp' syntax, so this cell
#    produces byte-identical output on both Python 2 and Python 3.
with open('metagenome.fa', 'w') as outfp:
    outfp.write(">x 1\n")
    outfp.write(x + "\n")
    outfp.write(">y 2\n")
    outfp.write(y + "\n")
!python /usr/local/share/2012-paper-diginorm/pipeline/make-biased-reads.py metagenome.fa | head -100000 > reads.fa
Traceback (most recent call last): File "/usr/local/share/2012-paper-diginorm/pipeline/make-biased-reads.py", line 56, in <module> print '>read%d\n%s' % (i, read) IOError: [Errno 32] Broken pipe
(Yes, you should see an error.)
!/usr/local/share/khmer/scripts/normalize-by-median.py -k 20 -C 20 -x 1e8 reads.fa --savehash normC20k20.kh
PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) - paired = False (-p) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- making hashtable DONE with reads.fa; kept 3787 of 50000 or 7% output in reads.fa.keep Saving hashfile through reads.fa ...saving to normC20k20.kh fp rate estimated to be 0.000
!/usr/local/share/khmer/scripts/filter-abund.py normC20k20.kh reads.fa.keep
file with ht: normC20k20.kh loading hashtable K: 20 filtering reads.fa.keep starting threads starting writer loading... ... filtering 0 done loading in sequences DONE writing. processed 3787 / wrote 3069 / removed 718 processed 378700 bp / wrote 211885 bp / removed 166815 bp discarded 44.0% output in reads.fa.keep.abundfilt
!/usr/local/share/khmer/scripts/normalize-by-median.py -k 20 -C 5 -x 1e8 reads.fa.keep.abundfilt
PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) - paired = False (-p) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- making hashtable DONE with reads.fa.keep.abundfilt; kept 1059 of 3069 or 34% output in reads.fa.keep.abundfilt.keep fp rate estimated to be 0.000
ls
group0.hist reads.fa reads.group0000.fa group1.hist reads.fa.keep reads.group0001.fa metagenome.fa reads.fa.keep.abundfilt reads.info normC20k20.kh reads.fa.keep.abundfilt.keep reads.dist reads.fa.part
!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 -k 20 reads.fa.keep reads.fa.keep.hist
!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 -k 20 reads.fa.keep.abundfilt reads.fa.ka.hist
!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 -k 20 reads.fa.keep.abundfilt.keep reads.fa.kak.hist
PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- making hashtable building tracking ht K: 20 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to reads.fa.keep.hist consuming input, round 1 -- reads.fa.keep preparing hist from reads.fa.keep... consuming input, round 2 -- reads.fa.keep PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- making hashtable building tracking ht K: 20 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to reads.fa.ka.hist consuming input, round 1 -- reads.fa.keep.abundfilt preparing hist from reads.fa.keep.abundfilt... consuming input, round 2 -- reads.fa.keep.abundfilt PARAMETERS: - kmer size = 20 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- making hashtable building tracking ht K: 20 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to reads.fa.kak.hist consuming input, round 1 -- reads.fa.keep.abundfilt.keep preparing hist from reads.fa.keep.abundfilt.keep... consuming input, round 2 -- reads.fa.keep.abundfilt.keep
# Compare the k-mer abundance histograms from the three pipeline stages.
# NOTE(review): 'numpy', 'plot', 'axis' and 'legend' are never imported in this
# transcript -- presumably the notebook was started in pylab mode, which injects
# numpy and the matplotlib.pyplot names into the global namespace; confirm
# before rerunning outside IPython.
dn1 = numpy.loadtxt('reads.fa.keep.hist')    # after first normalize-by-median pass (C=20)
abund = numpy.loadtxt('reads.fa.ka.hist')    # after filter-abund error trimming
dn2 = numpy.loadtxt('reads.fa.kak.hist')     # after second normalize-by-median pass (C=5)
# Column 0 is the abundance value, column 1 the count of k-mers at that abundance.
plot(dn1[:,0], dn1[:,1], label='first round')
plot(abund[:,0], abund[:,1], label='trim errors')
plot(dn2[:,0], dn2[:,1], label='second round')
axis(ymax=2500)  # clip the y axis so the low, interesting parts of the curves stay visible
legend()
<matplotlib.legend.Legend at 0x7f60c00f6d10>