# Change what's inside the quotes into your MBL STAMPS server username, or something else unique
# reminder! hit shift-ENTER to execute cell & move onto next cell
username="CHANGEME"
# configure paths etc.
import sys
import os
os.environ['PYTHONPATH'] = '/usr/local/share/khmer/python'
sys.path.insert(0, '/usr/local/share/khmer/python')
import khmer
import screed
cd /mnt
/mnt
mkdir -p $username
cd $username
/mnt/CHANGEME
You should now be in '/mnt/USERNAME', where USERNAME is not CHANGEME
import random
random.seed(1)
x = ["A"] + ["G"] + ["C"] + ["T"]
x = x*1000
random.shuffle(x)
x = "".join(x)
y = ["A"] + ["G"] + ["C"] + ["T"]
y = y*1000
random.shuffle(y)
y = "".join(y)
print 'x is', x[:100]
print 'y is', y[:100]
x is CAGACTTGGAAGCTGAGAGTCCGACGTCACTGCCTCAACTCGCGCAAATGTTCCCGCCAAATTGTATCCTAGGGATCTTCCATAAGCTTATATACGGGGG y is ATTAGACCACATGTTGCTAGCTATTCGGAGGAAGGCAGTTTAGCTAAATATGTGAGGGCTGTGGGACGAATATCGGGGGATGACGGTCCCATAGCTAGTT
outfp = open('metagenome.fa', 'w')
print >>outfp, ">x 1"
print >>outfp, x
print >>outfp, ">y 2"
print >>outfp, y
outfp.close()
!python /usr/local/share/2012-paper-diginorm/pipeline/make-biased-reads.py metagenome.fa | head -100000 > reads.fa
Traceback (most recent call last): File "/usr/local/share/2012-paper-diginorm/pipeline/make-biased-reads.py", line 56, in <module> print '>read%d\n%s' % (i, read) IOError: [Errno 32] Broken pipe
(Yes, you should see an error.)
!/usr/local/share/khmer/scripts/do-partition.py -k 32 -x 1e8 reads reads.fa
PARAMETERS: - kmer size = 32 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 5e+07 bytes (n_hashes x min_hashsize / 8) -------- Saving hashtable to reads Loading kmers from sequences in ['reads.fa'] -- SUBSET SIZE 100000 N THREADS 4 -- making hashtable consuming input reads.fa fp rate estimated to be 0.000 ** Traverse all the things: stop_big_traversals is false. enqueued 1 subset tasks starting 1 threads --- starting: reads 0 done starting threads saving: reads 0 exiting --- done making subsets! see reads.subset.*.pmap loading 1 pmap files (first one: reads.subset.0.pmap) merging reads.subset.0.pmap removing pmap files outputting partitions for reads.fa output 3 partitions for reads.fa partitions are in reads.fa.part
# extract partitions
!/usr/local/share/khmer/scripts/extract-partitions.py -X 5 reads reads.fa.part
--- reading partitioned files: ['reads.fa.part'] outputting to files named "reads.groupN.fa" min reads to keep a partition: 5 max size of a group file: 5 partition size distribution will go to reads.dist --- ... 0 2 groups ...x2 0
ls
metagenome.fa reads.fa reads.group0000.fa reads.info reads.dist reads.fa.part reads.group0001.fa
# calculate abundance histograms for these suckers
!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 reads.group0000.fa group0.hist
!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 reads.group0001.fa group1.hist
PARAMETERS: - kmer size = 32 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- making hashtable building tracking ht K: 32 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to group0.hist consuming input, round 1 -- reads.group0000.fa preparing hist from reads.group0000.fa... consuming input, round 2 -- reads.group0000.fa PARAMETERS: - kmer size = 32 (-k) - n hashes = 4 (-N) - min hashsize = 1e+08 (-x) Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize) -------- making hashtable building tracking ht K: 32 HT sizes: [100000007, 100000037, 100000039, 100000049] outputting to group1.hist consuming input, round 1 -- reads.group0001.fa preparing hist from reads.group0001.fa... consuming input, round 2 -- reads.group0001.fa
hist0 = numpy.loadtxt('group0.hist')
hist1 = numpy.loadtxt('group1.hist')
plot(hist0[:,0], hist0[:,1], label='group0')
plot(hist1[:,0], hist1[:,1], label='group1')
xlabel("k-mer abundance")
ylabel("N of k-mers with that abundance")
axis(ymax=500)
legend()
<matplotlib.legend.Legend at 0x2956750>