In [1]:

# Change what's inside the quotes into your MBL STAMPS server username, or something else unique
# reminder! hit shift-ENTER to execute cell & move onto next cell
username="CHANGEME"

In [2]:

# configure paths etc.
import sys
import os
os.environ['PYTHONPATH'] = '/usr/local/share/khmer/python'
sys.path.insert(0, '/usr/local/share/khmer/python')

import khmer
import screed

In [3]:

cd /mnt

/mnt

In [4]:

mkdir -p $username

In [5]:

cd $username

/mnt/CHANGEME

You should now be in '/mnt/USERNAME', where USERNAME is not CHANGEME

Make two random genomes with a 10:1 abundance¶

In [6]:

import random
random.seed(1)

x = ["A"] + ["G"] + ["C"] + ["T"]
x = x*1000
random.shuffle(x)
x = "".join(x)

y = ["A"] + ["G"] + ["C"] + ["T"]
y = y*1000
random.shuffle(y)
y = "".join(y)

print 'x is', x[:100]
print 'y is', y[:100]

x is CAGACTTGGAAGCTGAGAGTCCGACGTCACTGCCTCAACTCGCGCAAATGTTCCCGCCAAATTGTATCCTAGGGATCTTCCATAAGCTTATATACGGGGG
y is ATTAGACCACATGTTGCTAGCTATTCGGAGGAAGGCAGTTTAGCTAAATATGTGAGGGCTGTGGGACGAATATCGGGGGATGACGGTCCCATAGCTAGTT

In [7]:

outfp = open('metagenome.fa', 'w')
print >>outfp, ">x 1"
print >>outfp, x
print >>outfp, ">y 2"
print >>outfp, y
outfp.close()

In [8]:

!python /usr/local/share/2012-paper-diginorm/pipeline/make-biased-reads.py metagenome.fa | head -100000 > reads.fa

Traceback (most recent call last):
  File "/usr/local/share/2012-paper-diginorm/pipeline/make-biased-reads.py", line 56, in <module>
    print '>read%d\n%s' % (i, read)
IOError: [Errno 32] Broken pipe

(Yes, you should see an error.)

Now, run partitioning.¶

In [9]:

!/usr/local/share/khmer/scripts/do-partition.py -k 32 -x 1e8 reads reads.fa

PARAMETERS:
 - kmer size =    32 		(-k)
 - n hashes =     4 		(-N)
 - min hashsize = 1e+08 	(-x)

Estimated memory usage is 5e+07 bytes (n_hashes x min_hashsize / 8)
--------
Saving hashtable to reads
Loading kmers from sequences in ['reads.fa']
--
SUBSET SIZE 100000
N THREADS 4
--
making hashtable
consuming input reads.fa
fp rate estimated to be 0.000
** Traverse all the things: stop_big_traversals is false.
enqueued 1 subset tasks
starting 1 threads
---
starting: reads 0
 done starting threads
saving: reads 0
exiting
---
done making subsets! see reads.subset.*.pmap
loading 1 pmap files (first one: reads.subset.0.pmap)
merging reads.subset.0.pmap
removing pmap files
outputting partitions for reads.fa
output 3 partitions for reads.fa
partitions are in reads.fa.part

In [10]:

# extract partitions
!/usr/local/share/khmer/scripts/extract-partitions.py -X 5 reads reads.fa.part

---
reading partitioned files: ['reads.fa.part']
outputting to files named "reads.groupN.fa"
min reads to keep a partition: 5
max size of a group file: 5
partition size distribution will go to reads.dist
---
... 0
2 groups
...x2 0

In [11]:

ls

metagenome.fa  reads.fa       reads.group0000.fa  reads.info
reads.dist     reads.fa.part  reads.group0001.fa

In [12]:

# calculate abundance histograms for these suckers
!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 reads.group0000.fa group0.hist
!/usr/local/share/khmer/scripts/abundance-dist-single.py -x 1e8 reads.group0001.fa group1.hist

PARAMETERS:
 - kmer size =    32 		(-k)
 - n hashes =     4 		(-N)
 - min hashsize = 1e+08 	(-x)

Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize)
--------
making hashtable
building tracking ht
K: 32
HT sizes: [100000007, 100000037, 100000039, 100000049]
outputting to group0.hist
consuming input, round 1 -- reads.group0000.fa
preparing hist from reads.group0000.fa...
consuming input, round 2 -- reads.group0000.fa

PARAMETERS:
 - kmer size =    32 		(-k)
 - n hashes =     4 		(-N)
 - min hashsize = 1e+08 	(-x)

Estimated memory usage is 4e+08 bytes (n_hashes x min_hashsize)
--------
making hashtable
building tracking ht
K: 32
HT sizes: [100000007, 100000037, 100000039, 100000049]
outputting to group1.hist
consuming input, round 1 -- reads.group0001.fa
preparing hist from reads.group0001.fa...
consuming input, round 2 -- reads.group0001.fa

In [13]:

hist0 = numpy.loadtxt('group0.hist')
hist1 = numpy.loadtxt('group1.hist')

In [14]:

plot(hist0[:,0], hist0[:,1], label='group0')
plot(hist1[:,0], hist1[:,1], label='group1')

xlabel("k-mer abundance")
ylabel("N of k-mers with that abundance")
axis(ymax=500)
legend()

Out[14]:

<matplotlib.legend.Legend at 0x2956750>

In [14]: