Figure generation for A probabilistic approach to k-mer counting =============
import numpy
# Directories that hold the '*.time' log files read below, one per benchmark
# machine.
datadirs = ['../execublog1/data-athyra/',
'../execublog1/data-ec2-i1/',
'../execublog1/data-ec2-i2/',
'../execublog1/data-ec2-i3/']
# Display label for each entry of `datadirs` (matched by position).
names = ['our server', 'm2.2xlarge', 'm2.2xlarge/1TB EBS 100 IOPS', 'm2.4xlarge']
# Sort key for each machine (matched by position).  The plotting code iterates
# over sorted(zip(order, names, datadirs)), so these letters decide the
# left-to-right order of the machines: data-ec2-i3 ('c') comes before
# data-ec2-i2 ('d').
order = ['a', 'b', 'd', 'c']
# note: different zones!
# NOTE(review): presumably refers to the EC2 instances having run in different
# availability zones -- confirm.
# `figsize` is not imported in this file; presumably it comes from the IPython
# pylab namespace -- confirm.
figsize(12,6)
pwd
u'/Users/t/Documents/papers/khmer-counting/notebook'
def get_time_mem(filename):
    """Return (wall-clock seconds, max memory) parsed from a 'time' log.

    The first line of `filename` that contains 'system' is parsed as a line
    of the form generated by the 'time' command, e.g.::

        12.30user 4.56system 1:02:03elapsed 99%CPU (0avgtext+0avgdata 1234maxresident)k

    The elapsed field may be 'ss', 'mm:ss' or 'hh:mm:ss'.  The memory value
    is the number found between 'avgdata ' and 'max', returned as a float in
    whatever unit the log uses.

    Returns a tuple (wall_seconds, mem) of floats.
    Raises Exception(filename) when no line contains 'system'.
    """
    with open(filename) as timing_log:
        for line in timing_log:
            line = line.rstrip()
            if 'system' not in line:
                continue

            # The token right before 'elapsed' is the wall-clock time.
            walltime = line.split('elapsed')[0].split()[-1].split(':')
            assert len(walltime) <= 3

            hours = 0.
            minutes = 0.
            if len(walltime) == 3:
                hours = float(walltime[0])
                minutes = float(walltime[1])
            elif len(walltime) == 2:
                minutes = float(walltime[0])
            # Seconds are always the LAST component.  Indexing with [1] would
            # count the minutes twice for 'hh:mm:ss' and fail for a bare 'ss'.
            seconds = float(walltime[-1])
            wall_seconds = hours*60*60 + minutes*60 + seconds

            mem = line.split('avgdata ')[1].split('max')[0]
            return float(wall_seconds), float(mem)

    raise Exception(filename)
# Wall-clock seconds per tool, keyed by data directory.
jelly, khmer, dsk, tally = {}, {}, {}, {}

# For each table: the timing files whose wall times are added together.
_wall_sources = (
    (tally, ('mkindex_5_part1_22.time', 'suffix_5_part1.time')),
    (jelly, ('jelly_5_22.time1', 'jelly_5_22.time2')),
    (khmer, ('bloom_5_1_22.time1',)),
    (dsk, ('dsk_5_22.time',)),
)
for datadir in datadirs:
    for _table, _suffixes in _wall_sources:
        _table[datadir] = sum(get_time_mem(datadir + _suffix)[0]
                              for _suffix in _suffixes)

print(jelly)
print(khmer)
print(dsk)

# Flatten the tables into per-tool series, ordered by the `order` sort keys.
jelly_y, dsk_y, khmer_y, tally_y, labels = [], [], [], [], []
_wall_series = (
    (jelly_y, jelly),
    (dsk_y, dsk),
    (khmer_y, khmer),
    (tally_y, tally),
)
for _, label, dirname in sorted(zip(order, names, datadirs)):
    print(dirname)
    labels.append(label)
    for _series, _table in _wall_series:
        _series.append(_table[dirname])
{'../execublog1/data-athyra/': 270.89, '../execublog1/data-ec2-i1/': 713.1400000000001, '../execublog1/data-ec2-i3/': 394.14, '../execublog1/data-ec2-i2/': 7661.01} {'../execublog1/data-athyra/': 803.74, '../execublog1/data-ec2-i1/': 1271.47, '../execublog1/data-ec2-i3/': 736.7, '../execublog1/data-ec2-i2/': 1890.36} {'../execublog1/data-athyra/': 315.41, '../execublog1/data-ec2-i1/': 1487.29, '../execublog1/data-ec2-i3/': 1668.58, '../execublog1/data-ec2-i2/': 9640.0} ../execublog1/data-athyra/ ../execublog1/data-ec2-i1/ ../execublog1/data-ec2-i3/ ../execublog1/data-ec2-i2/
import numpy as np
import matplotlib.pyplot as plt
N = 5
width = 0.2           # the width of the bars
ind = np.arange(4)    # the x locations for the groups

fig = plt.figure()
ax = fig.add_subplot(111)

# One bar series per tool, each shifted right by one more bar width.
rects1, rects2, rects3, rects4 = (
    ax.bar(ind + offset, heights, width, color=colour)
    for offset, heights, colour in (
        (0, jelly_y, 'r'),
        (width, dsk_y, 'y'),
        (2*width, tally_y, 'g'),
        (3*width, khmer_y, 'b'),
    )
)

# Axis decorations.
ax.set_title('Time to generate abundance histograms of 48.7m reads')
ax.set_ylabel('Time (s)')
ax.set_ylim(0, 20000)
ax.set_xticks(ind+2*width)
ax.set_xticklabels(labels)

# The legend uses the first bar of every series as its colour sample.
legend_handles = (rects1[0], rects2[0], rects3[0], rects4[0])
legend_names = ('Jellyfish', 'DSK (no threads!)', 'Tallymer', 'khmer (1% fp)')
ax.legend(legend_handles, legend_names, loc='upper left')
def autolabel(rects):
    """Write each bar's height, truncated to an integer, above the bar.

    The text is drawn on the module-level axes `ax`, horizontally centred on
    the bar and anchored at 1.05 times the bar height.
    """
    for bar in rects:
        bar_height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        caption = '%d' % int(bar_height)
        ax.text(x_center, 1.05*bar_height, caption,
                ha='center', va='bottom')
# Label the bars of every series, then write the figure to disk.
for _bar_series in (rects1, rects2, rects3, rects4):
    autolabel(_bar_series)

# Interactive display is left disabled; only the PNG is produced.
#plt.show()
savefig('../execublog1/time.png')
def _get_time_all(filename):
for line in open(filename):
line = line.rstrip()
if 'system' in line:
fields1 = line.split('user')
user = float(fields1[0])
fields1b = line.split('system')[0].split()[-1]
system = float(fields1b)
walltime = line.split('elapsed')[0].split()[-1].rsplit(':')
assert len(walltime) <= 3
hours = 0.
minutes = 0.
seconds = walltime[-1]
if len(walltime) == 3:
hours = float(walltime[0])
minutes = float(walltime[1])
elif len(walltime) == 2:
minutes = float(walltime[0])
wall_seconds = hours*60*60 + minutes*60 + float(walltime[1])
return float(system), float(user), float(wall_seconds)
raise Exception(filename)
def get_time_all(*filenames):
    """Add up the timing triples of several 'time' logs, component-wise.

    Every file is parsed with _get_time_all and the three components are
    accumulated separately, so the result has the same ordering as that
    function's return value.  Called without filenames it returns
    (0.0, 0.0, 0.0).
    """
    totals = [0., 0., 0.]
    for filename in filenames:
        first, second, third = _get_time_all(filename)
        totals[0] += first
        totals[1] += second
        totals[2] += third
    return tuple(totals)
# Summed timing triples (as returned by get_time_all) per tool, keyed by
# data directory.
jelly2, khmer2, dsk2, tally2 = {}, {}, {}, {}

# For each table: the timing files that get_time_all adds together.
_triple_sources = (
    (tally2, ('mkindex_5_part1_22.time', 'suffix_5_part1.time')),
    (jelly2, ('jelly_5_22.time1', 'jelly_5_22.time2')),
    (khmer2, ('bloom_5_1_22.time1',)),
    (dsk2, ('dsk_5_22.time',)),
)
for datadir in datadirs:
    for _table, _suffixes in _triple_sources:
        _table[datadir] = get_time_all(*(datadir + _suffix
                                         for _suffix in _suffixes))
def get_wall_ratio(triple):
    """Return element 1 of `triple` divided by element 2.

    For the (system, user, wall) triples produced by get_time_all this is
    user time divided by wall-clock time.
    """
    user_seconds, wall_seconds = triple[1], triple[2]
    return user_seconds / wall_seconds
def get_sys_ratio(triple):
    """Return element 0 of `triple` divided by element 2.

    For the (system, user, wall) triples produced by get_time_all this is
    system time divided by wall-clock time.
    """
    system_seconds, wall_seconds = triple[0], triple[2]
    return system_seconds / wall_seconds
# system/wall and user/wall ratios per tool, keyed by data directory.
jelly_sysr, khmer_sysr, dsk_sysr, tally_sysr = {}, {}, {}, {}
jelly_wallr, khmer_wallr, dsk_wallr, tally_wallr = {}, {}, {}, {}

# (source triples, system-ratio table, user-ratio table) for every tool.
_ratio_tables = (
    (jelly2, jelly_sysr, jelly_wallr),
    (khmer2, khmer_sysr, khmer_wallr),
    (dsk2, dsk_sysr, dsk_wallr),
    (tally2, tally_sysr, tally_wallr),
)
for datadir in datadirs:
    for _triples, _sysr, _wallr in _ratio_tables:
        _sysr[datadir] = get_sys_ratio(_triples[datadir])
        _wallr[datadir] = get_wall_ratio(_triples[datadir])

# Dump everything for inspection: labelled system ratios first, then the raw
# triples, then the user/wall ratios.
for _tool, _ratios in (('jelly', jelly_sysr), ('dsk', dsk_sysr),
                       ('tally', tally_sysr), ('khmer', khmer_sysr)):
    print(_tool + ' ' + str(_ratios))
for _table in (jelly2, khmer2, dsk2, tally2,
               jelly_wallr, dsk_wallr, tally_wallr, khmer_wallr):
    print(_table)
jelly {'../execublog1/data-athyra/': 0.09575842592934403, '../execublog1/data-ec2-i1/': 0.08374232268558766, '../execublog1/data-ec2-i3/': 0.17181712082001319, '../execublog1/data-ec2-i2/': 0.006756289314333227} dsk {'../execublog1/data-athyra/': 0.4910751085888208, '../execublog1/data-ec2-i1/': 0.21867961191159763, '../execublog1/data-ec2-i3/': 0.23135240743626317, '../execublog1/data-ec2-i2/': 0.036559128630705394} tally {'../execublog1/data-athyra/': 0.015849034493726512, '../execublog1/data-ec2-i1/': 0.05362553154531455, '../execublog1/data-ec2-i3/': 0.05306034072376927, '../execublog1/data-ec2-i2/': 0.014740818934385244} khmer {'../execublog1/data-athyra/': 0.07356856694951104, '../execublog1/data-ec2-i1/': 0.2201074347015659, '../execublog1/data-ec2-i3/': 0.4807248540790009, '../execublog1/data-ec2-i2/': 0.18518165852007026} {'../execublog1/data-athyra/': (25.94, 2085.48, 270.89), '../execublog1/data-ec2-i1/': (59.72, 1475.93, 713.1400000000001), '../execublog1/data-ec2-i3/': (67.72, 1921.76, 394.14), '../execublog1/data-ec2-i2/': (51.76, 1812.48, 7661.01)} {'../execublog1/data-athyra/': (59.13, 6267.23, 803.74), '../execublog1/data-ec2-i1/': (279.86, 4636.07, 1271.47), '../execublog1/data-ec2-i3/': (354.15, 5222.75, 736.7), '../execublog1/data-ec2-i2/': (350.06, 5783.4, 1890.36)} {'../execublog1/data-athyra/': (154.89, 1840.55, 315.41), '../execublog1/data-ec2-i1/': (325.24, 1099.45, 1487.29), '../execublog1/data-ec2-i3/': (386.03, 1265.75, 1668.58), '../execublog1/data-ec2-i2/': (352.43, 1356.46, 9640.0)} {'../execublog1/data-athyra/': (75.74000000000001, 4650.01, 4778.84), '../execublog1/data-ec2-i1/': (321.07, 5015.89, 5987.26), '../execublog1/data-ec2-i3/': (345.53, 5738.29, 6512.02), '../execublog1/data-ec2-i2/': (265.32, 6730.02, 17999.0)} {'../execublog1/data-athyra/': 7.698623057329544, '../execublog1/data-ec2-i1/': 2.0696216731637547, '../execublog1/data-ec2-i3/': 4.875830923022276, '../execublog1/data-ec2-i2/': 0.23658499336249397} 
{'../execublog1/data-athyra/': 5.835420563710725, '../execublog1/data-ec2-i1/': 0.7392304123607367, '../execublog1/data-ec2-i3/': 0.7585791511344976, '../execublog1/data-ec2-i2/': 0.14071161825726142} {'../execublog1/data-athyra/': 0.9730415749428731, '../execublog1/data-ec2-i1/': 0.8377605114860555, '../execublog1/data-ec2-i3/': 0.8811843329719503, '../execublog1/data-ec2-i2/': 0.3739107728207123} {'../execublog1/data-athyra/': 7.7975837957548455, '../execublog1/data-ec2-i1/': 3.646228381322406, '../execublog1/data-ec2-i3/': 7.089385095697027, '../execublog1/data-ec2-i2/': 3.059417253856408}
# Per-machine user/wall ratios for each tool, ordered by the `order` sort keys.
jelly_w, dsk_w, khmer_w, tally_w, labels = [], [], [], [], []

_ratio_series = (
    (jelly_w, jelly_wallr),
    (dsk_w, dsk_wallr),
    (khmer_w, khmer_wallr),
    (tally_w, tally_wallr),
)
for _, label, dirname in sorted(zip(order, names, datadirs)):
    print(dirname)
    labels.append(label)
    for _series, _table in _ratio_series:
        _series.append(_table[dirname])
../execublog1/data-athyra/ ../execublog1/data-ec2-i1/ ../execublog1/data-ec2-i3/ ../execublog1/data-ec2-i2/
import numpy as np
import matplotlib.pyplot as plt
# Grouped bar chart of the user-time / wall-time ratio per tool and machine.
N = 5
ind = np.arange(4)    # the x locations for the groups
width = 0.2           # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

# One bar series per tool, each shifted right by one more bar width.
rects1 = ax.bar(ind, jelly_w, width, color='r')
rects2 = ax.bar(ind+width, dsk_w, width, color='y')
rects3 = ax.bar(ind+2*width, tally_w, width, color='g')
rects4 = ax.bar(ind+3*width, khmer_w, width, color='b')

# The plotted values are unitless ratios, so the axis must not be labelled
# as a time in seconds.
ax.set_ylabel('user time / wall time')
ax.set_title('ratio of usertime to walltime')
ax.set_xticks(ind+2*width)
ax.set_xticklabels(labels)
ax.set_ylim(0, 10)

# The legend uses the first bar of every series as its colour sample.
ax.legend((rects1[0], rects2[0], rects3[0], rects4[0]),
          ('Jellyfish', 'DSK (no threads!)', 'Tallymer', 'khmer (1% fp)'),
          loc='upper right')
def autolabel(rects):
    """Write each bar's height, rounded to two decimals, above the bar.

    The text is drawn on the module-level axes `ax`, horizontally centred on
    the bar and anchored at 1.05 times the bar height.
    """
    for rect in rects:
        height = rect.get_height()
        # '%1.2f' already rounds to the nearest hundredth.  Adding 0.005
        # first would round a second time and bias labels upward
        # (e.g. 0.9730 would be shown as 0.98).
        ax.text(rect.get_x()+rect.get_width()/2., 1.05*height,
                '%1.2f' % height,
                ha='center', va='bottom')
# Label the bars of every series, then write the figure to disk.
for _bar_series in (rects1, rects2, rects3, rects4):
    autolabel(_bar_series)

# Interactive display is left disabled; only the PNG is produced.
#plt.show()
savefig('../execublog1/usertime_ratio.png')
# Per-machine system time (element 0 of each summed triple) for each tool,
# ordered by the `order` sort keys.
jelly_z, dsk_z, khmer_z, tally_z, labels = [], [], [], [], []

_system_series = (
    (jelly_z, jelly2),
    (dsk_z, dsk2),
    (khmer_z, khmer2),
    (tally_z, tally2),
)
for _, label, dirname in sorted(zip(order, names, datadirs)):
    print(dirname)
    labels.append(label)
    for _series, _triples in _system_series:
        _series.append(_triples[dirname][0])
../execublog1/data-athyra/ ../execublog1/data-ec2-i1/ ../execublog1/data-ec2-i3/ ../execublog1/data-ec2-i2/
import numpy as np
import matplotlib.pyplot as plt
N = 5
width = 0.2           # the width of the bars
ind = np.arange(4)    # the x locations for the groups

fig = plt.figure()
ax = fig.add_subplot(111)

# One bar series per tool, each shifted right by one more bar width.
rects1, rects2, rects3, rects4 = (
    ax.bar(ind + offset, heights, width, color=colour)
    for offset, heights, colour in (
        (0, jelly_z, 'r'),
        (width, dsk_z, 'y'),
        (2*width, tally_z, 'g'),
        (3*width, khmer_z, 'b'),
    )
)

# Axis decorations.
ax.set_title('system time')
ax.set_ylabel('Time (s)')
ax.set_ylim(0, 450)
ax.set_xticks(ind+2*width)
ax.set_xticklabels(labels)

# The legend uses the first bar of every series as its colour sample.
legend_handles = (rects1[0], rects2[0], rects3[0], rects4[0])
legend_names = ('Jellyfish', 'DSK (no threads!)', 'Tallymer', 'khmer (1% fp)')
ax.legend(legend_handles, legend_names, loc='upper left')
def autolabel(rects):
    """Write each bar's height, rounded to one decimal, above the bar.

    The text is drawn on the module-level axes `ax`, horizontally centred on
    the bar and anchored at 1.05 times the bar height.
    """
    for rect in rects:
        height = rect.get_height()
        # '%.1f' already rounds to the nearest tenth.  Adding 0.05 first
        # would round a second time and bias labels upward
        # (e.g. 25.94 would be shown as 26.0).
        ax.text(rect.get_x()+rect.get_width()/2., 1.05*height,
                '%.1f' % height,
                ha='center', va='bottom')
# Label the bars of every series, then write the figure to disk.
for _bar_series in (rects1, rects2, rects3, rects4):
    autolabel(_bar_series)

# Interactive display is left disabled; only the PNG is produced.
#plt.show()
savefig('../execublog1/system_time.png')