import time percentages = range(76,99,3) region_boundaries = [ ('v2', 136, 1868), #27f-338r ('v2.v3', 136, 2232), ('v2.v4', 136, 4051), ('v2.v6', 136, 4932), ('v2.v8', 136, 6426), ('v2.v9', 136, 6791), ('v3', 1916, 2232), ('v3.v4', 1916, 4051), ('v3.v6', 1916, 4932), ('v3.v8', 1916, 6426), ('v3.v9', 1916, 6791), ('v4', 2263, 4051), #515f-806r ('v4.v6', 2263, 4932), ('v4.v8', 2263, 6426), ('v4.v9', 2263, 6791), ('v6', 4653, 4932), #967f-1046r ('v6.v8', 4653, 6426), ('v6.v9', 4653, 6791), ('v9', 6450, 6791), ('full.length', 0, 7682), # Start 150, 250, 400 base pair reads ('v2.150', 136, 702), ('v2.250', 136, 1752), ('v2.v3.400', 136, 2036), # Skips reads that are larger than amplicon size ('v3.v4.150', 1916, 2235), ('v3.v4.250', 1916, 2493), ('v3.v4.400', 1916, 4014), ('v4.150', 2263, 3794), ('v4.250', 2263, 4046), ('v4.v6.400', 2263, 4574), ('v6.v8.150', 4653, 5085), ('v6.v8.250', 4653, 5903), ('v6.v8.400', 4653, 6419) ] labels = [] for p in percentages: for rb in region_boundaries: labels.append("%i.%s" % (p,rb[0])) ntasks = len(labels) stages = ['Load Subalignments', 'Filter', 'Build Trees', 'Compute Distances'] with open('tasks.log') as f: lines = f.readlines() log_segments = [ lines[i*ntasks*2:(i+1)*ntasks*2] for i in range(len(stages)) ] from datetime import datetime tfmt = "%Y-%m-%d %H:%M:%S.%f" def parse_logline(line): parts = line.split() timestamp = ' '.join(parts[:2]) dt = datetime.strptime(timestamp, tfmt) msg_id = parts[4][1:-1] which = parts[5] engine_id = int(parts[7]) return msg_id, dt, which, engine_id def parse_log(lines): history = [] # ordered, to match labels, if we care started = {} finished = {} for line in lines: msg_id, dt, which, engine_id = parse_logline(line) if which == 'arrived': started[msg_id] = dt history.append(msg_id) elif which == 'finished': finished[msg_id] = dt else: raise ValueError("expected arrived or finished, got %s" % which) # list ordered by start time: runtimes = [ (finished[msg_id] - started[msg_id]).total_seconds() for msg_id in history ] wall = (max(finished.values()) - min(started.values())).total_seconds() return runtimes, wall def hms(t): """render time as HH:MM:SS""" it = int(t) h = it / 3600 m = (it % 3600) / 60 s = (t % 60) return "%3i:%02i:%02i" % (h,m,s) serial_times = {} parallel_times = {} for stage, segment in zip(stages, log_segments): fig = plt.figure() times, wall = parse_log(segment) serial = sum(times) serial_times[stage] = serial parallel_times[stage] = wall plt.title("min = %is | %s | max =%s" % (min(times),stage,hms(max(times)))) n = len(times) stimes = sorted(times) plt.plot(sorted(times)) plt.fill(range(n)+[n-1,0], sorted(times)+[0,0], alpha=0.8) # plt.hist(times, 25) plt.ylabel("time (s)") plt.xlabel("task") plt.xlim(0,n-1) display(fig) print "%s:" % stage print " serial calculation: %s" % hms(serial) print " parallel calculation: %s" % hms(wall) print " serial / parallel : %8.1fx" % (serial / wall) sys.stdout.flush() # avoid double-plot, because we used display() plt.close('all') total_serial = sum(serial_times.values()) total_wall = sum(parallel_times.values()) print "total calculation time: %s (%.1f days)" % (hms(total_serial), total_serial / (3600*24)) print "total parallel runtime: %s" % hms(total_wall) print " serial / parallel: %8.1fx" % (total_serial / total_wall) print for stage in stages: serial = serial_times[stage] wall = parallel_times[stage] print "%s:" % stage print "\t serial runtime: %s" % (hms(serial)) print "\t serial percent: %8.1f%%" % (100 * serial / total_serial) print "\tparallel runtime: %s" % (hms(wall)) print "\tparallel percent: %8.1f%%" % (100 * wall / total_wall) print