%matplotlib inline import matplotlib.pyplot as plt import numpy as np import itertools as itt import time import shutil import os import contextlib import pandas as pd import blaze as blz import bquery import cytoolz from cytoolz.curried import pluck as cytoolz_pluck from collections import OrderedDict import copy from prettyprint import pp elapsed_times = OrderedDict() @contextlib.contextmanager def ctime(message=None): "Counts the time spent in some context" assert message is not None global elapsed_times t_elapsed = 0.0 print('\n') t = time.time() yield if message: print message + ": ", t_elapsed = time.time() - t print round(t_elapsed, 4), "sec" elapsed_times[message] = t_elapsed ga = itt.cycle(['ES', 'NL']) gb = itt.cycle(['b1', 'b2', 'b3', 'b4', 'b5']) gx = itt.cycle([1, 2]) gy = itt.cycle([-1, -2]) rootdir = 'bench-data.bcolz' if os.path.exists(rootdir): shutil.rmtree(rootdir) n_rows = 1000000 # -- data z = np.fromiter(((a, b, x, y) for a, b, x, y in itt.izip(ga, gb, gx, gy)), dtype='S2,S2,i8,i8', count=n_rows) ct = bquery.ctable(z, rootdir=rootdir) ct.flush() print('Simple Test Case') df = pd.DataFrame(z) with ctime(message='pandas'): result = df.groupby(['f0'], sort=False, as_index=False)['f2'].sum() # print(result) print('Simple Test Case') blaze_data = blz.Data(ct.rootdir) expr = blz.by(blaze_data.f0, sum_f2=blaze_data.f2.sum()) with ctime(message='blaze (pandas + bcolz)'): result = blz.compute(expr) # print result print('Simple Test Case') with ctime(message='bquery + bcolz'): result = ct.groupby(['f0'], ['f2']) # print(result) print('Simple Test Case') with ctime(message='bquery, create factorization cache'): ct.cache_factor(['f0'], refresh=True) with ctime(message='bquery + bcolz (fact. cached)'): result = ct.groupby(['f0'], ['f2']) # print(result) print('Simple Test Case Running Time') elapsed_times_bak = OrderedDict({ k: v for (k,v) in sorted(elapsed_times.iteritems())}) pp(elapsed_times_bak) print('Simple Test Case Running Time relative to Pandas') elapsed_times_bak = OrderedDict({ k: v for (k,v) in sorted(elapsed_times.iteritems())}) pp(elapsed_times_bak) elapsed_times = elapsed_times_bak elapsed_times_norm = OrderedDict({ k: v/elapsed_times['pandas'] for (k,v) in sorted(elapsed_times.iteritems())}) print '\nNormalized running time' pp(elapsed_times_norm) if 'bquery, create factorization cache' in elapsed_times_norm: base_bquery = elapsed_times_norm.pop('bquery, create factorization cache') labels = [] val = [] for k,v in sorted(elapsed_times_norm.iteritems(), reverse=True): labels.append(k) val.append(v) pos = np.arange(len(elapsed_times_norm))+.5 # the bar centers on the y axis print elapsed_times_norm.keys() plt.figure(1, figsize=[15,5]) plt.grid(True) plt.barh(pos,val, align='center') plt.barh(pos,[0, base_bquery, 0,0], left=[0, elapsed_times_norm['bquery + bcolz (fact. cached)'], 0, 0], align='center', color = '#FFFFCC') plt.yticks(pos, labels, fontsize=15) plt.xlabel('X times slower', fontsize=15) plt.title('Performance compared to pandas', fontsize=25) elapsed_times = OrderedDict() ga = itt.cycle(['ES', 'NL']) gb = itt.cycle(['b1', 'b2', 'b3', 'b4']) gc = itt.cycle([1, 2]) gd = itt.cycle([3, 4, 4, 3]) ge = itt.cycle(['c','d','e']) gx = itt.cycle([1, 2]) gy = itt.cycle([-1, -2]) gz = itt.cycle([1.11, 2.22, 3.33, 4.44, 5.55]) rootdir = 'bench-data.bcolz' if os.path.exists(rootdir): shutil.rmtree(rootdir) n_rows = 1000000 print('Rows: ', n_rows) z = np.fromiter(((a, b, c, d, e, x, y, z) for a, b, c, d, e, x, y, z in itt.izip(ga, gb, gc, gd, ge, gx, gy, gz)), dtype='S2,S2,i4,i8,S1,i4,i8,f8', count=n_rows) ct = bquery.ctable(z, rootdir=rootdir, ) # -- pandas -- df = pd.DataFrame(z) with ctime(message='pandas'): result = df.groupby(['f0','f1','f2','f3','f4'], sort=False, as_index=False)['f5','f6','f7'].sum() # print(result) # -- bquery -- with ctime(message='bquery + bcolz'): result = ct.groupby(['f0','f1','f2','f3','f4'], ['f5','f6','f7']) # print(result) with ctime(message='bquery, create factorization cache'): ct.cache_factor(['f0','f1','f2','f3','f4'], refresh=True) with ctime(message='bquery over bcolz (factorization cached)'): result = ct.groupby(['f0','f1','f2','f3','f4'], ['f5','f6','f7']) # print(result) print('Complex Test Case Running Time relative to Pandas') elapsed_times_bak = OrderedDict({ k: v for (k,v) in sorted(elapsed_times.iteritems())}) pp(elapsed_times_bak) elapsed_times = elapsed_times_bak elapsed_times_norm = OrderedDict({ k: v/elapsed_times['pandas'] for (k,v) in sorted(elapsed_times.iteritems())}) print '\nNormalized running time' pp(elapsed_times_norm) if 'bquery, create factorization cache' in elapsed_times_norm: base_bquery = elapsed_times_norm.pop('bquery, create factorization cache') labels = [] val = [] for k,v in sorted(elapsed_times_norm.iteritems(), reverse=True): labels.append(k) val.append(v) pos = np.arange(len(elapsed_times_norm))+.5 # the bar centers on the y axis print elapsed_times_norm.keys() plt.figure(1, figsize=[15,5]) plt.grid(True) plt.barh(pos,val, align='center') plt.barh(pos,[0, base_bquery, 0], left=[0, elapsed_times_norm['bquery over bcolz (factorization cached)'], 0], align='center', color = '#FFFFCC') plt.yticks(pos, labels, fontsize=15) plt.xlabel('X times slower', fontsize=15) plt.title('Performance compared to pandas', fontsize=25)