from blaze import *
import pandas as pd

# Peek at the raw file
!head train.csv

# Baseline: load the full file with pandas (all columns as strings) and time a few aggregations
%time train_pandas = pd.read_csv('train.csv', error_bad_lines=False, index_col=False, dtype=unicode)
train_pandas.head()
train_pandas.columns

%time train_pandas[['click', 'C14']].groupby('click').mean()
%time pd.value_counts(train_pandas[['click']].values.ravel())

# Data() is used to point Blaze at the data source
%time ctr_data_csv = Data('train.csv')

# Identify the shape and column types of the dataset
ds_train = discover(ctr_data_csv)
ds_train

# Construct expressions
expr = by(ctr_data_csv.click,
          avg_c14=ctr_data_csv.C14.mean(),
          count_click=ctr_data_csv.click.count())

# Compute the expression now
%time compute_01 = compute(expr)
compute_01

# Convert data into bcolz
%time into('ctr_train.bcolz', 'train.csv', dshape=ds_train)  # This will take an hour

# Re-run the same expression against the bcolz store
%time ds_train_bcolz = Data('ctr_train.bcolz')
expr = by(ds_train_bcolz.click,
          avg_c14=ds_train_bcolz.C14.mean(),
          count_click=ds_train_bcolz.click.count())
%time compute_02 = compute(expr)
compute_02

# Same expression, computed with a multiprocessing pool
import multiprocessing
pool = multiprocessing.Pool(8)
%time compute_03 = compute(expr, map=pool.map)
compute_03

# A couple of ad-hoc column expressions
ds_train_bcolz.C14.max()
max(ctr_data_csv[ctr_data_csv.click > 0].C14)

from glob import glob

# Specifying active columns at parse time greatly improves performance
active_columns = ['click', 'C14']
intermediates = []

# Do a split-apply-combine operation on each chunk of each CSV file
for fn in sorted(glob('train.csv')):
    for df in pd.read_csv(fn, usecols=active_columns, chunksize=1000000,
                          skipinitialspace=True):
        chunk = df.groupby('click').agg({'click': ['count'],
                                         'C14': ['sum', 'mean']})
        intermediates.append(chunk)

# What's in each chunk?
chunk

# Bring those results together. These are much smaller and so likely fit in memory
df = pd.concat(intermediates, axis=0)
df.columns = ['click_count', 'c14_sum', 'c14_mean']  # Flatten multi-index

# How does the data frame look after concatenating all the chunks?
df

%%time
# Perform a second split-apply-combine operation on those intermediate results
groups = df.groupby(df.index)  # group once for many of the following applies
df2 = pd.concat([groups.click_count.sum(),
                 groups.c14_sum.sum(),
                 groups.c14_mean.sum()], axis=1)
df2['avg_c14_metric'] = df2.c14_sum / df2.c14_mean
df2['count'] = df2.click_count
df2

result = df2[['avg_c14_metric', 'click_count']]
result

%%time
# Timing the entire run
# Specifying active columns at parse time greatly improves performance
active_columns = ['click', 'C14']
intermediates = []

# Do a split-apply-combine operation on each chunk of each CSV file
for fn in sorted(glob('train.csv')):
    for df in pd.read_csv(fn, usecols=active_columns, chunksize=1000000,
                          skipinitialspace=True):
        chunk = df.groupby('click').agg({'click': ['count'],
                                         'C14': ['sum', 'mean']})
        intermediates.append(chunk)

# Bring those results together. These are much smaller and so likely fit in memory
df = pd.concat(intermediates, axis=0)
df.columns = ['click_count', 'c14_sum', 'c14_mean']  # Flatten multi-index

# Perform a second split-apply-combine operation on those intermediate results
groups = df.groupby(df.index)  # group once for many of the following applies
df2 = pd.concat([groups.click_count.sum(),
                 groups.c14_sum.sum(),
                 groups.c14_mean.sum()], axis=1)
df2['avg_c14_metric'] = df2.c14_sum / df2.c14_mean
df2['count'] = df2.click_count

# Select out the columns we want
result = df2[['avg_c14_metric', 'click_count']]
result
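# --- A minimal sketch (an assumption, not part of the original run): the same chunked
# pass can also produce an exact per-click mean of C14 by carrying per-chunk sums and
# row counts and dividing only after all chunks are combined. Names such as
# `exact_partials` and `c14_mean_exact` are illustrative, not from the original notebook.
import pandas as pd
from glob import glob

exact_partials = []
for fn in sorted(glob('train.csv')):
    for part in pd.read_csv(fn, usecols=['click', 'C14'], chunksize=1000000,
                            skipinitialspace=True):
        # Per-chunk sum and row count of C14 for each click value
        exact_partials.append(part.groupby('click')['C14'].agg(['sum', 'count']))

# Combine the per-chunk partial aggregates, then take the ratio once at the end
exact = pd.concat(exact_partials).groupby(level=0).sum()
exact['c14_mean_exact'] = exact['sum'] / exact['count']
exact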