from blaze import *
import pandas as pd

# Peek at the raw file
!head train.csv

# Baseline: load the full file with pandas (all columns as strings) and time a few aggregations
%time train_pandas = pd.read_csv('train.csv', error_bad_lines=False, index_col=False, dtype=unicode)
train_pandas.head()
train_pandas.columns

%time train_pandas[['click', 'C14']].groupby('click').mean()
%time pd.value_counts(train_pandas[['click']].values.ravel())

# Data() is used to point Blaze at the data source
%time ctr_data_csv = Data('train.csv')

# Identify the shape and column types of the dataset
ds_train = discover(ctr_data_csv)
ds_train

# Construct expressions
expr = by(ctr_data_csv.click,
          avg_c14=ctr_data_csv.C14.mean(),
          count_click=ctr_data_csv.click.count())

# Compute the expression now
%time compute_01 = compute(expr)
compute_01

# Convert data into bcolz
%time into('ctr_train.bcolz', 'train.csv', dshape=ds_train)  # This will take an hour

# Re-run the same expression against the bcolz store
%time ds_train_bcolz = Data('ctr_train.bcolz')
expr = by(ds_train_bcolz.click,
          avg_c14=ds_train_bcolz.C14.mean(),
          count_click=ds_train_bcolz.click.count())
%time compute_02 = compute(expr)
compute_02

# Same expression, computed with a multiprocessing pool
import multiprocessing
pool = multiprocessing.Pool(8)
%time compute_03 = compute(expr, map=pool.map)
compute_03

# A couple of ad-hoc column expressions
ds_train_bcolz.C14.max()
max(ctr_data_csv[ctr_data_csv.click > 0].C14)

from glob import glob

# Specifying active columns at parse time greatly improves performance
active_columns = ['click', 'C14']
intermediates = []

# Do a split-apply-combine operation on each chunk of each CSV file
for fn in sorted(glob('train.csv')):
    for df in pd.read_csv(fn, usecols=active_columns, chunksize=1000000,
                          skipinitialspace=True):
        chunk = df.groupby('click').agg({'click': ['count'],
                                         'C14': ['sum', 'mean']})
        intermediates.append(chunk)

# What's in each chunk?
chunk

# Bring those results together. These are much smaller and so likely fit in memory
df = pd.concat(intermediates, axis=0)
df.columns = ['click_count', 'c14_sum', 'c14_mean']  # Flatten multi-index

# How does the data frame look after concatenating all the chunks?
df

%%time
# Perform a second split-apply-combine operation on those intermediate results
groups = df.groupby(df.index)  # group once for many of the following applies
df2 = pd.concat([groups.click_count.sum(),
                 groups.c14_sum.sum(),
                 groups.c14_mean.sum()], axis=1)
df2['avg_c14_metric'] = df2.c14_sum / df2.c14_mean
df2['count'] = df2.click_count
df2

result = df2[['avg_c14_metric', 'click_count']]
result

%%time
# Timing the entire run
# Specifying active columns at parse time greatly improves performance
active_columns = ['click', 'C14']
intermediates = []

# Do a split-apply-combine operation on each chunk of each CSV file
for fn in sorted(glob('train.csv')):
    for df in pd.read_csv(fn, usecols=active_columns, chunksize=1000000,
                          skipinitialspace=True):
        chunk = df.groupby('click').agg({'click': ['count'],
                                         'C14': ['sum', 'mean']})
        intermediates.append(chunk)

# Bring those results together. These are much smaller and so likely fit in memory
df = pd.concat(intermediates, axis=0)
df.columns = ['click_count', 'c14_sum', 'c14_mean']  # Flatten multi-index

# Perform a second split-apply-combine operation on those intermediate results
groups = df.groupby(df.index)  # group once for many of the following applies
df2 = pd.concat([groups.click_count.sum(),
                 groups.c14_sum.sum(),
                 groups.c14_mean.sum()], axis=1)
df2['avg_c14_metric'] = df2.c14_sum / df2.c14_mean
df2['count'] = df2.click_count

# Select out the columns we want
result = df2[['avg_c14_metric', 'click_count']]
result
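# --- A minimal sketch (an assumption, not part of the original run): the same chunked
# pass can also produce an exact per-click mean of C14 by carrying per-chunk sums and
# row counts and dividing only after all chunks are combined. Names such as
# `exact_partials` and `c14_mean_exact` are illustrative, not from the original notebook.
import pandas as pd
from glob import glob

exact_partials = []
for fn in sorted(glob('train.csv')):
    for part in pd.read_csv(fn, usecols=['click', 'C14'], chunksize=1000000,
                            skipinitialspace=True):
        # Per-chunk sum and row count of C14 for each click value
        exact_partials.append(part.groupby('click')['C14'].agg(['sum', 'count']))

# Combine the per-chunk partial aggregates, then take the ratio once at the end
exact = pd.concat(exact_partials).groupby(level=0).sum()
exact['c14_mean_exact'] = exact['sum'] / exact['count']
exact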