from blaze import *
import pandas as pd
# Shell out (IPython "!" magic) to preview the first lines of the raw CSV
! head train.csv
# Load the whole CSV in-memory as text columns (dtype=unicode keeps the huge ids exact).
# NOTE(review): error_bad_lines= is deprecated in modern pandas (use on_bad_lines='skip')
# and `unicode` is Python 2 only — this notebook targets a py2 / pandas-0.x stack.
%time train_pandas = pd.read_csv('train.csv',error_bad_lines=False, index_col=False, dtype=unicode)
CPU times: user 2min 15s, sys: 2min 23s, total: 4min 38s Wall time: 4min 46s
# Show the first five rows of the loaded frame
train_pandas.head()
id | click | hour | C1 | banner_pos | site_id | site_domain | site_category | app_id | app_domain | ... | device_type | device_conn_type | C14 | C15 | C16 | C17 | C18 | C19 | C20 | C21 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1000009418151094273 | 0 | 14102100 | 1005 | 0 | 1fbe01fe | f3845767 | 28905ebd | ecad2386 | 7801e8d9 | ... | 1 | 2 | 15706 | 320 | 50 | 1722 | 0 | 35 | -1 | 79 |
1 | 10000169349117863715 | 0 | 14102100 | 1005 | 0 | 1fbe01fe | f3845767 | 28905ebd | ecad2386 | 7801e8d9 | ... | 1 | 0 | 15704 | 320 | 50 | 1722 | 0 | 35 | 100084 | 79 |
2 | 10000371904215119486 | 0 | 14102100 | 1005 | 0 | 1fbe01fe | f3845767 | 28905ebd | ecad2386 | 7801e8d9 | ... | 1 | 0 | 15704 | 320 | 50 | 1722 | 0 | 35 | 100084 | 79 |
3 | 10000640724480838376 | 0 | 14102100 | 1005 | 0 | 1fbe01fe | f3845767 | 28905ebd | ecad2386 | 7801e8d9 | ... | 1 | 0 | 15706 | 320 | 50 | 1722 | 0 | 35 | 100084 | 79 |
4 | 10000679056417042096 | 0 | 14102100 | 1005 | 1 | fe8cc448 | 9166c161 | 0569f928 | ecad2386 | 7801e8d9 | ... | 1 | 0 | 18993 | 320 | 50 | 2161 | 0 | 35 | -1 | 157 |
5 rows × 24 columns
# List the 24 column names (all parsed as text)
train_pandas.columns
Index([u'id', u'click', u'hour', u'C1', u'banner_pos', u'site_id', u'site_domain', u'site_category', u'app_id', u'app_domain', u'app_category', u'device_id', u'device_ip', u'device_model', u'device_type', u'device_conn_type', u'C14', u'C15', u'C16', u'C17', u'C18', u'C19', u'C20', u'C21'], dtype='object')
# In-memory baseline: mean of C14 per click value, computed fully with pandas
%time train_pandas[['click','C14']].groupby('click').mean()
CPU times: user 1.22 s, sys: 1.7 s, total: 2.92 s Wall time: 3.31 s
C14 | |
---|---|
click | |
0 | 18976.761383 |
1 | 18181.994233 |
%time pd.value_counts(train_pandas[['click']].values.ravel())
CPU times: user 748 ms, sys: 350 ms, total: 1.1 s Wall time: 1.1 s
0 33563901 1 6865066 dtype: int64
# Data - function used to point to the data source
# Blaze's Data() is lazy: it only opens a handle, hence the millisecond timing below
%time ctr_data_csv = Data('train.csv')
CPU times: user 19.5 ms, sys: 11.3 ms, total: 30.8 ms Wall time: 35.5 ms
# Identify the shape and column types of the dataset
# discover() infers a datashape (column names + types) by sampling the CSV
ds_train = discover(ctr_data_csv)
# Display the inferred datashape
ds_train
dshape("""var * { id: string, click: int64, hour: int64, C1: int64, banner_pos: int64, site_id: string, site_domain: string, site_category: string, app_id: string, app_domain: string, app_category: string, device_id: string, device_ip: string, device_model: string, device_type: int64, device_conn_type: int64, C14: int64, C15: int64, C16: int64, C17: int64, C18: int64, C19: int64, C20: int64, C21: int64 }""")
# Construct expressions
# by() is Blaze's lazy group-by: group on click, averaging C14 and counting rows
expr = by(ctr_data_csv.click,avg_c14=ctr_data_csv.C14.mean(),count_click=ctr_data_csv.click.count())
# Compute the expression now
# compute() materializes the lazy expression by streaming over the CSV
%time compute_01 = compute(expr)
CPU times: user 43.2 s, sys: 9.04 s, total: 52.3 s Wall time: 52.5 s
# Show the grouped result computed from the raw CSV
compute_01
click | avg_c14 | count_click | |
---|---|---|---|
0 | 0 | 18976.761383 | 33563901 |
1 | 1 | 18181.994233 | 6865066 |
# Convert data into BColz
# into() copies the CSV into an on-disk compressed columnar bcolz store,
# reusing the datashape inferred above so the column types match
%time into('ctr_train.bcolz','train.csv', dshape=ds_train)
# This will take an hour
# Point Blaze at the bcolz store; again lazy, so this is near-instant
%time ds_train_bcolz = Data('ctr_train.bcolz')
CPU times: user 7.99 ms, sys: 20.3 ms, total: 28.3 ms Wall time: 69.3 ms
# Same group-by expression, now built against the columnar bcolz store
expr = by(ds_train_bcolz.click,avg_c14=ds_train_bcolz.C14.mean(),count_click=ds_train_bcolz.click.count())
%time compute_02 = compute(expr)
CPU times: user 6.86 s, sys: 5.71 s, total: 12.6 s Wall time: 20.4 s
# Same result as compute_01, but bcolz cuts the wall time roughly in half
compute_02
click | avg_c14 | count_click | |
---|---|---|---|
0 | 0 | 18976.761383 | 33563901 |
1 | 1 | 18181.994233 | 6865066 |
import multiprocessing
# Worker pool for chunk-parallel compute over the bcolz store.
# NOTE(review): the pool is never close()d/join()ed here — acceptable in an
# interactive notebook, but it leaks worker processes in a script.
pool = multiprocessing.Pool(8)
# Pass the pool's map so Blaze evaluates bcolz chunks in parallel
%time compute_03 = compute(expr,map=pool.map)
CPU times: user 186 ms, sys: 44.2 ms, total: 230 ms Wall time: 5.9 s
# Parallel result — should match compute_02
compute_03
# ds_train_bcolz.C14.<TAB>  — tab-completion exploration; incomplete expression commented out
# Pandas-like operations expressed lazily against the Blaze data source
# Max C14 among clicked rows (click > 0), evaluated lazily by Blaze
max(ctr_data_csv[ctr_data_csv.click>0].C14)
from glob import glob

# Only parse the two columns we actually need — restricting usecols at parse
# time makes read_csv substantially faster on this wide file.
active_columns = ['click', 'C14']
intermediates = []

# First pass of split-apply-combine: aggregate every 1M-row chunk of each CSV
for path in sorted(glob('train.csv')):
    reader = pd.read_csv(path, usecols=active_columns,
                         chunksize=1000000, skipinitialspace=True)
    for piece in reader:
        chunk = piece.groupby('click').agg({'click': ['count'],
                                            'C14': ['sum', 'mean']})
        intermediates.append(chunk)
# What's there in each chunk?
# (this shows only the aggregate of the final 1M-row chunk from the loop above)
chunk
click | C14 | ||
---|---|---|---|
count | sum | mean | |
click | |||
0 | 351604 | 7380262923 | 20990.270085 |
1 | 77363 | 1614597749 | 20870.412846 |
# Stack the per-chunk aggregates; these are tiny compared to the raw data,
# so the combined frame comfortably fits in memory.
df = pd.concat(intermediates)
# Replace the two-level agg columns with flat, descriptive names.
df.columns = ['click_count', 'c14_sum', 'c14_mean']
# Inspect the concatenated intermediate frame
df
click_count | c14_sum | c14_mean | |
---|---|---|---|
click | |||
0 | 839781 | 15485235097 | 18439.611157 |
1 | 160219 | 2776968054 | 17332.326715 |
0 | 837010 | 15704841688 | 18763.000000 |
1 | 162990 | 2885584400 | 17704.000000 |
0 | 809601 | 15315350620 | 18917.158724 |
1 | 190399 | 3431793636 | 18024.220905 |
0 | 818277 | 14933318159 | 18249.710256 |
1 | 181723 | 3084832029 | 16975.462814 |
0 | 826325 | 15537485195 | 18803.116443 |
1 | 173675 | 3128568228 | 18013.923869 |
0 | 843004 | 16242455011 | 19267.352244 |
1 | 156996 | 2869792215 | 18279.397023 |
0 | 861439 | 17307156868 | 20090.983654 |
1 | 138561 | 2617169629 | 18888.212621 |
0 | 857272 | 16675203706 | 19451.473635 |
1 | 142728 | 2572199256 | 18021.686397 |
0 | 825936 | 13952152313 | 16892.534425 |
1 | 174064 | 2806406042 | 16122.840116 |
0 | 819998 | 14432652499 | 17600.838659 |
1 | 180002 | 3069368429 | 17051.857363 |
0 | 826223 | 15897438092 | 19241.098459 |
1 | 173777 | 3290585448 | 18935.678761 |
0 | 800866 | 13501155227 | 16858.195038 |
1 | 199134 | 3358807679 | 16867.072820 |
0 | 824410 | 12882590472 | 15626.436448 |
1 | 175590 | 2771596619 | 15784.478723 |
0 | 826295 | 13980965879 | 16920.065932 |
1 | 173705 | 2915364308 | 16783.421939 |
0 | 823124 | 14756484543 | 17927.413783 |
1 | 176876 | 3174768304 | 17949.118614 |
... | ... | ... | ... |
0 | 829161 | 15065875099 | 18170.023794 |
1 | 170839 | 2960857595 | 17331.274446 |
0 | 826675 | 16561823475 | 20034.261923 |
1 | 173325 | 3370328925 | 19445.140199 |
0 | 860241 | 18336728906 | 21315.804415 |
1 | 139759 | 2884343460 | 20637.980094 |
0 | 850870 | 18181877915 | 21368.573243 |
1 | 149130 | 3109406311 | 20850.307188 |
0 | 845280 | 17888362685 | 21162.647507 |
1 | 154720 | 3119763717 | 20163.933021 |
0 | 851055 | 17543236640 | 20613.516917 |
1 | 148945 | 2954959976 | 19839.269368 |
0 | 842777 | 17901540358 | 21241.000000 |
1 | 157223 | 3270726603 | 20803.000000 |
0 | 852260 | 17922574250 | 21029.467827 |
1 | 147740 | 2981194374 | 20178.654217 |
0 | 844891 | 16641698024 | 19696.857966 |
1 | 155109 | 2818825075 | 18173.188371 |
0 | 841249 | 16362477086 | 19450.218765 |
1 | 158751 | 2969096121 | 18702.849878 |
0 | 848139 | 17979268744 | 21198.493105 |
1 | 151861 | 3141363709 | 20685.783111 |
0 | 832139 | 17520063181 | 21054.250769 |
1 | 167861 | 3460206354 | 20613.521628 |
0 | 831248 | 17727410244 | 21326.259124 |
1 | 168752 | 3471709753 | 20572.851006 |
0 | 815621 | 17007405385 | 20852.093540 |
1 | 184379 | 3789951606 | 20555.223784 |
0 | 351604 | 7380262923 | 20990.270085 |
1 | 77363 | 1614597749 | 20870.412846 |
82 rows × 3 columns
%%time
# Perform second split-apply-combine operation on those intermediate results
groups = df.groupby(df.index) # group once for many of the following applies
# NOTE(review): summing c14_mean across chunks yields a sum-of-means, which is
# not itself an average — only click_count and c14_sum combine correctly this way
df2 = pd.concat([groups.click_count.sum(),
groups.c14_sum.sum(),
groups.c14_mean.sum()],
axis=1)
CPU times: user 3.8 ms, sys: 880 µs, total: 4.68 ms Wall time: 4.09 ms
# BUG FIX: the combined mean is total sum over total count. The original
# divided c14_sum by c14_mean (the *sum of per-chunk means*), which is not a
# meaningful statistic and disagrees with the Blaze/pandas results shown
# earlier (avg_c14 ≈ 18976.76 for click=0, 18181.99 for click=1).
df2['avg_c14_metric'] = df2.c14_sum / df2.click_count
df2['count'] = df2.click_count
# Inspect the combined frame with the derived columns
df2
# Select out the columns we want
result = df2[['avg_c14_metric', 'click_count']]
result
%%time
# Timing the entire run
# Specifying active columns at parse time greatly improves performance
active_columns = ['click', 'C14']
intermediates = []
# Do a split-apply-combine operation on each chunk of each CSV file
for fn in sorted(glob('train.csv')):
for df in pd.read_csv(fn, usecols=active_columns,
chunksize=1000000, skipinitialspace=True):
chunk = df.groupby('click').agg({'click': ['count'],
'C14': ['sum', 'mean']})
intermediates.append(chunk)
# Bring those results together. These are much smaller and so likely fit in memory
df = pd.concat(intermediates, axis=0)
df.columns = ['click_count', 'c14_sum', 'c14_mean'] # Flatten multi-index
# Perform second split-apply-combine operation on those intermediate results
groups = df.groupby(df.index) # group once for many of the following applies
df2 = pd.concat([groups.click_count.sum(),
groups.c14_sum.sum(),
groups.c14_mean.sum()],
axis=1)
df2['avg_c14_metric'] = df2.c14_sum / df2.c14_mean
df2['count'] = df2.click_count
# Select out the columns we want
result = df2[['avg_c14_metric', 'click_count']]
result
CPU times: user 39.3 s, sys: 5.64 s, total: 45 s Wall time: 45.5 s