import pandas as pd
def getstore_and_print_table(fname):
import pprint
store = pd.HDFStore(fname)
pprint.pprint(store.get_storer('df').group.table)
return store
df = pd.DataFrame(randn(1e6,2),columns=list('AB'))
%%timeit
df.to_hdf('test.h5','df',data_columns=['A','B'],mode='w',table=True, index=True)
1 loops, best of 3: 8.36 s per loop
%%timeit
df.to_hdf('test.h5','df',data_columns=['A','B'],mode='w',table=True, index=False)
1 loops, best of 3: 1.28 s per loop
store = getstore_and_print_table('test.h5')
/df/table (Table(1000000,)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,)
store
<class 'pandas.io.pytables.HDFStore'> File path: test.h5 /df frame_table (typ->appendable,nrows->1000000,ncols->2,indexers->[index],dc->[A,B])
Also, that the selection works must have something to do with the fact that it has data_columns, even so I created the table with index=False
%timeit store.select('df',['B > 0.5', 'B < 1.6'])
1 loops, best of 3: 1.02 s per loop
%timeit store.select('df',['A<0.5','A>0.0'])
1 loops, best of 3: 641 ms per loop
%timeit store.create_table_index('df',columns=['B'],kind='full')
1 loops, best of 3: 613 µs per loop
store.get_storer('df').group.table
/df/table (Table(1000000,)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,) autoIndex := True colindexes := { "B": Index(6, full, shuffle, zlib(1)).is_CSI=True}
No improvement just by creating an index. IIUC, that's because data_columns, that had been miracously created, even so I saved with index=False, had created an index automatically, just not 'full' as required by ptrepack --sortby:
%timeit store.select('df',['B > 0.5', 'B < 1.6'])
1 loops, best of 3: 1.05 s per loop
%timeit store.select('df',['A<0.5','A>0.0'])
1 loops, best of 3: 643 ms per loop
store.close()
!ptdump -v test.h5
/ (RootGroup) '' /df (Group) '' /df/table (Table(1000000,)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,) autoIndex := True colindexes := { "B": Index(6, full, shuffle, zlib(1)).is_CSI=True}
%timeit !ptrepack --chunkshape=auto --sortby=B -o test.h5 test_sorted_noprop.h5
1 loops, best of 3: 4.29 s per loop
!ptdump -v test_sorted_noprop.h5
/ (RootGroup) '' /df (Group) '' /df/table (Table(1000000,)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,)
store = getstore_and_print_table('test_sorted_noprop.h5')
/df/table (Table(1000000,)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,)
store
<class 'pandas.io.pytables.HDFStore'> File path: test_sorted_noprop.h5 /df frame_table (typ->appendable,nrows->1000000,ncols->2,indexers->[index],dc->[A,B])
%timeit store.select('df',['B > 0.5', 'B < 1.6'])
1 loops, best of 3: 188 ms per loop
try:
%timeit store.select('df',['A<0.5','A>0.0'])
except ValueError as e:
print "ValueError:",e
1 loops, best of 3: 626 ms per loop
store.close()
%timeit !ptrepack --chunkshape=auto --sortby=B --propindexes -o test.h5 test_sorted.h5
1 loops, best of 3: 7.08 s per loop
!ptdump -v test_sorted.h5
/ (RootGroup) '' /df (Group) '' /df/table (Table(1000000,)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,) autoIndex := True colindexes := { "B": Index(6, full, shuffle, zlib(1)).is_CSI=True}
store = getstore_and_print_table('test_sorted.h5')
/df/table (Table(1000000,)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,) autoIndex := True colindexes := { "B": Index(6, full, shuffle, zlib(1)).is_CSI=True}
%timeit store.select('df',['B > 0.5','B < 1.6'])
10 loops, best of 3: 172 ms per loop
try:
%timeit store.select('df',['A<0.5','A>0.0'])
except ValueError as e:
print "ValueError:",e
1 loops, best of 3: 627 ms per loop
store.close()
Compression (see next, here done at level 5, I also tested 9) doesn't make timing much worse, but certainly not better also. Interestingly, it only took marginally more than the ptrepack with --propindexes which means that that option is dominating the ptrepacking time. The filesizes for my examples were:
So my conclusion is, that doing things without index at data collection saves a lot of time, and even quite some space, without even using compression!
%timeit !ptrepack --chunkshape=auto --sortby=B --propindexes --complib=blosc --complevel=5 -o test.h5 test_sorted_compressed.h5
1 loops, best of 3: 7.21 s per loop
!ptdump -v test_sorted_compressed.h5
/ (RootGroup) '' /df (Group) '' /df/table (Table(1000000,), shuffle, blosc(5)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,) autoIndex := True colindexes := { "B": Index(6, full, shuffle, zlib(1)).is_CSI=True}
store = getstore_and_print_table('test_sorted_compressed.h5')
/df/table (Table(1000000,), shuffle, blosc(5)) '' description := { "index": Int64Col(shape=(), dflt=0, pos=0), "A": Float64Col(shape=(), dflt=0.0, pos=1), "B": Float64Col(shape=(), dflt=0.0, pos=2)} byteorder := 'little' chunkshape := (5461,) autoIndex := True colindexes := { "B": Index(6, full, shuffle, zlib(1)).is_CSI=True}
%timeit store.select('df',['B > 0.5','B < 1.6'])
1 loops, best of 3: 191 ms per loop
store.close()