import timeit
from astropy.io import ascii
import pandas
import numpy as np
from astropy.table import Table, Column
from cStringIO import StringIO
import matplotlib.pyplot as plt
%matplotlib inline
def make_table(size=10000, n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None):
    """Build a Table of synthetic columns for the write benchmarks.

    Columns are created in this order: ``n_floats`` columns named ``f0``,
    ``f1``, ... drawn with ``np.random.uniform(low=1, high=10)``;
    ``n_ints`` columns named ``i0``, ``i1``, ... drawn with
    ``np.random.randint(low=-9999999, high=9999999)``; and ``n_strs``
    columns named ``s0``, ``s1``, ... that repeat one string value.

    Parameters
    ----------
    size : int
        Number of rows in every column.
    n_floats, n_ints, n_strs : int
        Number of float, integer and string columns to create.
    float_format : str or None
        When not None, assigned to the ``format`` attribute of every column
        whose name starts with ``'f'``.
    str_val : str or None
        Value repeated in the string columns; ``"abcde12345"`` when None.

    Returns
    -------
    Table
        The assembled table.
    """
    text = "abcde12345" if str_val is None else str_val

    # Floats first, then ints, then strings: keeps the column order and the
    # order of the random draws fixed.
    columns = [Column(np.random.uniform(low=1, high=10, size=size),
                      name='f{}'.format(k))
               for k in xrange(n_floats)]
    columns += [Column(np.random.randint(low=-9999999, high=9999999, size=size),
                       name='i{}'.format(k))
                for k in xrange(n_ints)]
    columns += [Column(np.repeat(text, size), name='s{}'.format(k))
                for k in xrange(n_strs)]

    table = Table(columns)
    if float_format is None:
        return table

    # Only the float columns (names beginning with 'f') receive the format.
    for column in table.columns.values():
        if column.name.startswith('f'):
            column.format = float_format
    return table
# Build a small 5-row example table (the default 10 float columns) whose
# float columns carry the '%.4f' format, and display it.
t = make_table(5, float_format='%.4f')
print(t)
f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ 3.8303 8.4571 7.8665 3.8209 6.3194 3.1620 8.5229 3.3333 7.8304 8.6393 3.3609 7.6057 6.3219 1.4221 3.0528 2.6562 1.1607 1.6113 1.8004 6.0744 2.6694 4.9182 8.9608 4.3329 4.6248 5.0090 6.0787 9.0370 6.8887 6.1026 3.1817 4.0416 5.8638 1.9688 8.8393 6.3279 6.6718 9.1173 3.8640 6.1094 9.1954 8.5942 8.1698 6.5603 5.6420 5.0458 6.7071 7.8081 6.8218 3.1500
# Write the example table ``t`` to an in-memory buffer with the fast writer
# enabled and show the resulting text.
out = StringIO()
ascii.write(t, out, use_fast_writer=True)
# Call form of print, matching the other print(...) calls in this file.
print(out.getvalue())
f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 3.8303 8.4571 7.8665 3.8209 6.3194 3.1620 8.5229 3.3333 7.8304 8.6393 3.3609 7.6057 6.3219 1.4221 3.0528 2.6562 1.1607 1.6113 1.8004 6.0744 2.6694 4.9182 8.9608 4.3329 4.6248 5.0090 6.0787 9.0370 6.8887 6.1026 3.1817 4.0416 5.8638 1.9688 8.8393 6.3279 6.6718 9.1173 3.8640 6.1094 9.1954 8.5942 8.1698 6.5603 5.6420 5.0458 6.7071 7.8081 6.8218 3.1500
# Write the same example table through pandas for comparison: convert the
# table to a numpy array, wrap it in a DataFrame, and emit CSV with the
# same '%.4f' float format into an in-memory buffer.
out = StringIO()
pandas_table = pandas.DataFrame(np.array(t))
pandas_table.to_csv(out, float_format='%.4f')
# Call form of print, matching the other print(...) calls in this file.
print(out.getvalue())
,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9 0,3.8303,8.4571,7.8665,3.8209,6.3194,3.1620,8.5229,3.3333,7.8304,8.6393 1,3.3609,7.6057,6.3219,1.4221,3.0528,2.6562,1.1607,1.6113,1.8004,6.0744 2,2.6694,4.9182,8.9608,4.3329,4.6248,5.0090,6.0787,9.0370,6.8887,6.1026 3,3.1817,4.0416,5.8638,1.9688,8.8393,6.3279,6.6718,9.1173,3.8640,6.1094 4,9.1954,8.5942,8.1698,6.5603,5.6420,5.0458,6.7071,7.8081,6.8218,3.1500
def plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None, strip=True):
    """Time three ways of writing a table and plot time against row count.

    For each row count in a fixed list, a table is built with ``make_table``
    and written to a fresh ``StringIO`` buffer by

    * ``ascii.write(..., use_fast_writer=False)``,
    * ``ascii.write(..., use_fast_writer=True)``, and
    * ``pandas.DataFrame.to_csv``.

    The best (minimum) per-call time from ``timeit.repeat`` is recorded for
    each writer, the three curves are drawn on a log-log plot, and two speed
    ratios taken at the largest row count are printed.

    Parameters
    ----------
    n_floats, n_ints, n_strs : int
        Number of float, integer and string columns, passed to ``make_table``.
    float_format : str or None
        Passed to ``make_table`` and, as ``float_format``, to ``to_csv``.
    str_val : str or None
        String value for the string columns, passed to ``make_table``.
    strip : bool
        Passed to ``ascii.write`` as ``strip_whitespace``. It is not used by
        the pandas timing.

    Notes
    -----
    Rebinds the module-level names ``table``, ``np_table``, ``pandas_table``,
    ``flt_format`` and ``strip_whitespace``.
    """
    # The timeit setup strings below do ``from __main__ import ...``, so the
    # objects being timed must be module-level globals, not locals.
    global table, np_table, pandas_table, flt_format, strip_whitespace
    strip_whitespace = strip
    flt_format = float_format

    n_rows = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)  # include 50000 for publish run
    # Per-row-count timeit settings: statement executions per timing, and
    # number of timings taken.
    numbers = (10, 10, 5, 2, 1, 1, 1, 1)
    repeats = (3, 3, 3, 3, 3, 3, 3, 2)

    times_slow = []
    times_fast = []
    times_pandas = []
    for n_row, number, repeat in zip(n_rows, numbers, repeats):
        # Forward str_val so the requested string content is what actually
        # gets benchmarked (it was previously dropped here).
        table = make_table(n_row, n_floats, n_ints, n_strs, float_format, str_val)
        np_table = np.array(table)
        pandas_table = pandas.DataFrame(np_table)

        t = timeit.repeat("out = StringIO(); ascii.write(table, out, use_fast_writer=False, strip_whitespace=strip_whitespace)",
                          setup='from __main__ import ascii, table, StringIO, strip_whitespace', number=number, repeat=repeat)
        # min(t) is the total for ``number`` executions; divide for per-call time.
        times_slow.append(min(t) / number)

        t = timeit.repeat("out = StringIO(); ascii.write(table, out, use_fast_writer=True, strip_whitespace=strip_whitespace)",
                          setup='from __main__ import ascii, table, StringIO, strip_whitespace', number=number, repeat=repeat)
        times_fast.append(min(t) / number)

        t = timeit.repeat("out = StringIO(); pandas_table.to_csv(out, float_format=flt_format)",
                          setup='from __main__ import pandas_table, pandas, StringIO, flt_format', number=number, repeat=repeat)
        times_pandas.append(min(t) / number)

    plt.loglog(n_rows, times_slow, '-ob', label='io.ascii Python')
    plt.loglog(n_rows, times_fast, '-or', label='io.ascii Fast-c')
    plt.loglog(n_rows, times_pandas, '-oc', label='Pandas')
    plt.grid()
    plt.legend(loc='best')
    plt.title('n_floats={} n_ints={} n_strs={} float_format={}'.format(n_floats, n_ints, n_strs, float_format))
    plt.xlabel('Number of rows')
    plt.ylabel('Time (sec)')

    # Ratios are taken from the last entry, i.e. the largest row count.
    print('Fast-C to Python speed ratio: {:.2f} : 1'.format(times_slow[-1] / times_fast[-1]))
    print('Pandas to Fast-C speed ratio: {:.2f} : 1'.format(times_fast[-1] / times_pandas[-1]))
# Case: 10 float columns only, no explicit float format.
plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None)
Fast-C to Python speed ratio: 1.87 : 1 Pandas to Fast-C speed ratio: 1.06 : 1
# Case: 10 float, 10 integer and 10 string columns, no explicit float format.
plot_case(n_floats=10, n_ints=10, n_strs=10, float_format=None)
Fast-C to Python speed ratio: 3.63 : 1 Pandas to Fast-C speed ratio: 1.30 : 1
# Case: 10 float, 10 integer and 10 string columns, float format '%.4f'.
plot_case(n_floats=10, n_ints=10, n_strs=10, float_format='%.4f')
Fast-C to Python speed ratio: 3.91 : 1 Pandas to Fast-C speed ratio: 1.56 : 1
# Case: 10 float columns only, float format '%.4f'.
plot_case(n_floats=10, n_ints=0, n_strs=0, float_format='%.4f')
Fast-C to Python speed ratio: 1.57 : 1 Pandas to Fast-C speed ratio: 1.27 : 1
# Case: 10 string columns only; float_format, str_val and strip keep their defaults.
plot_case(n_floats=0, n_ints=0, n_strs=10)
Fast-C to Python speed ratio: 3.08 : 1 Pandas to Fast-C speed ratio: 2.01 : 1
# Case: 10 string columns only, passing a str_val that contains single quotes and a space.
plot_case(n_floats=0, n_ints=0, n_strs=10, str_val="'asdf asdfa'")
Fast-C to Python speed ratio: 2.96 : 1 Pandas to Fast-C speed ratio: 2.04 : 1
# Case: 10 string columns only, with strip=False (handed to ascii.write as strip_whitespace).
plot_case(n_floats=0, n_ints=0, n_strs=10, strip=False)
Fast-C to Python speed ratio: 3.60 : 1 Pandas to Fast-C speed ratio: 1.46 : 1
# Case: 10 integer columns only.
plot_case(n_floats=0, n_ints=10, n_strs=0)
Fast-C to Python speed ratio: 12.05 : 1 Pandas to Fast-C speed ratio: 1.23 : 1