import timeit
from astropy.io import ascii
import pandas
import numpy as np
from astropy.table import Table, Column
import cStringIO as StringIO
import matplotlib.pyplot as plt
%matplotlib inline
def make_table(size=10000, n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None):
    """Build a table of generated columns and return it written out as ASCII text.

    Parameters
    ----------
    size : int
        Number of rows in every column.
    n_floats : int
        Number of columns filled by ``np.random.uniform(low=1, high=10)``;
        named ``f0``, ``f1``, ...
    n_ints : int
        Number of columns filled by
        ``np.random.randint(low=-9999999, high=9999999)``; named ``i0``, ...
    n_strs : int
        Number of columns holding ``str_val`` repeated ``size`` times;
        named ``s0``, ...
    float_format : str or None
        When not None, assigned as ``format`` on every column whose name
        starts with ``'f'``.
    str_val : str or None
        Value used for the string columns; ``"abcde12345"`` when None.

    Returns
    -------
    The ``StringIO`` buffer that the table was written to with
    ``format='ascii'``.  The buffer is not rewound here; the timing
    statements in this file call ``seek(0)`` before each read.
    """
    text = "abcde12345" if str_val is None else str_val

    # Columns are created floats first, then ints, then strings.
    columns = [
        Column(np.random.uniform(low=1, high=10, size=size), name='f{}'.format(idx))
        for idx in xrange(n_floats)
    ]
    columns.extend(
        Column(np.random.randint(low=-9999999, high=9999999, size=size), name='i{}'.format(idx))
        for idx in xrange(n_ints)
    )
    columns.extend(
        Column(np.repeat(text, size), name='s{}'.format(idx))
        for idx in xrange(n_strs)
    )

    table = Table(columns)

    if float_format is not None:
        for column in table.columns.values():
            if not column.name.startswith('f'):
                continue
            column.format = float_format

    buf = StringIO.StringIO()
    table.write(buf, format='ascii')
    return buf
def _best_time(stmt, setup, number, repeat):
    """Return the smallest per-execution time of ``stmt`` measured by ``timeit.repeat``."""
    timings = timeit.repeat(stmt, setup=setup, number=number, repeat=repeat)
    return min(timings) / number


def plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None):
    """Time four ASCII table readers over a range of table sizes and plot the results.

    For each row count a table is generated with ``make_table`` and read back
    with ``ascii.read`` (``use_fast_reader=False`` and ``use_fast_reader=True``),
    ``pandas.read_csv`` and ``np.genfromtxt``.  The best time per read is
    plotted on log-log axes against the number of rows, and two speed ratios
    for the largest table are printed.

    Parameters
    ----------
    n_floats, n_ints, n_strs : int
        Number of float, integer and string columns in the generated table.
    float_format : str or None
        Output format applied to the float columns, or None for the default.
    str_val : str or None
        Value used in the string columns; None selects ``make_table``'s default.

    Notes
    -----
    Rebinds the module-level name ``table1`` on every iteration.
    """
    # The timeit setup strings do ``from __main__ import table1``, so the
    # buffer under test has to live in the module namespace.
    global table1

    n_rows = (100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000)  # include 50000 for publish run
    numbers = (10, 10, 5, 2, 1, 1, 1, 1, 1)
    repeats = (3, 3, 3, 3, 3, 3, 3, 2, 1)

    times_slow = []
    times_fast = []
    times_pandas = []
    times_genfromtxt = []

    for n_row, number, repeat in zip(n_rows, numbers, repeats):
        # str_val must be forwarded: it was previously dropped here, so every
        # case silently used make_table's default string value.
        table1 = make_table(n_row, n_floats, n_ints, n_strs, float_format, str_val)

        times_slow.append(_best_time(
            "table1.seek(0); ascii.read(table1, use_fast_reader=False, format='basic', guess=False)",
            'from __main__ import ascii, table1', number, repeat))

        times_fast.append(_best_time(
            "table1.seek(0); ascii.read(table1, use_fast_reader=True, format='basic', guess=False)",
            'from __main__ import ascii, table1', number, repeat))

        times_pandas.append(_best_time(
            "table1.seek(0); pandas.read_csv(table1, sep=' ', header=0)",
            'from __main__ import table1, pandas', number, repeat))

        try:
            times_genfromtxt.append(_best_time(
                "table1.seek(0); np.genfromtxt(table1, names=True)",
                'from __main__ import table1, np', number, repeat))
        except ValueError:
            # NOTE(review): genfromtxt splits on whitespace and is expected to
            # reject rows whose quoted fields contain spaces (e.g. a str_val
            # like "'asdf asdfa'") -- confirm.  Record NaN so the other three
            # readers are still plotted for this case.
            times_genfromtxt.append(np.nan)

    plt.loglog(n_rows, times_slow, '-ob', label='io.ascii Python')
    plt.loglog(n_rows, times_fast, '-or', label='io.ascii Fast-c')
    plt.loglog(n_rows, times_pandas, '-oc', label='Pandas')
    plt.loglog(n_rows, times_genfromtxt, '-om', label='np.genfromtxt', alpha=0.5)
    plt.grid()
    plt.legend(loc='best')
    plt.title('n_floats={} n_ints={} n_strs={} float_format={}'.format(n_floats, n_ints, n_strs, float_format))
    plt.xlabel('Number of rows')
    plt.ylabel('Time (sec)')

    # Ratios are taken at the largest table size (last entry of each list).
    print('Fast-C to Python speed ratio: {:.2f} : 1'.format(times_slow[-1] / times_fast[-1]))
    print('Pandas to Fast-C speed ratio: {:.2f} : 1'.format(times_fast[-1] / times_pandas[-1]))
# Benchmark cases.  The commented lines under each call appear to be output
# captured from a previous run (they match the two lines plot_case prints);
# they are kept as comments so the file remains valid Python.

plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None)
# Fast-C to Python speed ratio: 2.12 : 1
# Pandas to Fast-C speed ratio: 3.60 : 1

plot_case(n_floats=10, n_ints=10, n_strs=10, float_format=None)
# Fast-C to Python speed ratio: 2.78 : 1
# Pandas to Fast-C speed ratio: 2.33 : 1

plot_case(n_floats=10, n_ints=10, n_strs=10, float_format='%.4f')
# Fast-C to Python speed ratio: 4.54 : 1
# Pandas to Fast-C speed ratio: 1.77 : 1

plot_case(n_floats=10, n_ints=0, n_strs=0, float_format='%.4f')
# Fast-C to Python speed ratio: 5.82 : 1
# Pandas to Fast-C speed ratio: 2.06 : 1

plot_case(n_floats=0, n_ints=0, n_strs=10)
# Fast-C to Python speed ratio: 3.17 : 1
# Pandas to Fast-C speed ratio: 3.43 : 1

plot_case(n_floats=0, n_ints=0, n_strs=10, str_val="'asdf asdfa'")
# Fast-C to Python speed ratio: 3.49 : 1
# Pandas to Fast-C speed ratio: 3.13 : 1

plot_case(n_floats=0, n_ints=10, n_strs=0)
# Fast-C to Python speed ratio: 6.23 : 1
# Pandas to Fast-C speed ratio: 2.20 : 1