"""Benchmark ASCII table readers on synthetic tables.

For a range of row counts this script writes a random table to a temporary
file and times how long ``astropy.io.ascii`` (pure-Python reader and fast
reader, with and without the fast converter), ``pandas.read_csv`` and
``numpy.genfromtxt`` take to read it back, then plots the timings on
log-log axes and prints a few speed ratios.
"""
import random
import string
import timeit
from tempfile import NamedTemporaryFile

import matplotlib.pyplot as plt
import numpy as np
import pandas
from astropy.io import ascii
from astropy.table import Table, Column

# Equivalent of the ``%matplotlib inline`` notebook magic.  Under
# IPython/Jupyter ``get_ipython`` is injected into the namespace; a plain
# interpreter does not define it, in which case the step is skipped so the
# file stays valid Python.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


def make_table(table, size=10000, n_floats=10, n_ints=0, n_strs=0,
               float_format=None, str_val=None):
    """Write a randomly generated table to ``table.name`` in ASCII format.

    Parameters
    ----------
    table : object with a ``name`` attribute
        Only ``table.name`` is used, as the output path (``plot_case``
        passes a ``NamedTemporaryFile``).
    size : int
        Number of rows.
    n_floats, n_ints, n_strs : int
        Number of float, integer and string columns.  Columns are named
        ``f<i>``, ``i<i>`` and ``s<i>`` respectively.
    float_format : str or None
        If given, assigned as the ``format`` of every column whose name
        starts with ``'f'``.
    str_val : str or None
        Value repeated in every string cell.  ``None`` means
        ``"abcde12345"``; the special value ``'random'`` generates a fresh
        10-letter random string for every cell instead.
    """
    if str_val is None:
        str_val = "abcde12345"

    cols = []
    # ``range`` / ``string.ascii_letters`` are used instead of the
    # Python-2-only ``xrange`` / ``string.letters`` so this runs on both
    # major versions.
    for i in range(n_floats):
        dat = np.random.uniform(low=1, high=10, size=size)
        cols.append(Column(dat, name='f{}'.format(i)))
    for i in range(n_ints):
        dat = np.random.randint(low=-9999999, high=9999999, size=size)
        cols.append(Column(dat, name='i{}'.format(i)))
    for i in range(n_strs):
        if str_val == 'random':
            dat = np.array([
                ''.join(random.choice(string.ascii_letters)
                        for _ in range(10))
                for _ in range(size)
            ])
        else:
            dat = np.repeat(str_val, size)
        cols.append(Column(dat, name='s{}'.format(i)))

    t = Table(cols)

    if float_format is not None:
        for col in t.columns.values():
            if col.name.startswith('f'):
                col.format = float_format

    # NOTE(review): when called from plot_case the target file already
    # exists on disk; confirm the installed astropy overwrites it silently
    # (newer releases may require ``overwrite=True``).
    t.write(table.name, format='ascii')


def _best_time(stmt, setup, number, repeat):
    """Return the fastest per-call time, in seconds, of *stmt*."""
    timings = timeit.repeat(stmt, setup=setup, number=number, repeat=repeat)
    return min(timings) / number


def plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None,
              str_val=None, genfromtxt=True):
    """Time every reader on one table layout, plot and print the results.

    Parameters
    ----------
    n_floats, n_ints, n_strs, float_format, str_val
        Forwarded to ``make_table`` to define the table layout.
    genfromtxt : bool
        Whether to include ``np.genfromtxt`` in the comparison.

    Notes
    -----
    The timed statements are executed by ``timeit`` and fetch ``ascii``,
    ``np``, ``pandas`` and ``table1`` through ``from __main__ import ...``.
    ``table1`` is therefore kept as a module-level global, and this code
    must run as ``__main__`` (a script or a notebook session).
    """
    global table1

    n_rows = (100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000)  # include 50000 for publish run
    numbers = (10, 10, 5, 2, 1, 1, 1, 1, 1)
    repeats = (3, 3, 3, 3, 3, 3, 3, 2, 1)

    times_slow = []
    times_fast = []
    times_fast_converter = []
    times_pandas = []
    times_genfromtxt = []

    # NOTE(review): ``use_fast_reader`` / ``use_fast_converter`` are passed
    # as direct keywords in the statements below; confirm they match the
    # installed astropy (current documentation describes a ``fast_reader``
    # option instead).
    for n_row, number, repeat in zip(n_rows, numbers, repeats):
        # The context manager closes (and thereby deletes) each temporary
        # file as soon as its timings are done instead of leaving that to
        # garbage collection.
        with NamedTemporaryFile() as table1:
            make_table(table1, n_row, n_floats, n_ints, n_strs,
                       float_format, str_val)

            times_slow.append(_best_time(
                "ascii.read(table1.name, use_fast_reader=False, format='basic', guess=False)",
                'from __main__ import ascii, table1', number, repeat))

            times_fast.append(_best_time(
                "ascii.read(table1.name, format='basic', guess=False)",
                'from __main__ import ascii, table1', number, repeat))

            # The fast converter only matters when there are float columns.
            if n_floats > 0:
                times_fast_converter.append(_best_time(
                    "ascii.read(table1.name, format='basic', guess=False, use_fast_converter=True)",
                    'from __main__ import ascii, table1', number, repeat))

            times_pandas.append(_best_time(
                "pandas.read_csv(table1.name, sep=' ', header=0)",
                'from __main__ import table1, pandas', number, repeat))

            if genfromtxt:
                times_genfromtxt.append(_best_time(
                    "np.genfromtxt(table1.name, names=True)",
                    'from __main__ import table1, np', number, repeat))

    # Start a fresh figure so consecutive cases are not drawn on top of
    # each other on the same axes.
    plt.figure()
    plt.loglog(n_rows, times_slow, '-ob', label='io.ascii Python')
    plt.loglog(n_rows, times_fast, '-or', label='io.ascii Fast-c')
    if n_floats > 0:
        plt.loglog(n_rows, times_fast_converter, '-oy', label='Fast converter')
    plt.loglog(n_rows, times_pandas, '-oc', label='Pandas')
    if genfromtxt:
        plt.loglog(n_rows, times_genfromtxt, '-om', label='np.genfromtxt', alpha=0.5)
    plt.grid()
    plt.legend(loc='best')
    plt.title('n_floats={} n_ints={} n_strs={} float_format={}'.format(n_floats, n_ints, n_strs, float_format))
    plt.xlabel('Number of rows')
    plt.ylabel('Time (sec)')

    # Ratios are reported for the largest table only (last entry).
    print('Fast-C to Python speed ratio: {:.2f} : 1'.format(times_slow[-1] / times_fast[-1]))
    if n_floats > 0:
        print('Fast-C with converter to Fast-C speed ratio: {:.2f} : 1'.format(times_fast[-1] / times_fast_converter[-1]))
        print('Pandas to Fast-C with converter speed ratio: {:.2f} : 1'.format(times_fast_converter[-1] / times_pandas[-1]))
    else:
        print('Pandas to Fast-C speed ratio: {:.2f} : 1'.format(times_fast[-1] / times_pandas[-1]))


plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None)

plot_case(n_floats=10, n_ints=10, n_strs=10, float_format=None)

plot_case(n_floats=10, n_ints=10, n_strs=10, float_format='%.4f')

plot_case(n_floats=10, n_ints=0, n_strs=0, float_format='%.4f')

plot_case(n_floats=0, n_ints=0, n_strs=10)

plot_case(n_floats=0, n_ints=0, n_strs=10, str_val="'asdf asdfa'", genfromtxt=False)

plot_case(n_floats=0, n_ints=0, n_strs=10, str_val="random")

plot_case(n_floats=0, n_ints=10, n_strs=0)