%load_ext watermark %watermark -d -v -a 'Sebastian Raschka' -p numpy,pandas import pandas as pd import numpy as np df = pd.DataFrame() for col in ('a', 'b', 'c', 'd'): df[col] = pd.Series(range(1000), index=range(1000)) df.tail() df.loc[:, ['a', 'c', 'd']].sum(axis=0) # 1 %timeit -n 1000 -r 5 df.loc[:, ['a', 'c', 'd']].sum(axis=0) # 2 %timeit -n 1000 -r 5 df[['a', 'c', 'd']].sum(axis=0) # 3 %timeit -n 1000 -r 5 df[['a', 'c', 'd']].values.sum(axis=0) [df[col].values.sum(axis=0) for col in ('a', 'c', 'd')] # 4 %timeit -n 1000 -r 5 [df[col].values.sum(axis=0) for col in ('a', 'c', 'd')] from numpy import einsum [einsum('i->', df[col].values) for col in ('a', 'c', 'd')] # 5 %timeit -n 1000 -r 5 [einsum('i->', df[col].values) for col in ('a', 'c', 'd')] import timeit import random from numpy import einsum import pandas as pd def run_loc_sum(df): return df.loc[:, ['a', 'c', 'd']].sum(axis=0) def run_einsum(df): return [einsum('i->', df[col].values) for col in ('a', 'c', 'd')] orders = [10**i for i in range(4, 8)] loc_res = [] einsum_res = [] for n in orders: df = pd.DataFrame() for col in ('a', 'b', 'c', 'd'): df[col] = pd.Series(range(n), index=range(n)) print('n=%s (%s of %s)' %(n, orders.index(n)+1, len(orders))) loc_res.append(min(timeit.Timer('run_loc_sum(df)' , 'from __main__ import run_loc_sum, df').repeat(repeat=5, number=1))) einsum_res.append(min(timeit.Timer('run_einsum(df)' , 'from __main__ import run_einsum, df').repeat(repeat=5, number=1))) print('finished') %matplotlib inline from matplotlib import pyplot as plt def plot_1(): fig = plt.figure(figsize=(12,6)) plt.plot(orders, loc_res, label="df.loc[:, ['a', 'c', 'd']].sum(axis=0)", lw=2, alpha=0.6) plt.plot(orders,einsum_res, label="[einsum('i->', df[col].values) for col in ('a', 'c', 'd')]", lw=2, alpha=0.6) plt.title('Pandas Column Sums', fontsize=20) plt.xlim([min(orders), max(orders)]) plt.grid() #plt.xscale('log') plt.ticklabel_format(style='plain', axis='x') plt.legend(loc='upper left', fontsize=14) plt.xlabel('Number of rows', fontsize=16) plt.ylabel('time in seconds', fontsize=16) plt.tight_layout() plt.show() plot_1() import timeit import random from numpy import einsum import pandas as pd def run_loc_sum(df, n): return df.loc[:, 0:n-1].sum(axis=0) def run_einsum(df, n): return [einsum('i->', df[col].values) for col in range(0,n-1)] orders = [10**i for i in range(2, 5)] loc_res = [] einsum_res = [] for n in orders: df = pd.DataFrame() for col in range(n): df[col] = pd.Series(range(1000), index=range(1000)) print('n=%s (%s of %s)' %(n, orders.index(n)+1, len(orders))) loc_res.append(min(timeit.Timer('run_loc_sum(df, n)' , 'from __main__ import run_loc_sum, df, n').repeat(repeat=5, number=1))) einsum_res.append(min(timeit.Timer('run_einsum(df, n)' , 'from __main__ import run_einsum, df, n').repeat(repeat=5, number=1))) print('finished') from matplotlib import pyplot as plt def plot_2(): fig = plt.figure(figsize=(12,6)) plt.plot(orders, loc_res, label="df.loc[:, 0:n-1].sum(axis=0)", lw=2, alpha=0.6) plt.plot(orders,einsum_res, label="[einsum('i->', df[col].values) for col in range(0,n-1)]", lw=2, alpha=0.6) plt.title('Pandas Column Sums', fontsize=20) plt.xlim([min(orders), max(orders)]) plt.grid() #plt.xscale('log') plt.ticklabel_format(style='plain', axis='x') plt.legend(loc='upper left', fontsize=14) plt.xlabel('Number of columns', fontsize=16) plt.ylabel('time in seconds', fontsize=16) plt.tight_layout() plt.show() plot_2()