import timeit n = 10000 def test_format(n): return ['{}'.format(i) for i in range(n)] def test_binaryop(n): return ['%s' %i for i in range(n)] %timeit test_format(n) %timeit test_binaryop(n) funcs = ['test_format', 'test_binaryop'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: for f in funcs: times_n[f].append(min(timeit.Timer('%s(n)' %f, 'from __main__ import %s, n' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('test_format', '.format() method'), ('test_binaryop', 'binary operator %')] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different string formatting methods') max_perf = max( f/b for f,b in zip(times_n['test_format'], times_n['test_binaryop']) ) min_perf = min( f/b for f,b in zip(times_n['test_format'], times_n['test_binaryop']) ) ftext = 'The binary op. % is {:.2f}x to {:.2f}x faster than .format()'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() import timeit def reverse_join(my_str): return ''.join(reversed(my_str)) def reverse_slizing(my_str): return my_str[::-1] test_str = 'abcdefg' # Test to show that both work a = reverse_join(test_str) b = reverse_slizing(test_str) assert(a == b and a == 'gfedcba') %timeit reverse_join(test_str) %timeit reverse_slizing(test_str) funcs = ['reverse_join', 'reverse_slizing'] orders_n = [10**n for n in range(1, 6)] test_strings = (test_str*n for n in orders_n) times_n = {f:[] for f in funcs} for st,n in zip(test_strings, orders_n): for f in funcs: times_n[f].append(min(timeit.Timer('%s(st)' %f, 'from __main__ import %s, st' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('reverse_join', '"".join(reversed(my_str))'), ('reverse_slizing', 'my_str[::-1]')] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot([n*len(test_str) for n in orders_n], times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different string reversing methods') max_perf = max( j/s for j,s in zip(times_n['reverse_join'], times_n['reverse_slizing']) ) min_perf = min( j/s for j,s in zip(times_n['reverse_join'], times_n['reverse_slizing']) ) ftext = 'my_str[::-1] is {:.2f}x to {:.2f}x faster than "".join(reversed(my_str))'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() import timeit def string_add(in_chars): new_str = '' for char in in_chars: new_str += char return new_str def string_join(in_chars): return ''.join(in_chars) test_chars = ['a', 'b', 'c', 'd', 'e', 'f'] %timeit string_add(test_chars) %timeit string_join(test_chars) funcs = ['string_add', 'string_join'] orders_n = [10**n for n in range(1, 6)] test_chars_n = (test_chars*n for n in orders_n) times_n = {f:[] for f in funcs} for st,n in zip(test_chars_n, orders_n): for f in funcs: times_n[f].append(min(timeit.Timer('%s(st)' %f, 'from __main__ import %s, st' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('string_add', 'new_str += char'), ('string_join', '"".join(chars)')] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot([len(test_chars)*n for n in orders_n], times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') #plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different string concatenation methods') max_perf = max( a/j for a,j in zip(times_n['string_add'], times_n['string_join']) ) min_perf = min( a/j for a,j in zip(times_n['string_add'], times_n['string_join']) ) ftext = '"".join(chars) is {:.2f}x to {:.2f}x faster than new_str += char'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() import timeit n = 1000 def plus_operator(n): my_str = 'a' for i in range(n): my_str = my_str + str(1) + str(2) return my_str def format_method(n): my_str = 'a' for i in range(n): my_str = '{}{}{}'.format(my_str,1,2) def binary_operator(n): my_str = 'a' for i in range(n): my_str = '%s%s%s' %(my_str,1,2) return my_str %timeit plus_operator(n) %timeit format_method(n) %timeit binary_operator(n) funcs = ['plus_operator', 'format_method', 'binary_operator'] orders_n = [10**n for n in range(1, 5)] times_n = {f:[] for f in funcs} for n in orders_n: for f in funcs: times_n[f].append(min(timeit.Timer('%s(n)' %f, 'from __main__ import %s, n' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('plus_operator', 'my_str + str(1) + str(2)'), ('format_method', '"{}{}{}".format(my_str,1,2)'), ('binary_operator', '"%s%s%s" %(my_str,1,2)'), ] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') #plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different string assembly methods') max_perf = max( p/b for p,b in zip(times_n['plus_operator'], times_n['binary_operator']) ) min_perf = min( p/b for p,b in zip(times_n['plus_operator'], times_n['binary_operator']) ) ftext = '"%s%s%s" %(my_str,1,2) is {:.2f}x to'\ '{:.2f}x faster than my_str + str(1) + str(2)'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() import timeit def string_is_int(a_str): try: int(a_str) return True except ValueError: return False an_int = '123' no_int = '123abc' %timeit string_is_int(an_int) %timeit string_is_int(no_int) %timeit an_int.isdigit() %timeit no_int.isdigit() funcs = ['string_is_int', 'isdigit'] t1 = '123' t2 = '123abc' isdigit_method = [] string_is_int_method = [] for t in [t1,t2]: string_is_int_method.append(min(timeit.Timer('string_is_int(t)', 'from __main__ import string_is_int, t') .repeat(repeat=3, number=1000000))) isdigit_method.append(min(timeit.Timer('t.isdigit()', 'from __main__ import t') .repeat(repeat=3, number=1000000))) %pylab inline N = len(isdigit_method) ind = np.arange(N) # the x locations for the groups width = 0.25 # the width of the bars fig, ax = plt.subplots() plt.bar(ind, [i for i in string_is_int_method], width, alpha=0.5, color='g', label='string_is_int(a_str)') plt.bar(ind + width, [i for i in isdigit_method], width, alpha=0.5, color='b', label='a_str.isdigit()') ax.set_ylabel('time in microseconds') ax.set_title('Time to check if a string is an integer') ax.set_xticks(ind + width) ax.set_xticklabels(['"%s"' %t for t in [t1, t2]]) plt.xlabel('test strings') plt.xlim(-0.1,1.6) #plt.ylim(0,15) plt.legend(loc='upper left') plt.show() import timeit def string_is_number(a_str): try: float(a_str) return True except ValueError: return False a_float = '1.234' no_float = '123abc' a_float.replace('.','',1).isdigit() no_float.replace('.','',1).isdigit() %timeit string_is_number(an_int) %timeit string_is_number(no_int) %timeit a_float.replace('.','',1).isdigit() %timeit no_float.replace('.','',1).isdigit() funcs = ["string_is_number", "replace('.','',1).isdigit()"] t1 = '1.234' t2 = '123abc' isdigit_method = [] string_is_number_method = [] for t in [t1,t2]: string_is_number_method.append(min(timeit.Timer('string_is_number(t)', 'from __main__ import string_is_number, t') .repeat(repeat=3, number=1000000))) isdigit_method.append(min(timeit.Timer("t.replace('.','',1).isdigit()", 'from __main__ import t') .repeat(repeat=3, number=1000000))) %pylab inline N = len(isdigit_method) ind = np.arange(N) # the x locations for the groups width = 0.25 # the width of the bars fig, ax = plt.subplots() plt.bar(ind , [i for i in isdigit_method], width, alpha=0.5, color='b', label="a_str.replace('.','',1).isdigit()") plt.bar(ind + width, [i for i in string_is_number_method], width, alpha=0.5, color='g', label='string_is_number(a_str)') ax.set_ylabel('time in microseconds') ax.set_title('Time to check if a string is a number') ax.set_xticks(ind + width) ax.set_xticklabels(['"%s"' %t for t in [t2, t1]]) plt.xlabel('test strings') plt.xlim(-0.1,1.6) #plt.ylim(0,15) plt.legend(loc='upper left') plt.show() import timeit import copy def reverse_func(my_list): return copy.deepcopy(my_list).reverse() def reversed_func(my_list): return list(reversed(my_list)) def reverse_slizing(my_list): return my_list[::-1] n = 10 test_list = list([i for i in range(n)]) %timeit reverse_func(test_list) %timeit reversed_func(test_list) %timeit reverse_slizing(test_list) funcs = ['reverse_func', 'reversed_func', 'reverse_slizing'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: test_list = list([i for i in range(n)]) for f in funcs: times_n[f].append(min(timeit.Timer('%s(test_list)' %f, 'from __main__ import %s, test_list' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('reverse_func', 'copy.deepcopy(my_list).reverse()'), ('reversed_func', 'list(reversed(my_list))'), ('reverse_slizing', 'my_list[::-1]'), ] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') #plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different list reversing approaches') max_perf = max( f/s for f,s in zip(times_n['reverse_func'], times_n['reverse_slizing']) ) min_perf = min( f/s for f,s in zip(times_n['reverse_func'], times_n['reverse_slizing']) ) ftext = 'my_list[::-1] is {:.2f}x to '\ '{:.2f}x faster than copy.deepcopy(my_list).reverse()'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() import timeit def cond_loop(n): even_nums = [] for i in range(n): if i % 2 == 0: even_nums.append(i) return even_nums def list_compr(n): even_nums = [i for i in range(n) if i % 2 == 0] return even_nums def filter_func(n): even_nums = list(filter((lambda x: x % 2 != 0), range(n))) return even_nums %timeit cond_loop(n) %timeit list_compr(n) %timeit filter_func(n) funcs = ['cond_loop', 'list_compr', 'filter_func'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: test_list = list([i for i in range(n)]) for f in funcs: times_n[f].append(min(timeit.Timer('%s(n)' %f, 'from __main__ import %s, n' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('cond_loop', 'explicit for-loop'), ('list_compr', 'list comprehension'), ('filter_func', 'lambda function'), ] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') #plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different conditional list creation methods') max_perf = max( f/c for f,c in zip(times_n['filter_func'], times_n['cond_loop']) ) min_perf = min( f/c for f,c in zip(times_n['filter_func'], times_n['cond_loop']) ) ftext = 'the list comprehension is {:.2f}x to '\ '{:.2f}x faster than the lambda function'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() import random import timeit from collections import defaultdict def add_element_check1(elements): """if ele not in dict (v1)""" d = dict() for e in elements: if e not in d: d[e] = 1 else: d[e] += 1 return d def add_element_check2(elements): """if ele not in dict (v2)""" d = dict() for e in elements: if e not in d: d[e] = 0 d[e] += 1 return d def add_element_except(elements): """try-except""" d = dict() for e in elements: try: d[e] += 1 except KeyError: d[e] = 1 return d def add_element_defaultdict(elements): """defaultdict""" d = defaultdict(int) for e in elements: d[e] += 1 return d def add_element_get(elements): """.get() method""" d = dict() for e in elements: d[e] = d.get(e, 1) + 1 return d random.seed(123) print('Results for 100 integers in range 1-10') rand_ints = [random.randrange(1, 10) for i in range(100)] %timeit add_element_check1(rand_ints) %timeit add_element_check2(rand_ints) %timeit add_element_except(rand_ints) %timeit add_element_defaultdict(rand_ints) %timeit add_element_get(rand_ints) print('\nResults for 1000 integers in range 1-5') rand_ints = [random.randrange(1, 5) for i in range(1000)] %timeit add_element_check1(rand_ints) %timeit add_element_check2(rand_ints) %timeit add_element_except(rand_ints) %timeit add_element_defaultdict(rand_ints) %timeit add_element_get(rand_ints) print('\nResults for 1000 integers in range 1-1000') rand_ints = [random.randrange(1, 1000) for i in range(1000)] %timeit add_element_check1(rand_ints) %timeit add_element_check2(rand_ints) %timeit add_element_except(rand_ints) %timeit add_element_defaultdict(rand_ints) %timeit add_element_get(rand_ints) funcs = ['add_element_check1', 'add_element_check2', 'add_element_except', 'add_element_defaultdict', 'add_element_get'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: elements = [random.randrange(1, 100) for i in range(n)] for f in funcs: times_n[f].append(min(timeit.Timer('%s(elements)' %f, 'from __main__ import %s, elements' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('add_element_check1', 'if ele not in dict (v1)'), ('add_element_check2', 'if ele not in dict (v2)'), ('add_element_except', 'try-except'), ('add_element_defaultdict', 'defaultdict'), ('add_element_get', '.get() method') ] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,10)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') #plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different methods to count elements in a dictionary') plt.show() import timeit n = 1000 def set_loop(n): a_set = set() for i in range(n): if i % 3 == 0: a_set.add(i) return a_set def set_compr(n): return {i for i in range(n) if i % 3 == 0} %timeit set_loop(n) %timeit set_compr(n) def list_loop(n): a_list = list() for i in range(n): if i % 3 == 0: a_list.append(i) return a_list def list_compr(n): return [i for i in range(n) if i % 3 == 0] %timeit list_loop(n) %timeit list_compr(n) funcs = ['list_loop', 'list_compr'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: for f in funcs: times_n[f].append(min(timeit.Timer('%s(n)' %f, 'from __main__ import %s, n' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('list_loop', 'explicit for-loop'), ('list_compr', 'list comprehension')] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') #plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of explicit for-loops vs. list comprehensions') max_perf = max( l/c for l,c in zip(times_n['list_loop'], times_n['list_compr']) ) min_perf = min( l/c for l,c in zip(times_n['list_loop'], times_n['list_compr']) ) ftext = 'the list comprehension is {:.2f}x to '\ '{:.2f}x faster than the explicit for-loop'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() def dict_loop(n): a_dict = dict() for i in range(n): if i % 3 == 0: a_dict[i] = i return a_dict def dict_compr(n): return {i:i for i in range(n) if i % 3 == 0} %timeit dict_loop(n) %timeit dict_compr(n) import subprocess def subprocess_findcopy(path, search_str, dest): query = 'find %s -name "%s" -exec cp {} %s \;' %(path, search_str, dest) subprocess.call(query, shell=True) return import shutil import os import fnmatch def walk_findcopy(path, search_str, dest): for path, subdirs, files in os.walk(path): for name in fnmatch.filter(files, search_str): try: shutil.copy(os.path.join(path,name), dest) except NameError: pass return import timeit def findcopy_timeit(inpath, outpath, search_str): shutil.rmtree(outpath) os.mkdir(outpath) print(50*'#') print('subprocsess call') %timeit subprocess_findcopy(inpath, search_str, outpath) print("copied %s files" %len(os.listdir(outpath))) shutil.rmtree(outpath) os.mkdir(outpath) print('\nos.walk approach') %timeit walk_findcopy(inpath, search_str, outpath) print("copied %s files" %len(os.listdir(outpath))) print(50*'#') print('small tree') inpath = '/Users/sebastian/Desktop/testdir_in' outpath = '/Users/sebastian/Desktop/testdir_out' search_str = '*.png' findcopy_timeit(inpath, outpath, search_str) print('larger tree') inpath = '/Users/sebastian/Dropbox' outpath = '/Users/sebastian/Desktop/testdir_out' search_str = '*.csv' findcopy_timeit(inpath, outpath, search_str) import numpy as np # 1st column, e.g., A[:,0,np.newaxis] def colvec_method1(A): for col in A.T: colvec = row[:,np.newaxis] yield colvec # 1st column, e.g., A[:,0:1] def colvec_method2(A): for idx in range(A.shape[1]): colvec = A[:,idx:idx+1] yield colvec # 1st column, e.g., A[:,0].reshape(-1,1) def colvec_method3(A): for idx in range(A.shape[1]): colvec = A[:,idx].reshape(-1,1) yield colvec # 1st column, e.g., np.vstack(A[:,0] def colvec_method4(A): for idx in range(A.shape[1]): colvec = np.vstack(A[:,idx]) yield colvec # 1st column, e.g., np.row_stack(A[:,0]) def colvec_method5(A): for idx in range(A.shape[1]): colvec = np.row_stack(A[:,idx]) yield colvec # 1st column, e.g., np.column_stack((A[:,0],)) def colvec_method6(A): for idx in range(A.shape[1]): colvec = np.column_stack((A[:,idx],)) yield colvec # 1st column, e.g., A[:,[0]] def colvec_method7(A): for idx in range(A.shape[1]): colvec = A[:,[idx]] yield colvec def test_method(method, A): for i in method(A): assert i.shape == (A.shape[0],1), "{}, {}".format(i.shape, A.shape[0],1) import timeit A = np.random.random((300, 3)) for method in [ colvec_method1, colvec_method2, colvec_method3, colvec_method4, colvec_method5, colvec_method6, colvec_method7]: print('\nTest:', method.__name__) %timeit test_method(colvec_method2, A) from numpy import sum as np_sum import timeit samples = list(range(1000000)) %timeit(sum(samples)) %timeit(np_sum(samples)) funcs = ['sum', 'np_sum'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: samples = list(range(n)) times_n['sum'].append(min(timeit.Timer('sum(samples)', 'from __main__ import samples') .repeat(repeat=3, number=1000))) times_n['np_sum'].append(min(timeit.Timer('np_sum(samples)', 'from __main__ import np_sum, samples') .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('sum', 'in-built sum() function'), ('np_sum', 'numpy.sum() function')] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of explicit for-loops vs. list comprehensions') max_perf = max( n/i for i,n in zip(times_n['sum'], times_n['np_sum']) ) min_perf = min( n/i for i,n in zip(times_n['sum'], times_n['np_sum']) ) ftext = 'the in-built sum() is {:.2f}x to '\ '{:.2f}x faster than the numpy.sum()'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() from numpy import arange as np_arange n = 1000000 def loop_range(n): for i in range(n): pass return def loop_arange(n): for i in np_arange(n): pass return %timeit(loop_range(n)) %timeit(loop_arange(n)) funcs = ['loop_range', 'loop_arange'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: for f in funcs: times_n[f].append(min(timeit.Timer('%s(n)' %f, 'from __main__ import %s, n' %f) .repeat(repeat=3, number=1000))) import matplotlib.pyplot as plt labels = [('loop_range', 'in-built range()'), ('loop_arange', 'numpy.arange()')] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of explicit for-loops vs. list comprehensions') max_perf = max( a/r for r,a in zip(times_n['loop_range'], times_n['loop_arange']) ) min_perf = min( a/r for r,a in zip(times_n['loop_range'], times_n['loop_arange']) ) ftext = 'the in-built range() is {:.2f}x to '\ '{:.2f}x faster than numpy.arange()'\ .format(min_perf, max_perf) plt.figtext(.14,.75, ftext, fontsize=11, ha='left') plt.show() # The statistics module has been added to # the standard library in Python 3.4 import timeit import statistics as stats import numpy as np def calc_mean(samples): return sum(samples)/len(samples) def np_mean(samples): return np.mean(samples) def np_mean_ary(np_array): return np.mean(np_array) def st_mean(samples): return stats.mean(samples) n = 1000000 samples = list(range(n)) samples_array = np.arange(n) assert(st_mean(samples) == np_mean(samples) == calc_mean(samples) == np_mean_ary(samples_array)) %timeit(calc_mean(samples)) %timeit(np_mean(samples)) %timeit(np_mean_ary(samples_array)) %timeit(st_mean(samples)) funcs = ['st_mean', 'np_mean', 'calc_mean', 'np_mean_ary'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: samples = list(range(n)) for f in funcs: if f == 'np_mean_ary': samples = np.arange(n) times_n[f].append(min(timeit.Timer('%s(samples)' %f, 'from __main__ import %s, samples' %f) .repeat(repeat=3, number=1000))) %pylab inline import matplotlib.pyplot as plt labels = [('st_mean', 'statistics.mean()'), ('np_mean', 'numpy.mean() on list'), ('np_mean_ary', 'numpy.mean() on array'), ('calc_mean', 'sum(samples)/len(samples)') ] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') plt.title('Performance of different approaches for calculating sample means') max_perf = max( s/c for s,c in zip(times_n['st_mean'], times_n['np_mean_ary']) ) min_perf = min( s/c for s,c in zip(times_n['st_mean'], times_n['np_mean_ary']) ) ftext = 'using numpy.mean() on np.arrays is {:.2f}x to '\ '{:.2f}x faster than statistics.mean() on lists'\ .format(min_perf, max_perf) plt.figtext(.14,.15, ftext, fontsize=11, ha='left') plt.show() def py_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ x_avg = sum(x)/len(x) y_avg = sum(y)/len(y) var_x = 0 cov_xy = 0 for x_i, y_i in zip(x,y): temp = (x_i - x_avg) var_x += temp**2 cov_xy += temp*(y_i - y_avg) slope = cov_xy / var_x y_interc = y_avg - slope*x_avg return (slope, y_interc) %load_ext cythonmagic %%cython def cy_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ cdef double x_avg, y_avg, temp, var_x, cov_xy, slope, y_interc, x_i, y_i x_avg = sum(x)/len(x) y_avg = sum(y)/len(y) var_x = 0 cov_xy = 0 for x_i, y_i in zip(x,y): temp = (x_i - x_avg) var_x += temp**2 cov_xy += temp*(y_i - y_avg) slope = cov_xy / var_x y_interc = y_avg - slope*x_avg return (slope, y_interc) %pylab inline from matplotlib import pyplot as plt import timeit import random random.seed(12345) n = 500 x = [x_i*random.randrange(8,12)/10 for x_i in range(n)] y = [y_i*random.randrange(10,14)/10 for y_i in range(n)] slope, intercept = cy_lstsqr(x, y) line_x = [round(min(x)) - 1, round(max(x)) + 1] line_y = [slope*x_i + intercept for x_i in line_x] plt.figure(figsize=(8,8)) plt.scatter(x,y) plt.plot(line_x, line_y, color='red', lw='2') plt.ylabel('y') plt.xlabel('x') plt.title('Linear regression via least squares fit') ftext = 'y = ax + b = {:.3f} + {:.3f}x'\ .format(slope, intercept) plt.figtext(.15,.8, ftext, fontsize=11, ha='left') plt.show() import timeit import random random.seed(12345) funcs = ['py_lstsqr', 'cy_lstsqr'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: x = [x_i*random.randrange(8,12)/10 for x_i in range(n)] y = [y_i*random.randrange(10,14)/10 for y_i in range(n)] for f in funcs: times_n[f].append(min(timeit.Timer('%s(x,y)' %f, 'from __main__ import %s, x, y' %f) .repeat(repeat=3, number=1000))) import matplotlib.pyplot as plt labels = [('py_lstsqr', 'regular Python (CPython)'), ('cy_lstsqr', 'Cython implementation')] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') max_perf = max( py/nu for py,nu in zip(times_n['py_lstsqr'], times_n['cy_lstsqr']) ) min_perf = min( py/nu for py,nu in zip(times_n['py_lstsqr'], times_n['cy_lstsqr']) ) ftext = 'Using Cython is {:.2f}x to '\ '{:.2f}x faster than regular (C)Python'\ .format(min_perf, max_perf) plt.figtext(.15,.8, ftext, fontsize=11, ha='left') plt.title('Performance of least square fit implementations in Cython and (C)Python') plt.show() import numpy as np import scipy.stats from numba import jit %load_ext cythonmagic def py_mat_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ X = np.vstack([x, np.ones(len(x))]).T return (np.linalg.inv(X.T.dot(X)).dot(X.T)).dot(y) @jit def numba_mat_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ X = np.vstack([x, np.ones(len(x))]).T return (np.linalg.inv(X.T.dot(X)).dot(X.T)).dot(y) %%cython def cy_mat_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ X = np.vstack([x, np.ones(len(x))]).T return (np.linalg.inv(X.T.dot(X)).dot(X.T)).dot(y) def py_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ x_avg = sum(x)/len(x) y_avg = sum(y)/len(y) var_x = 0 cov_xy = 0 for x_i, y_i in zip(x,y): temp = (x_i - x_avg) var_x += temp**2 cov_xy += temp*(y_i - y_avg) slope = cov_xy / var_x y_interc = y_avg - slope*x_avg return (slope, y_interc) @jit def numba_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ x_avg = sum(x)/len(x) y_avg = sum(y)/len(y) var_x = 0 cov_xy = 0 for x_i, y_i in zip(x,y): temp = (x_i - x_avg) var_x += temp**2 cov_xy += temp*(y_i - y_avg) slope = cov_xy / var_x y_interc = y_avg - slope*x_avg return (slope, y_interc) %%cython def cy_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ cdef double x_avg, y_avg, temp, var_x, cov_xy, slope, y_interc, x_i, y_i x_avg = sum(x)/len(x) y_avg = sum(y)/len(y) var_x = 0 cov_xy = 0 for x_i, y_i in zip(x,y): temp = (x_i - x_avg) var_x += temp**2 cov_xy += temp*(y_i - y_avg) slope = cov_xy / var_x y_interc = y_avg - slope*x_avg return (slope, y_interc) def numpy_lstsqr(x, y): """ Computes the least-squares solution to a linear matrix equation. """ X = np.vstack([x, np.ones(len(x))]).T return np.linalg.lstsq(X,y)[0] def scipy_lstsqr(x,y): """ Computes the least-squares solution to a linear matrix equation. """ return scipy.stats.linregress(x, y)[0:2] import random random.seed(12345) n = 500 x = [x_i*random.randrange(8,12)/10 for x_i in range(n)] y = [y_i*random.randrange(10,14)/10 for y_i in range(n)] np.testing.assert_array_almost_equal( py_lstsqr(x, y), py_mat_lstsqr(x, y), decimal=6) np.testing.assert_array_almost_equal( numpy_lstsqr(x,y), py_lstsqr(x, y), decimal=6) np.testing.assert_array_almost_equal( scipy_lstsqr(x,y), py_lstsqr(x, y), decimal=6) print('ok') %pylab inline from matplotlib import pyplot as plt slope, intercept = py_mat_lstsqr(x, y) line_x = [round(min(x)) - 1, round(max(x)) + 1] line_y = [slope*x_i + intercept for x_i in line_x] plt.figure(figsize=(7,6)) plt.scatter(x,y) plt.plot(line_x, line_y, color='red', lw='2') plt.ylabel('y') plt.xlabel('x') plt.title('Linear regression via least squares fit') ftext = 'y = ax + b = {:.3f} + {:.3f}x'\ .format(slope, intercept) plt.figtext(.15,.8, ftext, fontsize=11, ha='left') plt.show() import timeit import random random.seed(12345) funcs = ['py_mat_lstsqr', 'numba_mat_lstsqr', 'cy_mat_lstsqr', 'py_lstsqr', 'numba_lstsqr', 'cy_lstsqr', 'numpy_lstsqr', 'scipy_lstsqr'] orders_n = [10**n for n in range(1, 6)] times_n = {f:[] for f in funcs} for n in orders_n: x = np.asarray([x_i*np.random.randint(8,12)/10 for x_i in range(n)]) y = np.asarray([y_i*np.random.randint(10,14)/10 for y_i in range(n)]) for f in funcs: times_n[f].append(min(timeit.Timer('%s(x,y)' %f, 'from __main__ import %s, x, y' %f) .repeat(repeat=3, number=1000))) import matplotlib.pyplot as plt labels = [('py_mat_lstsqr', 'matrix equation in reg. (C)Python & NumPy'), ('numba_mat_lstsqr', 'matrix equation in Numba'), ('cy_mat_lstsqr', 'matrix equation in Cython & NumPy'), ('py_lstsqr', '"classic" least squares in reg. (C)Python'), ('numba_lstsqr', '"classic" least squares in Numba'), ('cy_lstsqr', '"classic" least squares in Cython'), ('numpy_lstsqr', 'least squares via np.linalg.lstsq()'), ('scipy_lstsqr', 'least_squares via scipy.stats.linregress()'),] matplotlib.rcParams.update({'font.size': 12}) fig = plt.figure(figsize=(10,8)) for lb in labels: plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3) plt.xlabel('sample size n') plt.ylabel('time per computation in milliseconds [ms]') plt.xlim([1,max(orders_n) + max(orders_n) * 10]) plt.legend(loc=2) plt.grid() plt.xscale('log') plt.yscale('log') max_perf = max( py/nu for py,nu in zip(times_n['py_lstsqr'], times_n['cy_lstsqr']) ) min_perf = min( py/nu for py,nu in zip(times_n['py_lstsqr'], times_n['cy_lstsqr']) ) ftext = 'Using Cython is {:.2f}x to '\ '{:.2f}x faster than regular (C)Python'\ .format(min_perf, max_perf) plt.figtext(.14,.15, ftext, fontsize=11, ha='left') plt.title('Performance of least square fit implementations') plt.show()