import timeit

# Problem size used by the quick %timeit comparison of the formatting functions.
n = 10000

def test_format(n):
    """Return the strings for 0..n-1, each built with str.format()."""
    return ['{}'.format(i) for i in range(n)]

def test_binaryop(n):
    """Return the strings for 0..n-1, each built with the % operator."""
    return ['%s' %i for i in range(n)]

# One-off timing at the current module-level n (IPython line magics).
%timeit test_format(n)
%timeit test_binaryop(n)

funcs = ['test_format', 'test_binaryop']

# Input sizes 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

# The loop rebinds the global n on purpose: the Timer setup string imports
# n from __main__, so each pass times the function at the current size.
for n in orders_n:
    for f in funcs:
        # Keep the best of 3 repeats. Each repeat is the total number of
        # seconds for 1000 calls, i.e. numerically the milliseconds per call.
        times_n[f].append(min(timeit.Timer('%s(n)' %f, 
                      'from __main__ import %s, n' %f)
                              .repeat(repeat=3, number=1000)))

# Log-log plot of the best timing per input size for both formatting styles.
%pylab inline

import matplotlib.pyplot as plt

# (key into times_n, legend text)
labels = [('test_format', '.format() method'), 
          ('test_binaryop', 'binary operator %')] 

# NOTE(review): `matplotlib` is never imported explicitly in this file;
# presumably the %pylab magic above provides the name -- confirm.
matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different string formatting methods')
# Per-size ratio .format() time / % time; its min and max give the
# speed-up range quoted in the caption.
max_perf = max( f/b for f,b in zip(times_n['test_format'],
                                   times_n['test_binaryop']) )
min_perf = min( f/b for f,b in zip(times_n['test_format'],
                                   times_n['test_binaryop']) )
    
ftext = 'The binary op. % is {:.2f}x to {:.2f}x faster than .format()'\
        .format(min_perf, max_perf)    
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')


plt.show()

import timeit

def reverse_join(my_str):
    """Return my_str reversed by joining the items of reversed(my_str)."""
    return ''.join(reversed(my_str))
    
def reverse_slizing(my_str):
    """Return my_str reversed with a step -1 slice."""
    return my_str[::-1]

# Base string; the benchmark below repeats it to build longer inputs.
test_str = 'abcdefg'

# Test to show that both work
a = reverse_join(test_str)
b = reverse_slizing(test_str)
assert(a == b and a == 'gfedcba')

%timeit reverse_join(test_str)
%timeit reverse_slizing(test_str)

funcs = ['reverse_join', 'reverse_slizing']

orders_n = [10**n for n in range(1, 6)]
# Lazily yields test_str repeated 10, 100, ..., 100000 times.
test_strings = (test_str*n for n in orders_n)
times_n = {f:[] for f in funcs}

# st must be a module-level name: the Timer setup imports it from __main__.
for st,n in zip(test_strings, orders_n):
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(st)' %f, 
                      'from __main__ import %s, st' %f)
                              .repeat(repeat=3, number=1000)))

# Log-log plot of the best timing per string length for both reversal styles.
%pylab inline

import matplotlib.pyplot as plt

# (key into times_n, legend text)
labels = [('reverse_join', '"".join(reversed(my_str))'), 
          ('reverse_slizing', 'my_str[::-1]')] 

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    # x is the real string length: repeat count times len(test_str).
    plt.plot([n*len(test_str) for n in orders_n], 
             times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different string reversing methods')
# Per-size ratio join time / slice time; min and max feed the caption.
max_perf = max( j/s for j,s in zip(times_n['reverse_join'],
                                   times_n['reverse_slizing']) )
min_perf = min( j/s for j,s in zip(times_n['reverse_join'],
                                   times_n['reverse_slizing']) )
    
ftext = 'my_str[::-1] is {:.2f}x to {:.2f}x faster than "".join(reversed(my_str))'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')
plt.show()

import timeit

def string_add(in_chars):
    """Concatenate the items of in_chars one at a time with +=."""
    new_str = ''
    for char in in_chars:
        new_str += char
    return new_str

def string_join(in_chars):
    """Concatenate the items of in_chars with a single str.join() call."""
    return ''.join(in_chars)

# Base list of single characters; repeated below to build longer inputs.
test_chars = ['a', 'b', 'c', 'd', 'e', 'f']

%timeit string_add(test_chars)
%timeit string_join(test_chars)

funcs = ['string_add', 'string_join']

orders_n = [10**n for n in range(1, 6)]
# Lazily yields test_chars repeated 10, 100, ..., 100000 times.
test_chars_n = (test_chars*n for n in orders_n)
times_n = {f:[] for f in funcs}

# st must be a module-level name: the Timer setup imports it from __main__.
for st,n in zip(test_chars_n, orders_n):
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(st)' %f, 
                      'from __main__ import %s, st' %f)
                              .repeat(repeat=3, number=1000)))

# Log-log plot of the best timing per input length for += versus join.
%pylab inline

import matplotlib.pyplot as plt

# (key into times_n, legend text)
labels = [('string_add', 'new_str += char'), 
          ('string_join', '"".join(chars)')] 

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    # x is the real list length: repeat count times len(test_chars).
    plt.plot([len(test_chars)*n for n in orders_n], 
             times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
#plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different string concatenation methods')
# Per-size ratio += time / join time; min and max feed the caption.
max_perf = max( a/j for a,j in zip(times_n['string_add'],
                                   times_n['string_join']) )
min_perf = min( a/j for a,j in zip(times_n['string_add'],
                                   times_n['string_join']) )

ftext = '"".join(chars) is {:.2f}x to {:.2f}x faster than new_str += char'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')
plt.show()

import timeit

# Number of append rounds used by the quick %timeit run of the assembly functions.
n = 1000

def plus_operator(n):
    """Start from 'a' and append '1' and '2' n times using the + operator."""
    my_str = 'a'
    for i in range(n):
        my_str = my_str + str(1) + str(2)
    return my_str 
    
def format_method(n):
    """Start from 'a' and append '1' and '2' n times using str.format().

    Returns the assembled string, like plus_operator and binary_operator do.
    """
    my_str = 'a'
    for i in range(n):
        my_str = '{}{}{}'.format(my_str,1,2)
    # The original fell off the end and returned None, unlike its siblings.
    return my_str
    
def binary_operator(n):
    """Start from 'a' and append '1' and '2' n times using the % operator."""
    my_str = 'a'
    for i in range(n):
        my_str = '%s%s%s' %(my_str,1,2)
    return my_str

# One-off timing at the current module-level n (IPython line magics).
%timeit plus_operator(n)
%timeit format_method(n)
%timeit binary_operator(n)

funcs = ['plus_operator', 'format_method', 'binary_operator']

# Input sizes 10, 100, 1000, 10000 (one order of magnitude fewer than the
# other benchmarks in this file).
orders_n = [10**n for n in range(1, 5)]
times_n = {f:[] for f in funcs}

# The loop rebinds the global n; the Timer setup imports n from __main__.
for n in orders_n:
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(n)' %f, 
                      'from __main__ import %s, n' %f)
                              .repeat(repeat=3, number=1000)))

%pylab inline

import matplotlib.pyplot as plt

labels = [('plus_operator', 'my_str + str(1) + str(2)'), 
          ('format_method', '"{}{}{}".format(my_str,1,2)'),
          ('binary_operator', '"%s%s%s" %(my_str,1,2)'),
          ] 

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
#plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different string assembly methods')

max_perf = max( p/b for p,b in zip(times_n['plus_operator'],
                                   times_n['binary_operator']) )
min_perf = min( p/b for p,b in zip(times_n['plus_operator'],
                                   times_n['binary_operator']) )

ftext = '"%s%s%s" %(my_str,1,2) is {:.2f}x to'\
        '{:.2f}x faster than my_str + str(1) + str(2)'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')
plt.show()

import timeit

def string_is_int(a_str):
    """Report whether int() accepts a_str.

    Returns True when int(a_str) succeeds and False when it raises
    ValueError. Any other exception propagates to the caller.
    """
    try:
        int(a_str)
    except ValueError:
        return False
    else:
        return True

# One string that int() accepts and one that it rejects.
an_int = '123'
no_int = '123abc'

%timeit string_is_int(an_int)
%timeit string_is_int(no_int)
%timeit an_int.isdigit()
%timeit no_int.isdigit()

funcs = ['string_is_int', 'isdigit']
t1 = '123'
t2 = '123abc'
# Best timings, one entry per test string, in [t1, t2] order.
isdigit_method = []
string_is_int_method = []

# t must be a module-level name: both Timer setups import it from __main__.
for t in [t1,t2]:
    # Best of 3 repeats. Each repeat is the total number of seconds for
    # 1,000,000 calls, i.e. numerically the microseconds per call.
    string_is_int_method.append(min(timeit.Timer('string_is_int(t)', 
                      'from __main__ import string_is_int, t')
                              .repeat(repeat=3, number=1000000)))
    isdigit_method.append(min(timeit.Timer('t.isdigit()', 
                      'from __main__ import t')
                              .repeat(repeat=3, number=1000000)))

# Grouped bar chart: per test string, one bar for string_is_int() and one
# for str.isdigit().
# NOTE(review): np and plt are not imported explicitly before this point in
# the section; presumably %pylab provides them -- confirm.
%pylab inline

N = len(isdigit_method)
ind = np.arange(N)  # the x locations for the groups
width = 0.25       # the width of the bars

 
fig, ax = plt.subplots()
plt.bar(ind, 
        [i for i in string_is_int_method], 
        width,
        alpha=0.5,
        color='g',
        label='string_is_int(a_str)')

# Second series is shifted right by one bar width.
plt.bar(ind + width, 
        [i for i in isdigit_method], 
        width,
        alpha=0.5,
        color='b',
        label='a_str.isdigit()')
    
ax.set_ylabel('time in microseconds')
ax.set_title('Time to check if a string is an integer')
ax.set_xticks(ind + width)
# Same [t1, t2] order in which the timings were collected.
ax.set_xticklabels(['"%s"' %t for t in [t1, t2]])
plt.xlabel('test strings')
plt.xlim(-0.1,1.6)
#plt.ylim(0,15)
plt.legend(loc='upper left')
plt.show()

import timeit

def string_is_number(a_str):
    """Report whether float() accepts a_str.

    Returns True when float(a_str) succeeds and False when it raises
    ValueError. Any other exception propagates to the caller.
    """
    try:
        float(a_str)
    except ValueError:
        return False
    else:
        return True
    
a_float = '1.234'
no_float = '123abc'

a_float.replace('.','',1).isdigit()
no_float.replace('.','',1).isdigit()

%timeit string_is_number(an_int)
%timeit string_is_number(no_int)
%timeit a_float.replace('.','',1).isdigit()
%timeit no_float.replace('.','',1).isdigit()

funcs = ["string_is_number", "replace('.','',1).isdigit()"]
t1 = '1.234'
t2 = '123abc'
isdigit_method = []
string_is_number_method = []

for t in [t1,t2]:
    string_is_number_method.append(min(timeit.Timer('string_is_number(t)', 
                      'from __main__ import string_is_number, t')
                              .repeat(repeat=3, number=1000000)))
    isdigit_method.append(min(timeit.Timer("t.replace('.','',1).isdigit()", 
                      'from __main__ import t')
                              .repeat(repeat=3, number=1000000)))

%pylab inline

N = len(isdigit_method)
ind = np.arange(N)  # the x locations for the groups
width = 0.25       # the width of the bars

 
fig, ax = plt.subplots()

plt.bar(ind , 
        [i for i in isdigit_method], 
        width,
        alpha=0.5,
        color='b',
        label="a_str.replace('.','',1).isdigit()")

plt.bar(ind + width, 
        [i for i in string_is_number_method], 
        width,
        alpha=0.5,
        color='g',
        label='string_is_number(a_str)')

    
ax.set_ylabel('time in microseconds')
ax.set_title('Time to check if a string is a number')
ax.set_xticks(ind + width)
ax.set_xticklabels(['"%s"' %t for t in [t2, t1]])
plt.xlabel('test strings')
plt.xlim(-0.1,1.6)
#plt.ylim(0,15)
plt.legend(loc='upper left')
plt.show()

import timeit
import copy

def reverse_func(my_list):
    """Return a reversed deep copy of my_list, leaving my_list untouched.

    list.reverse() works in place and returns None, so the original
    one-liner `copy.deepcopy(my_list).reverse()` always returned None.
    The deep copy is kept so the measured work still matches the
    'copy.deepcopy(my_list).reverse()' label used in the plot.
    """
    new_list = copy.deepcopy(my_list)
    new_list.reverse()
    return new_list
    
def reversed_func(my_list):
    """Return a new list with the items of my_list in reverse order, via reversed()."""
    return list(reversed(my_list))

def reverse_slizing(my_list):
    """Return my_list reversed with a step -1 slice.

    Rebinds the name of the string-reversal function defined earlier in
    this file; the body is the same expression.
    """
    return my_list[::-1]

n = 10
test_list = list([i for i in range(n)])

%timeit reverse_func(test_list)
%timeit reversed_func(test_list)
%timeit reverse_slizing(test_list)

funcs = ['reverse_func', 'reversed_func',
         'reverse_slizing']

orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

for n in orders_n:
    test_list = list([i for i in range(n)])
    for f in funcs:
        times_n[f].append(min(timeit.Timer('%s(test_list)' %f, 
                      'from __main__ import %s, test_list' %f)
                              .repeat(repeat=3, number=1000)))

# Log-log plot of the best timing per list length for the three list
# reversal approaches.
%pylab inline

import matplotlib.pyplot as plt

# (key into times_n, legend text)
labels = [('reverse_func', 'copy.deepcopy(my_list).reverse()'), 
          ('reversed_func', 'list(reversed(my_list))'),
          ('reverse_slizing', 'my_list[::-1]'),
          ] 

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
#plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different list reversing approaches')

# Per-size ratio deepcopy+reverse time / slice time; min and max feed the
# caption.
max_perf = max( f/s for f,s in zip(times_n['reverse_func'],
                                   times_n['reverse_slizing']) )
min_perf = min( f/s for f,s in zip(times_n['reverse_func'],
                                   times_n['reverse_slizing']) )

ftext = 'my_list[::-1] is {:.2f}x to '\
        '{:.2f}x faster than copy.deepcopy(my_list).reverse()'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')

plt.show()

import timeit

def cond_loop(n):
    """Return the even numbers in range(n), collected with an explicit loop."""
    even_nums = []
    for i in range(n):
        if i % 2 == 0:
            even_nums.append(i)
    return even_nums

def list_compr(n):
    """Return the even numbers in range(n), collected with a list comprehension."""
    even_nums = [i for i in range(n) if i % 2 == 0]
    return even_nums
    
def filter_func(n):
    """Return the even numbers in range(n), collected with filter() and a lambda.

    The predicate is `x % 2 == 0` so the result matches cond_loop and
    list_compr. The original tested `!= 0` and returned the odd numbers.
    """
    even_nums = list(filter((lambda x: x % 2 == 0), range(n)))
    return even_nums

# One-off timing at whatever value the module-level n currently holds.
%timeit cond_loop(n)
%timeit list_compr(n)
%timeit filter_func(n)

funcs = ['cond_loop', 'list_compr',
         'filter_func']

# Input sizes 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

# The loop rebinds the global n; the Timer setup imports n from __main__.
for n in orders_n:
    # NOTE(review): test_list is rebuilt here but the timed statement only
    # uses n, so this assignment does not affect the measurement.
    test_list = list([i for i in range(n)])
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(n)' %f, 
                      'from __main__ import %s, n' %f)
                              .repeat(repeat=3, number=1000)))

%pylab inline

import matplotlib.pyplot as plt

labels = [('cond_loop', 'explicit for-loop'), 
          ('list_compr', 'list comprehension'),
          ('filter_func', 'lambda function'),
          ] 

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
#plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different conditional list creation methods')

max_perf = max( f/c for f,c in zip(times_n['filter_func'],
                                   times_n['cond_loop']) )
min_perf = min( f/c for f,c in zip(times_n['filter_func'],
                                   times_n['cond_loop']) )

ftext = 'the list comprehension is {:.2f}x to '\
        '{:.2f}x faster than the lambda function'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')

plt.show()

import random
import timeit
from collections import defaultdict


def add_element_check1(elements):
    """Count occurrences using `if e not in d` with an else branch (v1).

    Returns a dict that maps each item of elements to how often it occurs.
    """
    d = dict()
    for e in elements:
        if e not in d:
            d[e] = 1
        else:
            d[e] += 1
    return d
            
def add_element_check2(elements):
    """Count occurrences using `if e not in d` to seed a zero (v2).

    Unlike v1 there is no else branch: a missing key is initialised to 0
    and the increment then runs for every item. Returns a dict that maps
    each item of elements to how often it occurs.
    """
    d = dict()
    for e in elements:
        if e not in d:
            d[e] = 0
        d[e] += 1            
    return d
        
def add_element_except(elements):
    """Count occurrences using try/except KeyError.

    The increment is attempted first; a KeyError on the first sighting of
    an item sets its count to 1. Returns a dict of item -> count.
    """
    d = dict()
    for e in elements:
        try:
            d[e] += 1
        except KeyError:
            d[e] = 1
    return d
            
def add_element_defaultdict(elements):
    """Count occurrences using a defaultdict(int).

    Returns the defaultdict itself (item -> count), not a plain dict.
    """
    d = defaultdict(int)
    for e in elements:
        d[e] += 1
    return d

def add_element_get(elements):
    """Count occurrences using dict.get() with a default.

    Returns a dict that maps each item of elements to how often it occurs.
    """
    d = dict()
    for e in elements:
        # The default must be 0: with the original default of 1 the first
        # sighting stored 2, so every count was one too high.
        d[e] = d.get(e, 0) + 1
    return d


# Fixed seed so the random inputs are reproducible.
random.seed(123)

# NOTE(review): randrange excludes its stop value, so the inputs below hold
# 1-9, 1-4 and 1-999 respectively; the printed headings name the stop value.
print('Results for 100 integers in range 1-10') 
rand_ints = [random.randrange(1, 10) for i in range(100)]
%timeit add_element_check1(rand_ints)
%timeit add_element_check2(rand_ints)
%timeit add_element_except(rand_ints)
%timeit add_element_defaultdict(rand_ints)
%timeit add_element_get(rand_ints)

print('\nResults for 1000 integers in range 1-5')            
rand_ints = [random.randrange(1, 5) for i in range(1000)]
%timeit add_element_check1(rand_ints)
%timeit add_element_check2(rand_ints)
%timeit add_element_except(rand_ints)
%timeit add_element_defaultdict(rand_ints)
%timeit add_element_get(rand_ints)

print('\nResults for 1000 integers in range 1-1000')            
rand_ints = [random.randrange(1, 1000) for i in range(1000)]
%timeit add_element_check1(rand_ints)
%timeit add_element_check2(rand_ints)
%timeit add_element_except(rand_ints)
%timeit add_element_defaultdict(rand_ints)
%timeit add_element_get(rand_ints)

funcs = ['add_element_check1', 'add_element_check2',
         'add_element_except', 'add_element_defaultdict',
         'add_element_get']

# Input lengths 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

for n in orders_n:
    # elements must be a module-level name: the Timer setup imports it
    # from __main__. Values are drawn from randrange(1, 100).
    elements = [random.randrange(1, 100) for i in range(n)]
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(elements)' %f, 
                      'from __main__ import %s, elements' %f)
                              .repeat(repeat=3, number=1000)))

# Log-log plot of the best timing per input length for the five counting
# approaches. This figure has no speed-up caption.
%pylab inline

import matplotlib.pyplot as plt

# (key into times_n, legend text)
labels = [('add_element_check1', 'if ele not in dict (v1)'), 
          ('add_element_check2', 'if ele not in dict (v2)'),
          ('add_element_except', 'try-except'),
          ('add_element_defaultdict', 'defaultdict'),
          ('add_element_get', '.get() method')
          ] 

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,10))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
#plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different methods to count elements in a dictionary')

plt.show()

import timeit
# Range size used by the quick %timeit runs of the loop/comprehension pairs.
n = 1000

def set_loop(n):
    """Return the set of multiples of 3 in range(n), built with an explicit loop."""
    a_set = set()
    for i in range(n):
        if i % 3 == 0:
            a_set.add(i)
    return a_set

def set_compr(n):
    """Return the set of multiples of 3 in range(n), built with a set comprehension."""
    return {i for i in range(n) if i % 3 == 0}

# Quick comparison of the two set builders at the module-level n.
%timeit set_loop(n)
%timeit set_compr(n)

def list_loop(n):
    """Return the multiples of 3 in range(n), collected with an explicit loop."""
    a_list = list()
    for i in range(n):
        if i % 3 == 0:
            a_list.append(i)
    return a_list

def list_compr(n):
    """Return the multiples of 3 in range(n), collected with a list comprehension.

    Rebinds the name of the earlier list_compr in this file, which
    filtered on `i % 2 == 0` instead.
    """
    return [i for i in range(n) if i % 3 == 0]

# Quick comparison of the two list builders at the module-level n.
%timeit list_loop(n)
%timeit list_compr(n)

funcs = ['list_loop', 'list_compr']
# Input sizes 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

# The loop rebinds the global n; the Timer setup imports n from __main__.
for n in orders_n:
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(n)' %f, 
                      'from __main__ import %s, n' %f)
                              .repeat(repeat=3, number=1000)))

# Log-log plot of the best timing per input size: explicit loop versus
# list comprehension.
%pylab inline

import matplotlib.pyplot as plt

# (key into times_n, legend text)
labels = [('list_loop', 'explicit for-loop'), 
          ('list_compr', 'list comprehension')]

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
#plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of explicit for-loops vs. list comprehensions')

# Per-size ratio loop time / comprehension time; min and max feed the caption.
max_perf = max( l/c for l,c in zip(times_n['list_loop'],
                                   times_n['list_compr']) )
min_perf = min( l/c for l,c in zip(times_n['list_loop'],
                                   times_n['list_compr']) )

ftext = 'the list comprehension is {:.2f}x to '\
        '{:.2f}x faster than the explicit for-loop'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')

plt.show()

def dict_loop(n):
    """Return {i: i} for the multiples of 3 in range(n), built with an explicit loop."""
    a_dict = dict()
    for i in range(n):
        if i % 3 == 0:
            a_dict[i] = i
    return a_dict

def dict_compr(n):
    """Return {i: i} for the multiples of 3 in range(n), built with a dict comprehension."""
    return {i:i for i in range(n) if i % 3 == 0}

# Quick comparison of the two dict builders at whatever value the
# module-level n currently holds.
%timeit dict_loop(n)
%timeit dict_compr(n)

import subprocess

def subprocess_findcopy(path, search_str, dest):
    """Copy every file under path whose name matches search_str into dest.

    Runs the external command `find <path> -name <search_str> -exec cp {}
    <dest> ;`. The arguments are passed as a list without a shell, so
    paths and patterns that contain spaces or shell metacharacters reach
    find unchanged. The original interpolated them into a shell string,
    which broke on such paths and relied on the invalid escape '\\;'.

    Returns None; the exit status of find is not inspected.
    """
    query = ['find', path, '-name', search_str,
             '-exec', 'cp', '{}', dest, ';']
    subprocess.call(query)
    return

import shutil
import os
import fnmatch

def walk_findcopy(path, search_str, dest):
    """Copy every file under path whose name matches search_str into dest.

    Walks the tree with os.walk and matches file names with
    fnmatch.filter. Files that already are their own copy target (which
    happens when dest lies inside path) are skipped.

    Returns None.
    """
    # A separate loop name avoids shadowing the `path` parameter.
    for dirpath, subdirs, files in os.walk(path):
        for name in fnmatch.filter(files, search_str):
            try:
                shutil.copy(os.path.join(dirpath, name), dest)
            except shutil.SameFileError:
                # The original caught NameError, which shutil.copy never
                # raises, so walking into dest crashed here instead.
                pass
    return

import timeit


def findcopy_timeit(inpath, outpath, search_str):
    
    shutil.rmtree(outpath)
    os.mkdir(outpath)
    print(50*'#')
    print('subprocsess call')
    %timeit subprocess_findcopy(inpath, search_str, outpath)
    print("copied %s files" %len(os.listdir(outpath)))
    shutil.rmtree(outpath)
    os.mkdir(outpath)
    print('\nos.walk approach')
    %timeit walk_findcopy(inpath, search_str, outpath)
    print("copied %s files" %len(os.listdir(outpath)))
    print(50*'#')

# Two runs of the comparison. Both use hard-coded absolute paths from the
# author's machine, and findcopy_timeit empties outpath on every call.
print('small tree')
inpath = '/Users/sebastian/Desktop/testdir_in'
outpath = '/Users/sebastian/Desktop/testdir_out'
search_str = '*.png'
findcopy_timeit(inpath, outpath, search_str)

print('larger tree')
inpath = '/Users/sebastian/Dropbox'
outpath = '/Users/sebastian/Desktop/testdir_out'
search_str = '*.csv'
findcopy_timeit(inpath, outpath, search_str)


import numpy as np

# 1st column, e.g., A[:,0,np.newaxis]

def colvec_method1(A):
    """Yield each column of the 2-D array A as an (n_rows, 1) array.

    Iterates over the rows of A.T (the columns of A) and adds a trailing
    axis with np.newaxis.
    """
    for col in A.T:
        # The original indexed an undefined name `row` here (NameError).
        colvec = col[:,np.newaxis]
        yield colvec

# 1st column, e.g., A[:,0:1]

def colvec_method2(A):
    """Yield each column of the 2-D array A as an (n_rows, 1) array.

    Uses a width-one column slice, which keeps the second axis.
    """
    n_cols = A.shape[1]
    for j in range(n_cols):
        yield A[:,j:j+1]

# 1st column, e.g., A[:,0].reshape(-1,1)

def colvec_method3(A):
    """Yield each column of the 2-D array A as an (n_rows, 1) array.

    Takes the 1-D column and reshapes it to (-1, 1).
    """
    n_cols = A.shape[1]
    for j in range(n_cols):
        yield A[:,j].reshape(-1,1)

# 1st column, e.g., np.vstack(A[:,0])

def colvec_method4(A):
    """Yield each column of the 2-D array A as an (n_rows, 1) array.

    Passes the 1-D column to np.vstack, which stacks its items vertically.
    """
    n_cols = A.shape[1]
    for j in range(n_cols):
        yield np.vstack(A[:,j])

# 1st column, e.g., np.row_stack(A[:,0])

def colvec_method5(A):
    """Yield each column of the 2-D array A as an (n_rows, 1) array.

    Passes the 1-D column to np.row_stack.
    """
    n_cols = A.shape[1]
    for j in range(n_cols):
        yield np.row_stack(A[:,j])

# 1st column, e.g., np.column_stack((A[:,0],))

def colvec_method6(A):
    """Yield each column of the 2-D array A as an (n_rows, 1) array.

    Wraps the 1-D column in a one-item tuple for np.column_stack.
    """
    n_cols = A.shape[1]
    for j in range(n_cols):
        yield np.column_stack((A[:,j],))

# 1st column, e.g., A[:,[0]]

def colvec_method7(A):
    """Yield each column of the 2-D array A as an (n_rows, 1) array.

    Indexes the column axis with a one-item list, which keeps that axis.
    """
    n_cols = A.shape[1]
    for j in range(n_cols):
        yield A[:,[j]]

def test_method(method, A):
    """Consume method(A) and assert that every yielded item is (n_rows, 1).

    method is a callable that takes A and returns an iterable of arrays.
    Raises AssertionError naming the actual and the expected shape.
    """
    for i in method(A): 
        # The original passed three arguments to a two-placeholder format
        # string, so the message showed only A.shape[0] as "expected".
        expected = (A.shape[0], 1)
        assert i.shape == expected, "{}, {}".format(i.shape, expected)

import timeit

A = np.random.random((300, 3))

for method in [
            colvec_method1, colvec_method2, 
            colvec_method3, colvec_method4, 
            colvec_method5, colvec_method6,
            colvec_method7]:
    print('\nTest:', method.__name__)
    %timeit test_method(colvec_method2, A)

from numpy import sum as np_sum
import timeit

# One million integers in a plain Python list.
samples = list(range(1000000))

%timeit(sum(samples))
%timeit(np_sum(samples))

funcs = ['sum', 'np_sum']
# Input lengths 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

for n in orders_n:
    # samples must be a module-level name: the Timer setups import it from
    # __main__. It is a Python list for both functions, so np_sum receives
    # a list rather than an array.
    samples = list(range(n))
    # Best of 3 repeats of 1000 calls each (total seconds per repeat).
    times_n['sum'].append(min(timeit.Timer('sum(samples)', 
                'from __main__ import samples')
                    .repeat(repeat=3, number=1000)))
    times_n['np_sum'].append(min(timeit.Timer('np_sum(samples)', 
                'from __main__ import np_sum, samples')
                    .repeat(repeat=3, number=1000)))

%pylab inline

import matplotlib.pyplot as plt

labels = [('sum', 'in-built sum() function'), 
          ('np_sum', 'numpy.sum() function')]

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of explicit for-loops vs. list comprehensions')

max_perf = max( n/i for i,n in zip(times_n['sum'],
                                   times_n['np_sum']) )
min_perf = min( n/i for i,n in zip(times_n['sum'],
                                   times_n['np_sum']) )

ftext = 'the in-built sum() is {:.2f}x to '\
        '{:.2f}x faster than the numpy.sum()'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')

plt.show()

from numpy import arange as np_arange

# Number of loop iterations for the quick %timeit comparison of range and arange.
n = 1000000

def loop_range(n):
    """Iterate over range(n) with an empty loop body; returns None."""
    for i in range(n):
        pass
    return

def loop_arange(n):
    """Iterate over numpy.arange(n) with an empty loop body; returns None."""
    for i in np_arange(n):
        pass
    return

# One-off timing at the module-level n.
%timeit(loop_range(n))
%timeit(loop_arange(n))

funcs = ['loop_range', 'loop_arange']
# Iteration counts 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

# The loop rebinds the global n; the Timer setup imports n from __main__.
for n in orders_n:
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(n)' %f, 
                'from __main__ import %s, n' %f)
                    .repeat(repeat=3, number=1000)))

import matplotlib.pyplot as plt

# Log-log plot of the best timing per iteration count: looping over the
# built-in range() versus numpy.arange().
# (key into times_n, legend text)
labels = [('loop_range', 'in-built range()'), 
          ('loop_arange', 'numpy.arange()')]

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
# The original title was copied from the for-loop vs. list comprehension
# figure and did not describe this plot.
plt.title('Performance of looping over range() vs. numpy.arange()')

# Per-size ratio arange time / range time; min and max feed the caption.
max_perf = max( a/r for r,a in zip(times_n['loop_range'],
                                   times_n['loop_arange']) )
min_perf = min( a/r for r,a in zip(times_n['loop_range'],
                                   times_n['loop_arange']) )

ftext = 'the in-built range() is {:.2f}x to '\
        '{:.2f}x faster than numpy.arange()'\
        .format(min_perf, max_perf)
plt.figtext(.14,.75, ftext, fontsize=11, ha='left')

plt.show()

# The statistics module has been added to
# the standard library in Python 3.4

import timeit
import statistics as stats
import numpy as np

def calc_mean(samples):
    """Return the arithmetic mean of samples as sum(samples) / len(samples)."""
    return sum(samples)/len(samples)

def np_mean(samples):
    """Return numpy.mean(samples); the benchmark calls this with a Python list."""
    return np.mean(samples)

def np_mean_ary(np_array):
    """Return numpy.mean(np_array); the benchmark calls this with a NumPy array."""
    return np.mean(np_array)

def st_mean(samples):
    """Return statistics.mean(samples)."""
    return stats.mean(samples)

# The same 0..n-1 data as a Python list and as a NumPy array.
n = 1000000
samples = list(range(n))
samples_array = np.arange(n)

# Sanity check: all four implementations agree on this input.
assert(st_mean(samples) == np_mean(samples)
       == calc_mean(samples) == np_mean_ary(samples_array))

%timeit(calc_mean(samples))
%timeit(np_mean(samples))
%timeit(np_mean_ary(samples_array))
%timeit(st_mean(samples))

funcs = ['st_mean', 'np_mean', 'calc_mean', 'np_mean_ary']
# Input lengths 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

for n in orders_n:
    samples_list = list(range(n))
    samples_ary = np.arange(n)
    for f in funcs:
        # Pick the input for each function explicitly. The original
        # overwrote `samples` with an array once 'np_mean_ary' came up,
        # which only worked because that name is last in funcs.
        # samples must be a module-level name: the Timer setup imports it
        # from __main__.
        samples = samples_ary if f == 'np_mean_ary' else samples_list
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(samples)' %f, 
                'from __main__ import %s, samples' %f)
                    .repeat(repeat=3, number=1000)))

# Log-log plot of the best timing per input length for the four ways of
# computing a mean.
%pylab inline

import matplotlib.pyplot as plt

# (key into times_n, legend text)
labels = [('st_mean', 'statistics.mean()'), 
          ('np_mean', 'numpy.mean() on list'),
          ('np_mean_ary', 'numpy.mean() on array'),
          ('calc_mean', 'sum(samples)/len(samples)')
          ]

matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], 
             alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
plt.title('Performance of different approaches for calculating sample means')

# Per-size ratio statistics.mean time / numpy.mean-on-array time; min and
# max feed the caption.
max_perf = max( s/c for s,c in zip(times_n['st_mean'],
                                   times_n['np_mean_ary']) )
min_perf = min( s/c for s,c in zip(times_n['st_mean'],
                                   times_n['np_mean_ary']) )

ftext = 'using numpy.mean() on np.arrays is {:.2f}x to '\
        '{:.2f}x faster than statistics.mean() on lists'\
        .format(min_perf, max_perf)
# This caption sits near the bottom of the figure (y=.15).
plt.figtext(.14,.15, ftext, fontsize=11, ha='left')

plt.show()

def py_lstsqr(x, y):
    """Fit y = slope*x + intercept by least squares in pure Python.

    x and y are equally long sequences of numbers. Returns the tuple
    (slope, y_interc). Raises ZeroDivisionError when x is empty or when
    all x values equal their mean (var_x stays 0).
    """

    x_avg = sum(x)/len(x)
    y_avg = sum(y)/len(y)
    # Unnormalised sums: the 1/n factors cancel in cov_xy / var_x.
    var_x = 0
    cov_xy = 0
    for x_i, y_i in zip(x,y):
        temp = (x_i - x_avg)
        var_x += temp**2
        cov_xy += temp*(y_i - y_avg)
    slope = cov_xy / var_x
    y_interc = y_avg - slope*x_avg
    return (slope, y_interc)

%load_ext cythonmagic

%%cython

def cy_lstsqr(x, y):
    """Fit y = slope*x + intercept by least squares (Cython version).

    Same statements as py_lstsqr, with every local declared as a C double.
    Returns the tuple (slope, y_interc).
    """
    cdef double x_avg, y_avg, temp, var_x, cov_xy, slope, y_interc, x_i, y_i
    x_avg = sum(x)/len(x)
    y_avg = sum(y)/len(y)
    # Unnormalised sums: the 1/n factors cancel in cov_xy / var_x.
    var_x = 0
    cov_xy = 0
    for x_i, y_i in zip(x,y):
        temp = (x_i - x_avg)
        var_x += temp**2
        cov_xy += temp*(y_i - y_avg)
    slope = cov_xy / var_x
    y_interc = y_avg - slope*x_avg
    return (slope, y_interc)

%pylab inline

from matplotlib import pyplot as plt

import timeit
import random
random.seed(12345)

n = 500
x = [x_i*random.randrange(8,12)/10 for x_i in range(n)]
y = [y_i*random.randrange(10,14)/10 for y_i in range(n)]

slope, intercept = cy_lstsqr(x, y)

line_x = [round(min(x)) - 1, round(max(x)) + 1]
line_y = [slope*x_i + intercept for x_i in line_x]

plt.figure(figsize=(8,8))
plt.scatter(x,y)
plt.plot(line_x, line_y, color='red', lw='2')

plt.ylabel('y')
plt.xlabel('x')
plt.title('Linear regression via least squares fit')

ftext = 'y = ax + b = {:.3f} + {:.3f}x'\
        .format(slope, intercept)
plt.figtext(.15,.8, ftext, fontsize=11, ha='left')

plt.show()

import timeit
import random
# Fixed seed so the random data sets are reproducible.
random.seed(12345)

funcs = ['py_lstsqr', 'cy_lstsqr']

# Data set sizes 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

for n in orders_n:
    # x and y must be module-level names: the Timer setup imports them
    # from __main__.
    x = [x_i*random.randrange(8,12)/10 for x_i in range(n)]
    y = [y_i*random.randrange(10,14)/10 for y_i in range(n)]
    for f in funcs:
        # Best of 3 repeats of 1000 calls each (total seconds per repeat).
        times_n[f].append(min(timeit.Timer('%s(x,y)' %f, 
                      'from __main__ import %s, x, y' %f)
                              .repeat(repeat=3, number=1000)))

import matplotlib.pyplot as plt

# Log-log plot of the best timing per data set size: pure Python versus
# Cython least-squares fit.
# (key into times_n, legend text)
labels = [('py_lstsqr', 'regular Python (CPython)'), 
          ('cy_lstsqr', 'Cython implementation')]


matplotlib.rcParams.update({'font.size': 12})

fig = plt.figure(figsize=(10,8))
for lb in labels:
    plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
# Per-size ratio Python time / Cython time; min and max feed the caption.
max_perf = max( py/nu for py,nu in zip(times_n['py_lstsqr'],
                                   times_n['cy_lstsqr']) )
min_perf = min( py/nu for py,nu in zip(times_n['py_lstsqr'],
                                   times_n['cy_lstsqr']) )
ftext = 'Using Cython is {:.2f}x to '\
        '{:.2f}x faster than regular (C)Python'\
        .format(min_perf, max_perf)
plt.figtext(.15,.8, ftext, fontsize=11, ha='left')
plt.title('Performance of least square fit implementations in Cython and (C)Python')
plt.show()

import numpy as np
import scipy.stats
from numba import jit

%load_ext cythonmagic

def py_mat_lstsqr(x, y):
    """Fit y = slope*x + intercept via the normal equations.

    Builds the design matrix [x, 1] and evaluates inv(X^T X) X^T y.
    Returns a length-2 array holding (slope, intercept).
    """
    ones = np.ones(len(x))
    design = np.vstack([x, ones]).T
    gram_inv = np.linalg.inv(design.T.dot(design))
    projector = gram_inv.dot(design.T)
    return projector.dot(y)

@jit
def numba_mat_lstsqr(x, y):
    """Fit y = slope*x + intercept via the normal equations (Numba @jit version).

    Same body as py_mat_lstsqr: builds the design matrix [x, 1] and
    evaluates inv(X^T X) X^T y.
    """
    X = np.vstack([x, np.ones(len(x))]).T
    return (np.linalg.inv(X.T.dot(X)).dot(X.T)).dot(y)

%%cython
# A %%cython cell is compiled as its own extension module and cannot see
# the notebook's globals, so numpy has to be imported inside the cell.
import numpy as np

def cy_mat_lstsqr(x, y):
    """Fit y = slope*x + intercept via the normal equations (Cython version).

    Same body as py_mat_lstsqr: builds the design matrix [x, 1] and
    evaluates inv(X^T X) X^T y.
    """
    X = np.vstack([x, np.ones(len(x))]).T
    return (np.linalg.inv(X.T.dot(X)).dot(X.T)).dot(y)

def py_lstsqr(x, y):
    """Fit y = slope*x + intercept by least squares in pure Python.

    x and y are equally long sequences of numbers. Returns the tuple
    (slope, y_interc). Raises ZeroDivisionError when x is empty or when
    all x values equal their mean (var_x stays 0).
    """
    x_avg = sum(x)/len(x)
    y_avg = sum(y)/len(y)
    # Unnormalised sums: the 1/n factors cancel in cov_xy / var_x.
    var_x = 0
    cov_xy = 0
    for x_i, y_i in zip(x,y):
        temp = (x_i - x_avg)
        var_x += temp**2
        cov_xy += temp*(y_i - y_avg)
    slope = cov_xy / var_x
    y_interc = y_avg - slope*x_avg
    return (slope, y_interc)

@jit
def numba_lstsqr(x, y):
    """Fit y = slope*x + intercept by least squares (Numba @jit version).

    Same statements as py_lstsqr. Returns the tuple (slope, y_interc).
    """
    x_avg = sum(x)/len(x)
    y_avg = sum(y)/len(y)
    # Unnormalised sums: the 1/n factors cancel in cov_xy / var_x.
    var_x = 0
    cov_xy = 0
    for x_i, y_i in zip(x,y):
        temp = (x_i - x_avg)
        var_x += temp**2
        cov_xy += temp*(y_i - y_avg)
    slope = cov_xy / var_x
    y_interc = y_avg - slope*x_avg
    return (slope, y_interc)

%%cython
def cy_lstsqr(x, y):
    """ Computes the least-squares solution to a linear matrix equation.

    Same algorithm as ``py_lstsqr``, with every local declared as a C
    ``double``. Returns the tuple ``(slope, y_interc)``.
    """
    # Typed locals: all intermediates and the loop variables are C doubles.
    cdef double x_avg, y_avg, temp, var_x, cov_xy, slope, y_interc, x_i, y_i
    # Means of both samples.
    x_avg = sum(x)/len(x)
    y_avg = sum(y)/len(y)
    # Running sums of squared x-deviations and of x/y cross-deviations.
    var_x = 0
    cov_xy = 0
    for x_i, y_i in zip(x,y):
        temp = (x_i - x_avg)
        var_x += temp**2
        cov_xy += temp*(y_i - y_avg)
    slope = cov_xy / var_x
    y_interc = y_avg - slope*x_avg
    return (slope, y_interc)

def numpy_lstsqr(x, y):
    """Fit a line to (x, y) with ``np.linalg.lstsq``.

    Builds the design matrix from ``x`` and a vector of ones and returns
    the first element of the ``lstsq`` result (the solution vector).
    """
    ones = np.ones(len(x))
    design = np.vstack([x, ones]).T
    solution = np.linalg.lstsq(design, y)
    return solution[0]

def scipy_lstsqr(x, y):
    """Fit a line to (x, y) with ``scipy.stats.linregress``.

    Returns only the first two fields of the regression result.
    """
    regression = scipy.stats.linregress(x, y)
    return regression[:2]

import random
# Build a reproducible noisy sample and check that the matrix, NumPy and
# SciPy implementations agree with the plain-Python fit to 6 decimals.
random.seed(12345)

n = 500
x = [i * random.randrange(8, 12) / 10 for i in range(n)]
y = [i * random.randrange(10, 14) / 10 for i in range(n)]

baseline = py_lstsqr(x, y)
np.testing.assert_array_almost_equal(baseline, py_mat_lstsqr(x, y), decimal=6)
np.testing.assert_array_almost_equal(numpy_lstsqr(x, y), baseline, decimal=6)
np.testing.assert_array_almost_equal(scipy_lstsqr(x, y), baseline, decimal=6)

print('ok')

%pylab inline

from matplotlib import pyplot as plt

# Fit the sample with the matrix-based solver, then plot the data points
# together with the fitted line.
slope, intercept = py_mat_lstsqr(x, y)

# Two end points are enough to draw the line; extend one unit past the data.
line_x = [round(min(x)) - 1, round(max(x)) + 1]
line_y = [slope*x_i + intercept for x_i in line_x]

plt.figure(figsize=(7,6))
plt.scatter(x,y)
plt.plot(line_x, line_y, color='red', lw=2)

plt.ylabel('y')
plt.xlabel('x')
plt.title('Linear regression via least squares fit')

# The slope multiplies x and the intercept is the constant term, matching
# the "y = ax + b" form shown in the label.
ftext = 'y = ax + b = {:.3f}x + {:.3f}'\
        .format(slope, intercept)
plt.figtext(.15,.8, ftext, fontsize=11, ha='left')

plt.show()

import timeit
import random
# Seed both generators: the benchmark inputs below are drawn from NumPy's
# global RNG, so seeding only the stdlib ``random`` module would leave them
# different on every run.
random.seed(12345)
np.random.seed(12345)

funcs = ['py_mat_lstsqr', 'numba_mat_lstsqr', 'cy_mat_lstsqr', 
         'py_lstsqr', 'numba_lstsqr', 'cy_lstsqr',
         'numpy_lstsqr', 'scipy_lstsqr']

# Sample sizes 10, 100, ..., 100000 and one list of timings per function.
orders_n = [10**n for n in range(1, 6)]
times_n = {f:[] for f in funcs}

for n in orders_n:
    # x and y are module-level names so the timeit setup string can import
    # them from __main__.
    x = np.asarray([x_i*np.random.randint(8,12)/10 for x_i in range(n)])
    y = np.asarray([y_i*np.random.randint(10,14)/10 for y_i in range(n)])
    for f in funcs:
        # Keep the best of 3 repeats; each repeat times 1000 calls, so the
        # value in seconds equals the per-call time in milliseconds.
        times_n[f].append(min(timeit.Timer('%s(x,y)' %f, 
                      'from __main__ import %s, x, y' %f)
                              .repeat(repeat=3, number=1000)))

import matplotlib.pyplot as plt

# (function name, legend text) for every benchmarked implementation.
labels = [
    ('py_mat_lstsqr', 'matrix equation in reg. (C)Python & NumPy'),
    ('numba_mat_lstsqr', 'matrix equation in Numba'),
    ('cy_mat_lstsqr', 'matrix equation in Cython & NumPy'),
    ('py_lstsqr', '"classic" least squares in reg. (C)Python'),
    ('numba_lstsqr', '"classic" least squares in Numba'),
    ('cy_lstsqr', '"classic" least squares in Cython'),
    ('numpy_lstsqr', 'least squares via np.linalg.lstsq()'),
    ('scipy_lstsqr', 'least_squares via scipy.stats.linregress()'),
]

matplotlib.rcParams.update({'font.size': 12})

# One timing curve per implementation on log-log axes.
fig = plt.figure(figsize=(10, 8))
for func_name, legend_text in labels:
    plt.plot(
        orders_n,
        times_n[func_name],
        label=legend_text,
        marker='o',
        lw=3,
        alpha=0.5,
    )

plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')

# Leave room to the right of the largest sample size.
largest_n = max(orders_n)
plt.xlim([1, largest_n + largest_n * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')

# Ratio of pure-Python time to Cython time for every sample size.
speedups = [t_py / t_cy
            for t_py, t_cy in zip(times_n['py_lstsqr'], times_n['cy_lstsqr'])]
max_perf = max(speedups)
min_perf = min(speedups)

ftext = ('Using Cython is {:.2f}x to '
         '{:.2f}x faster than regular (C)Python').format(min_perf, max_perf)

plt.figtext(.14, .15, ftext, fontsize=11, ha='left')
plt.title('Performance of least square fit implementations')
plt.show()