#!/usr/bin/env python
# coding: utf-8

# # Compute
#
# We often want to interact with other libraries and have them handle pandas objects.

# In[1]:

from IPython.display import Image
Image(filename='data/pydata-ecosystem.png', height=1024, width=1024)

# We are going to look at some interactions with:
#
# - ``scikit-learn``
# - ``statsmodels``
# - ``numba`` & ``cython``
# - ``dask``

# In[2]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_rows = 8
pd.options.display.max_columns = 8

# # Scikit-Learn
#
# http://scikit-learn.org/stable/documentation.html

# Scikit-learn's algorithms all deal with NumPy arrays. The typical workflow is:
#
# - do the data munging in pandas
# - pass a NumPy array to an Estimator
# - wrap the result in a DataFrame or Series

# In[3]:

import sklearn
sklearn.__version__

# In[4]:

from sklearn.datasets import california_housing
data = california_housing.fetch_california_housing()

# In[5]:

X = pd.DataFrame(data.data, columns=data.feature_names)
X

# In[6]:

y = pd.Series(data.target)
y

# In[7]:

from sklearn.ensemble import RandomForestRegressor
# note: in scikit-learn >= 0.18 GridSearchCV lives in sklearn.model_selection
from sklearn.grid_search import GridSearchCV

# In[8]:

get_ipython().run_cell_magic('time', '', 'param_grid = dict(\n    max_features=np.arange(2, 8),\n    max_depth=[2, 4],\n    min_samples_split=[5, 10, 15, 20],\n)\nrfc = RandomForestRegressor(n_estimators=10)\ngs = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1)\ngs.fit(X.values, y.values)\n')

# In[9]:

scores = gs.grid_scores_
scores[:10]

# In[10]:

def unpack_grid_scores(scores):
    """Unpack a list of grid-search results into a tidy DataFrame
    with one row per parameter combination."""
    rows = []
    params = sorted(scores[0].parameters)
    for row in scores:
        mean = row.mean_validation_score
        std = row.cv_validation_scores.std()
        rows.append([mean, std] + [row.parameters[k] for k in params])
    return pd.DataFrame(rows, columns=['mean_', 'std_'] + params)

# In[11]:

scores = unpack_grid_scores(gs.grid_scores_)
scores

# In[12]:

(scores
 .pipe((sns.factorplot, 'data'), x='max_features', y='mean_',
       hue='max_depth', col='min_samples_split')
)

# In[13]:

s = pd.Series(gs.best_estimator_.feature_importances_, index=X.columns)
(s.sort_values()
  .plot
  .barh(figsize=(5, 8))
)

# # Statsmodels
#
# http://statsmodels.sourceforge.net/

# In[14]:

import statsmodels
import statsmodels.api as sm
statsmodels.__version__

# In[15]:

# created in 4. Tidy Data
df = pd.read_hdf('data/games.hdf', 'df')
df

# In[16]:

df.info()

# In[17]:

df['home_win'] = df.home_win.astype(int)

# In[18]:

f = 'home_win ~ home_strength + away_strength + home_rest + away_rest'
res = (sm
       .Logit
       .from_formula(f, df)
       .fit()
      )

# In[19]:

res.summary()

# In[20]:

df2 = df.assign(rest_difference=df.home_rest - df.away_rest,
                spread=df.home_points - df.away_points)
f = 'spread ~ home_strength + away_strength + rest_difference'
res = (sm
       .OLS
       .from_formula(f, df2)
       .fit()
      )

# In[21]:

res.summary()

# # Numba & Cython
#
# http://pandas.pydata.org/pandas-docs/stable/enhancingperf.html

# In[22]:

from numba import jit
import cython

get_ipython().run_line_magic('load_ext', 'cython')

np.random.seed(1234)
pd.set_option('display.max_rows', 12)
s = pd.Series(np.random.randn(int(1e5)))  # randn needs an integer length
com = 20.0

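# All four implementations below compute the same *adjusted* exponentially
# weighted moving average, with smoothing factor $\alpha = 1 / (1 + com)$:
#
# $$y_t = \frac{x_t + (1 - \alpha)\, x_{t-1} + \cdots + (1 - \alpha)^t\, x_0}
#              {1 + (1 - \alpha) + \cdots + (1 - \alpha)^t}$$
#
# In the loops, ``old_weight`` carries the running sum of decayed weights
# (the denominator so far) and ``new_weight`` is the weight of the incoming
# observation, so the denominator never has to be recomputed from scratch.
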
# In[23]:

def python(s):
    output = pd.Series(index=range(len(s)), dtype='float64')
    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = s[0]
    output[0] = weighted_avg

    for i in range(1, len(s)):
        v = s[i]
        old_weight *= (1 - alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg

    return output

# In[24]:

get_ipython().run_cell_magic('cython', '', 'cimport cython\n\n@cython.wraparound(False)\n@cython.boundscheck(False)\ndef _cython(double[:] arr, double com, double[:] output):\n    cdef:\n        double alpha, old_weight, new_weight, weighted_avg, v\n        int i\n\n    alpha = 1. / (1. + com)\n    old_weight = 1.0\n    new_weight = 1.0\n    weighted_avg = arr[0]\n    output[0] = weighted_avg\n\n    for i in range(1, arr.shape[0]):\n        v = arr[i]\n        old_weight *= (1 - alpha)\n        weighted_avg = ((old_weight * weighted_avg) +\n                        (new_weight * v)) / (old_weight + new_weight)\n        old_weight += new_weight\n        output[i] = weighted_avg\n\n    return output\n')

# In[25]:

def cython1(s):
    output = np.empty(len(s), dtype='float64')
    _cython(s.values, com, output)
    return pd.Series(output)

# In[26]:

def cython2(s):
    # pandas' own (Cython-backed) implementation; pd.ewma was deprecated in
    # pandas 0.18 in favour of s.ewm(com=com, adjust=True).mean()
    return pd.ewma(s, com=com, adjust=True)

# In[27]:

@jit
def _numba(arr, output):
    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = arr[0]
    output[0] = weighted_avg

    for i in range(1, arr.shape[0]):
        v = arr[i]
        old_weight *= (1 - alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg

def numba(s):
    output = np.empty(len(s), dtype='float64')
    _numba(s.values, output)
    return pd.Series(output)

# In[28]:

result1 = python(s)
result2 = cython1(s)
result3 = cython2(s)
result4 = numba(s)

result1.equals(result2) and result1.equals(result3) and result1.equals(result4)

# In[29]:

get_ipython().run_line_magic('timeit', 'python(s)')

# In[30]:

get_ipython().run_line_magic('timeit', 'cython1(s)')

# In[31]:

get_ipython().run_line_magic('timeit', 'cython2(s)')

# In[32]:

get_ipython().run_line_magic('timeit', 'numba(s)')

# # Dask
#
# https://dask.readthedocs.org/en/latest/

# In[33]:

import dask.dataframe as dd
from dask import threaded, multiprocessing

# In[34]:

np.random.seed(1234)
N = int(1e7)
df = pd.DataFrame({'key': np.random.randint(0, 1000, size=N),
                   'value': np.random.randn(N)})
ddf = dd.from_pandas(df, npartitions=8)
ddf

# In[35]:

get_ipython().run_line_magic('timeit', "df.groupby('key').value.sum()")

# In[36]:

get_ipython().run_line_magic('timeit', "ddf.groupby('key').value.sum().compute(get=threaded.get)")
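# ``multiprocessing`` was imported above but never used. As a sketch, the same
# aggregation can also be dispatched to dask's process-based scheduler for
# comparison (in newer dask releases the ``get=`` keyword has been replaced by
# ``scheduler='threads'`` / ``scheduler='processes'``). For a groupby like this,
# expect the process pool to be slower than the thread pool, since intermediate
# results have to be serialized between processes.

# In[37]:

get_ipython().run_line_magic('timeit', "ddf.groupby('key').value.sum().compute(get=multiprocessing.get)")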