Compute¶

We oftentimes want to interact with other libraries and have them handle pandas objects.

In [1]:

from IPython.display import Image
# Render the ecosystem overview diagram at a fixed 1024x1024 size.
Image(filename='data/pydata-ecosystem.png', height=1024, width=1024)

Out[1]:

We are going to look at some interactions with:

scikit-learn
statsmodels
numba & cython
dask

In [2]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Keep rendered frames compact: at most 8 rows and 8 columns are shown.
pd.set_option('display.max_rows', 8)
pd.set_option('display.max_columns', 8)

Scikit-Learn¶

http://scikit-learn.org/stable/documentation.html

Scikit-Learn's algorithms all deal with NumPy arrays. Typically:

data munging in pandas
pass numpy array to an Estimator
wrap result in a DataFrame or Series

In [3]:

import sklearn
sklearn.__version__

Out[3]:

'0.17'

In [4]:

# Import the fetcher from the public `sklearn.datasets` namespace rather than
# through the `california_housing` submodule, whose path is deprecated in
# later scikit-learn releases.
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()

In [5]:

# Wrap the raw feature matrix in a DataFrame labelled with the dataset's feature names.
X = pd.DataFrame(data=data.data, columns=data.feature_names)
X

Out[5]:

	MedInc	HouseAge	AveRooms	AveBedrms	Population	AveOccup	Latitude	Longitude
0	8.3252	41	6.984127	1.023810	322	2.555556	37.88	-122.23
1	8.3014	21	6.238137	0.971880	2401	2.109842	37.86	-122.22
2	7.2574	52	8.288136	1.073446	496	2.802260	37.85	-122.24
3	5.6431	52	5.817352	1.073059	558	2.547945	37.85	-122.25
...	...	...	...	...	...	...	...	...
20636	2.5568	18	6.114035	1.315789	356	3.122807	39.49	-121.21
20637	1.7000	17	5.205543	1.120092	1007	2.325635	39.43	-121.22
20638	1.8672	18	5.329513	1.171920	741	2.123209	39.43	-121.32
20639	2.3886	16	5.254717	1.162264	1387	2.616981	39.37	-121.24

20640 rows × 8 columns

In [6]:

# Regression target taken from the same dataset object, wrapped as a Series.
y = pd.Series(data.target)
y

Out[6]:

0        4.526
1        3.585
2        3.521
3        3.413
         ...  
20636    0.771
20637    0.923
20638    0.847
20639    0.894
dtype: float64

In [7]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

In [8]:

%%time
param_grid = dict(
    max_features=np.arange(2, 8),
    max_depth=[2, 4],
    min_samples_split=[5, 10, 15, 20],
)
rfc = RandomForestRegressor(n_estimators=10)
gs = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1)
gs.fit(X.values, y.values)

CPU times: user 835 ms, sys: 142 ms, total: 977 ms
Wall time: 6.91 s

In [9]:

# Raw grid-search results: one entry per parameter combination, each carrying
# the mean validation score, the per-fold scores and the parameters used.
scores = gs.grid_scores_
scores[:10]

Out[9]:

[mean: 0.20146, std: 0.05887, params: {'max_depth': 2, 'min_samples_split': 5, 'max_features': 2},
 mean: 0.25245, std: 0.07831, params: {'max_depth': 2, 'min_samples_split': 10, 'max_features': 2},
 mean: 0.31007, std: 0.05206, params: {'max_depth': 2, 'min_samples_split': 15, 'max_features': 2},
 mean: 0.25166, std: 0.04992, params: {'max_depth': 2, 'min_samples_split': 20, 'max_features': 2},
 mean: 0.28584, std: 0.06496, params: {'max_depth': 2, 'min_samples_split': 5, 'max_features': 3},
 mean: 0.33954, std: 0.08037, params: {'max_depth': 2, 'min_samples_split': 10, 'max_features': 3},
 mean: 0.26840, std: 0.05488, params: {'max_depth': 2, 'min_samples_split': 15, 'max_features': 3},
 mean: 0.25677, std: 0.08151, params: {'max_depth': 2, 'min_samples_split': 20, 'max_features': 3},
 mean: 0.36317, std: 0.09080, params: {'max_depth': 2, 'min_samples_split': 5, 'max_features': 4},
 mean: 0.35095, std: 0.09386, params: {'max_depth': 2, 'min_samples_split': 10, 'max_features': 4}]

In [10]:

def unpack_grid_scores(scores):
    """Flatten grid-search score records into a tidy DataFrame.

    Parameters
    ----------
    scores : iterable
        Records exposing ``parameters`` (a dict), ``mean_validation_score``
        and ``cv_validation_scores`` (an array with a ``.std()`` method),
        such as the entries of ``GridSearchCV.grid_scores_``.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``mean_``, ``std_`` and then one
        column per parameter name, in sorted order. The parameter names are
        taken from the first record. An empty input yields an empty frame
        holding only the ``mean_`` and ``std_`` columns.
    """
    scores = list(scores)
    if not scores:
        # Nothing to unpack; avoid indexing into an empty sequence below.
        return pd.DataFrame(columns=['mean_', 'std_'])

    params = sorted(scores[0].parameters)
    rows = [
        [row.mean_validation_score, row.cv_validation_scores.std()]
        + [row.parameters[k] for k in params]
        for row in scores
    ]
    return pd.DataFrame(rows, columns=['mean_', 'std_'] + params)

In [11]:

# Rebind `scores` to the tidy DataFrame form of the grid-search results.
scores = unpack_grid_scores(gs.grid_scores_)
scores

Out[11]:

	mean_	std_	max_depth	max_features	min_samples_split
0	0.201463	0.058873	2	2	5
1	0.252446	0.078307	2	2	10
2	0.310067	0.052059	2	2	15
3	0.251659	0.049922	2	2	20
...	...	...	...	...	...
44	0.535333	0.076316	4	7	5
45	0.536545	0.064577	4	7	10
46	0.529594	0.075195	4	7	15
47	0.536525	0.064559	4	7	20

48 rows × 5 columns

In [12]:

# seaborn renamed `factorplot` to `catplot`; use the new name when it is
# available and fall back to the old one otherwise. The point plot was
# factorplot's default kind, so request it explicitly for both.
facet_plot = sns.catplot if hasattr(sns, 'catplot') else sns.factorplot
(scores
       .pipe((facet_plot, 'data'), x='max_features', y='mean_', hue='max_depth',
             col='min_samples_split', kind='point')
 )

/Users/jreback/miniconda/envs/python3/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Out[12]:

<seaborn.axisgrid.FacetGrid at 0x10cd4c668>

In [13]:

# Label the best estimator's feature importances with the column names and
# draw them as a horizontal bar chart, smallest first.
s = pd.Series(gs.best_estimator_.feature_importances_, index=X.columns)
s.sort_values().plot.barh(figsize=(5, 8))

Out[13]:

<matplotlib.axes._subplots.AxesSubplot at 0x10d0813c8>

Statsmodels¶

http://statsmodels.sourceforge.net/

In [14]:

import statsmodels
import statsmodels.api as sm
statsmodels.__version__

Out[14]:

'0.6.1'

In [15]:

# Load the games table that was created in 4. Tidy Data
df = pd.read_hdf('data/games.hdf', key='df')
df

Out[15]:

	date	away_team	away_points	home_team	...	away_strength	home_strength	away_rest	home_rest
game_id
0	2014-10-28 22:30:00	Houston Rockets	108	Los Angeles Lakers	...	0.682927	0.256098	NaN	NaN
1	2014-10-28 20:00:00	Orlando Magic	84	New Orleans Pelicans	...	0.304878	0.548780	NaN	NaN
2	2014-10-28 20:00:00	Dallas Mavericks	100	San Antonio Spurs	...	0.609756	0.670732	NaN	NaN
3	2014-10-29 19:30:00	Brooklyn Nets	105	Boston Celtics	...	0.463415	0.487805	NaN	NaN
...	...	...	...	...	...	...	...	...	...
1232	2015-04-15 20:00:00	San Antonio Spurs	103	New Orleans Pelicans	...	0.670732	0.548780	2	1
1233	2015-04-15 20:00:00	Detroit Pistons	112	New York Knicks	...	0.390244	0.207317	1	1
1234	2015-04-15 20:00:00	Miami Heat	105	Philadelphia 76ers	...	0.451220	0.219512	1	1
1235	2015-04-15 19:00:00	Charlotte Hornets	87	Toronto Raptors	...	0.402439	0.597561	1	-1

1236 rows × 10 columns

In [16]:

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1236 entries, 0 to 1235
Data columns (total 10 columns):
date             1230 non-null datetime64[ns]
away_team        1230 non-null object
away_points      1230 non-null float64
home_team        1230 non-null object
home_points      1230 non-null float64
home_win         1236 non-null bool
away_strength    1230 non-null float64
home_strength    1230 non-null float64
away_rest        1217 non-null float64
home_rest        1213 non-null float64
dtypes: bool(1), datetime64[ns](1), float64(6), object(2)
memory usage: 97.8+ KB

In [17]:

# Cast the boolean home_win flag to 0/1 integers before it is used as the
# response variable in the formula below.
df['home_win'] = df.home_win.astype(int)

In [18]:

# Logistic regression of the home-win indicator on team strength and rest.
f = 'home_win ~ home_strength + away_strength + home_rest + away_rest'
res = sm.Logit.from_formula(f, df).fit()

Optimization terminated successfully.
         Current function value: 0.564584
         Iterations 6

In [19]:

res.summary()

Out[19]:

Logit Regression Results
Dep. Variable:	home_win	No. Observations:	1213
Model:	Logit	Df Residuals:	1208
Method:	MLE	Df Model:	4
Date:	Wed, 11 Nov 2015	Pseudo R-squ.:	0.1727
Time:	08:33:12	Log-Likelihood:	-684.84
converged:	True	LL-Null:	-827.83
		LLR p-value:	1.148e-60

	coef	std err	z	P>\|z\|	[95.0% Conf. Int.]
Intercept	-0.3254	0.296	-1.098	0.272	-0.906 0.255
home_strength	5.5558	0.449	12.374	0.000	4.676 6.436
away_strength	-4.3365	0.442	-9.817	0.000	-5.202 -3.471
home_rest	0.1215	0.063	1.944	0.052	-0.001 0.244
away_rest	-0.0411	0.061	-0.669	0.503	-0.161 0.079

In [20]:

# Add the rest differential and the point spread as derived columns, then
# regress the spread on team strength and the rest differential.
df2 = df.assign(
    rest_difference=df.home_rest - df.away_rest,
    spread=df.home_points - df.away_points,
)

f = 'spread ~ home_strength + away_strength + rest_difference'
res = sm.OLS.from_formula(f, df2).fit()

In [21]:

res.summary()

Out[21]:

OLS Regression Results
Dep. Variable:	spread	R-squared:	0.236
Model:	OLS	Adj. R-squared:	0.234
Method:	Least Squares	F-statistic:	124.6
Date:	Wed, 11 Nov 2015	Prob (F-statistic):	2.57e-70
Time:	08:33:12	Log-Likelihood:	-4712.1
No. Observations:	1213	AIC:	9432.
Df Residuals:	1209	BIC:	9453.
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	-0.4035	1.546	-0.261	0.794	-3.437 2.630
home_strength	30.8045	2.097	14.687	0.000	26.689 34.919
away_strength	-25.6003	2.105	-12.159	0.000	-29.731 -21.470
rest_difference	0.5745	0.278	2.064	0.039	0.028 1.121

Omnibus:	9.335	Durbin-Watson:	1.997
Prob(Omnibus):	0.009	Jarque-Bera (JB):	11.716
Skew:	0.099	Prob(JB):	0.00286
Kurtosis:	3.438	Cond. No.	10.9

Numba & Cython¶

http://pandas.pydata.org/pandas-docs/stable/enhancingperf.html

In [22]:

from numba import jit
import cython
%load_ext cython

# Benchmark fixtures: a seeded random series and the centre-of-mass used by
# every exponentially weighted mean implementation below.
np.random.seed(1234)
# Spell the option key out in full; abbreviated keys are matched by pattern
# and can become ambiguous as pandas adds options.
pd.set_option('display.max_rows', 12)
# randn needs an integer length (a float triggers a DeprecationWarning), and
# only `pd` is imported, so the constructor is referenced through it.
s = pd.Series(np.random.randn(int(1e5)))
com = 20.0

/Users/jreback/miniconda/envs/python3/lib/python3.4/site-packages/ipykernel/__main__.py:7: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [23]:

def python(s):
    """Pure-Python adjusted exponentially weighted mean of ``s``.

    This is the deliberately slow baseline for the timings: it indexes the
    input and writes the output one element at a time. The decay is derived
    from the module-level ``com`` (centre of mass).

    Parameters
    ----------
    s : pandas.Series
        Input values; elements are read as ``s[0]``, ``s[1]``, ..., so the
        series is expected to carry the default integer index.

    Returns
    -------
    pandas.Series
        Float series of the same length holding the running weighted mean.
    """
    # Reference the constructor through `pd` (the only pandas name imported)
    # and pin the dtype so the result does not depend on pandas' default for
    # a data-less Series.
    output = pd.Series(index=range(len(s)), dtype='float64')

    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = s[0]
    output[0] = weighted_avg

    for i in range(1, len(s)):
        v = s[i]
        # Decay the accumulated weight, fold in the new observation, then
        # add the new observation's weight to the accumulated total.
        old_weight *= (1-alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg

    return output

In [24]:

%%cython
cimport cython
@cython.wraparound(False)
@cython.boundscheck(False)
def _cython(double[:] arr, double com, double[:] output):
    """Adjusted exponentially weighted mean of ``arr``, written into ``output``.

    Parameters
    ----------
    arr : double[:]
        Input values.
    com : double
        Centre of mass; the smoothing factor is ``1 / (1 + com)``.
    output : double[:]
        Pre-allocated buffer, at least as long as ``arr``, that receives
        the running weighted mean.

    Returns
    -------
    double[:]
        The ``output`` buffer that was passed in.

    Raises
    ------
    ValueError
        If ``output`` is shorter than ``arr``.
    """
    cdef:
        double alpha, old_weight, new_weight, weighted_avg, v
        Py_ssize_t i, n

    n = arr.shape[0]
    # Bounds checking is disabled for this function, so validate the sizes
    # up front instead of reading or writing past the end of a buffer.
    if output.shape[0] < n:
        raise ValueError("output must be at least as long as arr")
    if n == 0:
        return output

    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = arr[0]
    output[0] = weighted_avg

    for i in range(1, n):
        v = arr[i]
        old_weight *= (1-alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg

    return output

In [25]:

def cython1(s):
    """Run the compiled ``_cython`` kernel on ``s`` and wrap the result.

    The kernel receives the underlying array of ``s``, the module-level
    ``com`` and a freshly allocated float64 buffer of the same length.

    Returns
    -------
    pandas.Series
        The filled buffer as a Series with a default index.
    """
    output = np.empty(len(s), dtype='float64')
    _cython(s.values, com, output)
    # Only `pd` is imported, so the constructor is referenced through it.
    return pd.Series(output)

In [26]:

def cython2(s):
    """Adjusted exponentially weighted mean of ``s`` using pandas' built-in.

    Uses the module-level ``com`` as the centre of mass. Prefers the
    ``Series.ewm(...).mean()`` API and falls back to the older top-level
    ``pd.ewma`` function on pandas versions that do not provide ``ewm``.
    """
    if hasattr(s, 'ewm'):
        return s.ewm(com=com, adjust=True).mean()
    return pd.ewma(s, com=com, adjust=True)

In [27]:

@jit
def _numba(arr, output):
    """Fill ``output`` in place with the adjusted exponentially weighted mean of ``arr``.

    The smoothing factor comes from the module-level ``com``. Nothing is
    returned; the caller reads the result from ``output``.
    """
    alpha = 1. / (1. + com)
    decay = 1 - alpha
    weight_prev = 1.0
    weight_new = 1.0
    running = arr[0]
    output[0] = running

    n = arr.shape[0]
    for idx in range(1, n):
        # Decay the accumulated weight, blend in the next value, then add the
        # new observation's weight to the accumulated total.
        weight_prev *= decay
        numerator = (weight_prev * running) + (weight_new * arr[idx])
        running = numerator / (weight_prev + weight_new)
        weight_prev += weight_new
        output[idx] = running
    

def numba(s):
    """Run the jitted ``_numba`` kernel on ``s`` and wrap the result.

    The kernel receives the underlying array of ``s`` and a freshly
    allocated float64 buffer of the same length, which it fills in place.

    Returns
    -------
    pandas.Series
        The filled buffer as a Series with a default index.
    """
    output = np.empty(len(s), dtype='float64')
    _numba(s.values, output)
    # Only `pd` is imported, so the constructor is referenced through it.
    return pd.Series(output)

In [28]:

# Run every implementation once and confirm they all agree with the
# pure-Python baseline.
result1 = python(s)
result2 = cython1(s)
result3 = cython2(s)
result4 = numba(s)
all(result1.equals(other) for other in (result2, result3, result4))

Out[28]:

True

In [29]:

%timeit python(s)

1 loops, best of 3: 1.11 s per loop

In [30]:

%timeit cython1(s)

1000 loops, best of 3: 971 µs per loop

In [31]:

%timeit cython2(s)

100 loops, best of 3: 6.98 ms per loop

In [32]:

%timeit numba(s)

1000 loops, best of 3: 954 µs per loop

Dask¶

https://dask.readthedocs.org/en/latest/

In [33]:

import dask.dataframe as dd
from dask import threaded, multiprocessing

In [34]:

# Ten million rows of random keys and values, split into 8 dask partitions.
np.random.seed(1234)
N = int(1e7)
# Only `pd` is imported, so the constructor is referenced through it.
df = pd.DataFrame({'key' : np.random.randint(0,1000,size=N),
                   'value' : np.random.randn(N)})
ddf = dd.from_pandas(df, npartitions=8)
ddf

Out[34]:

dd.DataFrame<from_pandas-6bba1156f06742bbdaf58b9d400597eb, divisions=(0, 1250000, 2500000, ..., 8750000, 9999999)>

In [35]:

%timeit df.groupby('key').value.sum()

1 loops, best of 3: 258 ms per loop

In [36]:

# Same group-by sum on the dask frame, computed with dask's threaded `get`.
# NOTE(review): newer dask releases appear to select the scheduler with
# `scheduler='threads'` instead of `get=` — confirm against the installed version.
%timeit ddf.groupby('key').value.sum().compute(get=threaded.get)

10 loops, best of 3: 106 ms per loop