We oftentimes want to interact with other libraries and have them handle pandas objects.
from IPython.display import Image
Image(filename='data/pydata-ecosystem.png',height=1024,width=1024)
We are going to look at some interactions with:
scikit-learn
statsmodels
numba & cython
dask
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_rows = 8
pd.options.display.max_columns = 8
Scikit-Learn's algorithms all deal with NumPy arrays. Typically:
import sklearn
sklearn.__version__
'0.17'
# Import the fetcher from the public ``sklearn.datasets`` namespace instead of
# reaching through the ``california_housing`` submodule, which is an
# implementation detail and is not importable in later scikit-learn releases.
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset (features, target and feature names).
data = fetch_california_housing()
# Wrap the raw feature matrix in a DataFrame labelled with the feature names.
X = pd.DataFrame(data.data, columns=data.feature_names)
X
MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|
0 | 8.3252 | 41 | 6.984127 | 1.023810 | 322 | 2.555556 | 37.88 | -122.23 |
1 | 8.3014 | 21 | 6.238137 | 0.971880 | 2401 | 2.109842 | 37.86 | -122.22 |
2 | 7.2574 | 52 | 8.288136 | 1.073446 | 496 | 2.802260 | 37.85 | -122.24 |
3 | 5.6431 | 52 | 5.817352 | 1.073059 | 558 | 2.547945 | 37.85 | -122.25 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
20636 | 2.5568 | 18 | 6.114035 | 1.315789 | 356 | 3.122807 | 39.49 | -121.21 |
20637 | 1.7000 | 17 | 5.205543 | 1.120092 | 1007 | 2.325635 | 39.43 | -121.22 |
20638 | 1.8672 | 18 | 5.329513 | 1.171920 | 741 | 2.123209 | 39.43 | -121.32 |
20639 | 2.3886 | 16 | 5.254717 | 1.162264 | 1387 | 2.616981 | 39.37 | -121.24 |
20640 rows × 8 columns
y = pd.Series(data.target)
y
0 4.526 1 3.585 2 3.521 3 3.413 ... 20636 0.771 20637 0.923 20638 0.847 20639 0.894 dtype: float64
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
%%time
# Hyper-parameter grid handed to the cross-validated search below.
param_grid = {
    'max_features': np.arange(2, 8),
    'max_depth': [2, 4],
    'min_samples_split': [5, 10, 15, 20],
}
rfc = RandomForestRegressor(n_estimators=10)
gs = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1)
# The estimator is fed the underlying numpy arrays, not the pandas objects.
gs.fit(X.values, y.values)
CPU times: user 835 ms, sys: 142 ms, total: 977 ms Wall time: 6.91 s
scores = gs.grid_scores_
scores[:10]
[mean: 0.20146, std: 0.05887, params: {'max_depth': 2, 'min_samples_split': 5, 'max_features': 2}, mean: 0.25245, std: 0.07831, params: {'max_depth': 2, 'min_samples_split': 10, 'max_features': 2}, mean: 0.31007, std: 0.05206, params: {'max_depth': 2, 'min_samples_split': 15, 'max_features': 2}, mean: 0.25166, std: 0.04992, params: {'max_depth': 2, 'min_samples_split': 20, 'max_features': 2}, mean: 0.28584, std: 0.06496, params: {'max_depth': 2, 'min_samples_split': 5, 'max_features': 3}, mean: 0.33954, std: 0.08037, params: {'max_depth': 2, 'min_samples_split': 10, 'max_features': 3}, mean: 0.26840, std: 0.05488, params: {'max_depth': 2, 'min_samples_split': 15, 'max_features': 3}, mean: 0.25677, std: 0.08151, params: {'max_depth': 2, 'min_samples_split': 20, 'max_features': 3}, mean: 0.36317, std: 0.09080, params: {'max_depth': 2, 'min_samples_split': 5, 'max_features': 4}, mean: 0.35095, std: 0.09386, params: {'max_depth': 2, 'min_samples_split': 10, 'max_features': 4}]
def unpack_grid_scores(scores):
    """Flatten grid-search score records into a DataFrame.

    Parameters
    ----------
    scores : sequence
        Records exposing ``parameters`` (a dict), ``mean_validation_score``
        and ``cv_validation_scores`` (an object with a ``.std()`` method).

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``mean_``, ``std_`` and then one
        column per parameter name (sorted, taken from the first record).
        An empty ``scores`` yields an empty frame with only ``mean_`` and
        ``std_`` columns instead of raising ``IndexError``.
    """
    base_columns = ['mean_', 'std_']
    if len(scores) == 0:
        return pd.DataFrame(columns=base_columns)

    # The first record decides which parameter columns exist and their order.
    params = sorted(scores[0].parameters)
    rows = [
        [row.mean_validation_score, row.cv_validation_scores.std()]
        + [row.parameters[k] for k in params]
        for row in scores
    ]
    return pd.DataFrame(rows, columns=base_columns + params)
scores = unpack_grid_scores(gs.grid_scores_)
scores
mean_ | std_ | max_depth | max_features | min_samples_split | |
---|---|---|---|---|---|
0 | 0.201463 | 0.058873 | 2 | 2 | 5 |
1 | 0.252446 | 0.078307 | 2 | 2 | 10 |
2 | 0.310067 | 0.052059 | 2 | 2 | 15 |
3 | 0.251659 | 0.049922 | 2 | 2 | 20 |
... | ... | ... | ... | ... | ... |
44 | 0.535333 | 0.076316 | 4 | 7 | 5 |
45 | 0.536545 | 0.064577 | 4 | 7 | 10 |
46 | 0.529594 | 0.075195 | 4 | 7 | 15 |
47 | 0.536525 | 0.064559 | 4 | 7 | 20 |
48 rows × 5 columns
# Mean CV score against ``max_features``: one panel per
# ``min_samples_split`` value, one hue per ``max_depth``.
sns.factorplot(
    data=scores,
    x='max_features',
    y='mean_',
    hue='max_depth',
    col='min_samples_split',
)
/Users/jreback/miniconda/envs/python3/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison if self._edgecolors == str('face'):
<seaborn.axisgrid.FacetGrid at 0x10cd4c668>
s = pd.Series(gs.best_estimator_.feature_importances_,index=X.columns)
(s.sort_values()
.plot
.barh(figsize=(5,8))
)
<matplotlib.axes._subplots.AxesSubplot at 0x10d0813c8>
import statsmodels
import statsmodels.api as sm
statsmodels.__version__
'0.6.1'
# created in 4. Tidy Data
df = pd.read_hdf('data/games.hdf','df')
df
date | away_team | away_points | home_team | ... | away_strength | home_strength | away_rest | home_rest | |
---|---|---|---|---|---|---|---|---|---|
game_id | |||||||||
0 | 2014-10-28 22:30:00 | Houston Rockets | 108 | Los Angeles Lakers | ... | 0.682927 | 0.256098 | NaN | NaN |
1 | 2014-10-28 20:00:00 | Orlando Magic | 84 | New Orleans Pelicans | ... | 0.304878 | 0.548780 | NaN | NaN |
2 | 2014-10-28 20:00:00 | Dallas Mavericks | 100 | San Antonio Spurs | ... | 0.609756 | 0.670732 | NaN | NaN |
3 | 2014-10-29 19:30:00 | Brooklyn Nets | 105 | Boston Celtics | ... | 0.463415 | 0.487805 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1232 | 2015-04-15 20:00:00 | San Antonio Spurs | 103 | New Orleans Pelicans | ... | 0.670732 | 0.548780 | 2 | 1 |
1233 | 2015-04-15 20:00:00 | Detroit Pistons | 112 | New York Knicks | ... | 0.390244 | 0.207317 | 1 | 1 |
1234 | 2015-04-15 20:00:00 | Miami Heat | 105 | Philadelphia 76ers | ... | 0.451220 | 0.219512 | 1 | 1 |
1235 | 2015-04-15 19:00:00 | Charlotte Hornets | 87 | Toronto Raptors | ... | 0.402439 | 0.597561 | 1 | -1 |
1236 rows × 10 columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1236 entries, 0 to 1235 Data columns (total 10 columns): date 1230 non-null datetime64[ns] away_team 1230 non-null object away_points 1230 non-null float64 home_team 1230 non-null object home_points 1230 non-null float64 home_win 1236 non-null bool away_strength 1230 non-null float64 home_strength 1230 non-null float64 away_rest 1217 non-null float64 home_rest 1213 non-null float64 dtypes: bool(1), datetime64[ns](1), float64(6), object(2) memory usage: 97.8+ KB
# Recode the boolean outcome column as integers before fitting.
df['home_win'] = df['home_win'].astype(int)
# Formula: home win explained by both teams' strength and rest.
f = 'home_win ~ home_strength + away_strength + home_rest + away_rest'
res = sm.Logit.from_formula(f, df).fit()
Optimization terminated successfully. Current function value: 0.564584 Iterations 6
res.summary()
Dep. Variable: | home_win | No. Observations: | 1213 |
---|---|---|---|
Model: | Logit | Df Residuals: | 1208 |
Method: | MLE | Df Model: | 4 |
Date: | Wed, 11 Nov 2015 | Pseudo R-squ.: | 0.1727 |
Time: | 08:33:12 | Log-Likelihood: | -684.84 |
converged: | True | LL-Null: | -827.83 |
LLR p-value: | 1.148e-60 |
coef | std err | z | P>|z| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -0.3254 | 0.296 | -1.098 | 0.272 | -0.906 0.255 |
home_strength | 5.5558 | 0.449 | 12.374 | 0.000 | 4.676 6.436 |
away_strength | -4.3365 | 0.442 | -9.817 | 0.000 | -5.202 -3.471 |
home_rest | 0.1215 | 0.063 | 1.944 | 0.052 | -0.001 0.244 |
away_rest | -0.0411 | 0.061 | -0.669 | 0.503 | -0.161 0.079 |
df2 = df.assign(rest_difference=df.home_rest - df.away_rest,
spread=df.home_points - df.away_points)
f = 'spread ~ home_strength + away_strength + rest_difference'
res = (sm
.OLS
.from_formula(f, df2)
.fit()
)
res.summary()
Dep. Variable: | spread | R-squared: | 0.236 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.234 |
Method: | Least Squares | F-statistic: | 124.6 |
Date: | Wed, 11 Nov 2015 | Prob (F-statistic): | 2.57e-70 |
Time: | 08:33:12 | Log-Likelihood: | -4712.1 |
No. Observations: | 1213 | AIC: | 9432. |
Df Residuals: | 1209 | BIC: | 9453. |
Df Model: | 3 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
Intercept | -0.4035 | 1.546 | -0.261 | 0.794 | -3.437 2.630 |
home_strength | 30.8045 | 2.097 | 14.687 | 0.000 | 26.689 34.919 |
away_strength | -25.6003 | 2.105 | -12.159 | 0.000 | -29.731 -21.470 |
rest_difference | 0.5745 | 0.278 | 2.064 | 0.039 | 0.028 1.121 |
Omnibus: | 9.335 | Durbin-Watson: | 1.997 |
---|---|---|---|
Prob(Omnibus): | 0.009 | Jarque-Bera (JB): | 11.716 |
Skew: | 0.099 | Prob(JB): | 0.00286 |
Kurtosis: | 3.438 | Cond. No. | 10.9 |
from numba import jit
import cython
%load_ext cython
# Seed numpy's global RNG so the benchmark input is reproducible.
np.random.seed(1234)
# Spell out the full option key rather than relying on abbreviation matching.
pd.set_option('display.max_rows', 12)
# ``randn`` needs an integer size (a float such as 1e5 triggers the
# DeprecationWarning about non-integer numbers), and only ``pd`` is
# imported, so the constructor must be qualified as ``pd.Series``.
s = pd.Series(np.random.randn(int(1e5)))
# Centre of mass of the exponential weighting; the implementations below
# derive alpha = 1 / (1 + com) from it.
com = 20.0
/Users/jreback/miniconda/envs/python3/lib/python3.4/site-packages/ipykernel/__main__.py:7: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
def python(s):
    """Pure-Python reference implementation of the adjusted EWMA.

    Uses the module-level ``com`` (centre of mass) to derive
    ``alpha = 1 / (1 + com)`` and walks the values one at a time.

    Parameters
    ----------
    s : pandas.Series
        Input values; they are read positionally, so the index labels of
        ``s`` do not matter.

    Returns
    -------
    pandas.Series
        float64 Series indexed ``0 .. len(s) - 1`` holding the running
        weighted average. An empty input yields an empty Series.
    """
    n = len(s)
    # An explicit dtype keeps the result float64 instead of whatever default
    # pandas picks for a data-less Series.
    output = pd.Series(index=range(n), dtype='float64')
    if n == 0:
        return output

    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    # Positional access mirrors the array-based implementations, which
    # operate on ``s.values``.
    weighted_avg = s.iloc[0]
    output.iloc[0] = weighted_avg
    for i in range(1, n):
        v = s.iloc[i]
        # Decay the accumulated weight before folding in the new value.
        old_weight *= (1 - alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output.iloc[i] = weighted_avg
    return output
%%cython
cimport cython
@cython.wraparound(False)
@cython.boundscheck(False)
def _cython(double[:] arr, double com, double[:] output):
    """Adjusted EWMA over a typed memoryview, written into ``output``.

    Parameters
    ----------
    arr : double[:]
        Input values.
    com : double
        Centre of mass; ``alpha = 1 / (1 + com)``.
    output : double[:]
        Pre-allocated buffer, at least as long as ``arr``, filled in place.

    Returns
    -------
    The ``output`` memoryview that was passed in.

    Raises
    ------
    ValueError
        If ``output`` is shorter than ``arr``.
    """
    cdef:
        double alpha, old_weight, new_weight, weighted_avg, v
        # Py_ssize_t matches the type of memoryview extents; a C int could
        # overflow on very large buffers.
        Py_ssize_t i, n = arr.shape[0]

    # Bounds checking is switched off above, so out-of-range accesses would
    # touch memory outside the buffers; validate the sizes up front instead.
    if output.shape[0] < n:
        raise ValueError("output must be at least as long as arr")
    if n == 0:
        return output

    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = arr[0]
    output[0] = weighted_avg
    for i in range(1, n):
        v = arr[i]
        # Decay the accumulated weight before folding in the new value.
        old_weight *= (1 - alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg
    return output
def cython1(s):
    """Run the Cython EWMA kernel ``_cython`` on a Series.

    Allocates a float64 result buffer of ``len(s)``, lets ``_cython`` fill
    it from ``s.values`` using the module-level ``com``, and wraps the
    buffer in a new Series with a default index.
    """
    output = np.empty(len(s), dtype='float64')
    _cython(s.values, com, output)
    # Only ``pd`` is imported in this notebook, so qualify the constructor.
    return pd.Series(output)
def cython2(s):
    """Adjusted EWMA of ``s`` using pandas' built-in implementation.

    Uses the module-level ``com`` as the centre of mass. Prefers the
    ``Series.ewm(...).mean()`` API and falls back to the older top-level
    ``pd.ewma`` function on pandas versions without ``.ewm``.
    """
    if hasattr(s, 'ewm'):
        return s.ewm(com=com, adjust=True).mean()
    return pd.ewma(s, com=com, adjust=True)
# nopython=True makes compilation fail loudly instead of silently falling
# back to the slow object mode on Numba versions that allow the fallback.
@jit(nopython=True)
def _numba(arr, output):
    """Adjusted EWMA kernel compiled with Numba; fills ``output`` in place.

    Parameters
    ----------
    arr : array
        Input values; ``arr.shape[0]`` elements are processed.
    output : array
        Pre-allocated buffer of at least the same length, written in place.

    NOTE(review): ``com`` is read from module scope; Numba is documented to
    treat globals as compile-time constants, so rebinding ``com`` after the
    first call presumably has no effect here -- confirm.
    """
    n = arr.shape[0]
    # Nothing to do for empty input; indexing element 0 would be invalid.
    if n == 0:
        return
    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = arr[0]
    output[0] = weighted_avg
    for i in range(1, n):
        v = arr[i]
        # Decay the accumulated weight before folding in the new value.
        old_weight *= (1 - alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg
def numba(s):
    """Run the Numba EWMA kernel ``_numba`` on a Series.

    Allocates a float64 result buffer of ``len(s)``, lets ``_numba`` fill
    it from ``s.values``, and wraps the buffer in a new Series with a
    default index.
    """
    output = np.empty(len(s), dtype='float64')
    _numba(s.values, output)
    # Only ``pd`` is imported in this notebook, so qualify the constructor.
    return pd.Series(output)
# Run every implementation on the same input ...
result1 = python(s)
result2 = cython1(s)
result3 = cython2(s)
result4 = numba(s)
# ... and check that each one matches the pure-Python reference exactly.
all(result1.equals(other) for other in (result2, result3, result4))
True
%timeit python(s)
1 loops, best of 3: 1.11 s per loop
%timeit cython1(s)
1000 loops, best of 3: 971 µs per loop
%timeit cython2(s)
100 loops, best of 3: 6.98 ms per loop
%timeit numba(s)
1000 loops, best of 3: 954 µs per loop
import dask.dataframe as dd
from dask import threaded, multiprocessing
# Seed numpy's global RNG so the generated frame is reproducible.
np.random.seed(1234)
N = int(1e7)
# Only ``pd`` is imported in this notebook, so qualify ``DataFrame``.
# Note that this rebinds ``df``, which earlier held the games table.
df = pd.DataFrame({'key': np.random.randint(0, 1000, size=N),
                   'value': np.random.randn(N)})
# Split the in-memory frame into 8 partitions for dask.
ddf = dd.from_pandas(df, npartitions=8)
ddf
dd.DataFrame<from_pandas-6bba1156f06742bbdaf58b9d400597eb, divisions=(0, 1250000, 2500000, ..., 8750000, 9999999)>
%timeit df.groupby('key').value.sum()
1 loops, best of 3: 258 ms per loop
%timeit ddf.groupby('key').value.sum().compute(get=threaded.get)
10 loops, best of 3: 106 ms per loop