#!/usr/bin/env python
# coding: utf-8

# # Compute
#
# We often want to interact with other libraries and have them handle pandas objects.

# In[1]:

from IPython.display import Image
Image(filename='data/pydata-ecosystem.png', height=1024, width=1024)

# We are going to look at some interactions with:
#
# - ``scikit-learn``
# - ``statsmodels``
# - ``numba`` & ``cython``
# - ``dask``

# In[2]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_rows = 8
pd.options.display.max_columns = 8

# # Scikit-Learn
#
# http://scikit-learn.org/stable/documentation.html

# Scikit-learn's algorithms all deal with NumPy arrays. The typical workflow is:
#
# - do the data munging in pandas
# - pass a NumPy array to an Estimator
# - wrap the result in a DataFrame or Series

# In[3]:

import sklearn
sklearn.__version__

# In[4]:

from sklearn.datasets import california_housing
data = california_housing.fetch_california_housing()

# In[5]:

X = pd.DataFrame(data.data, columns=data.feature_names)
X

# In[6]:

y = pd.Series(data.target)
y

# In[7]:

from sklearn.ensemble import RandomForestRegressor
# note: in scikit-learn >= 0.18 GridSearchCV lives in sklearn.model_selection
from sklearn.grid_search import GridSearchCV

# In[8]:

get_ipython().run_cell_magic('time', '', 'param_grid = dict(\n    max_features=np.arange(2, 8),\n    max_depth=[2, 4],\n    min_samples_split=[5, 10, 15, 20],\n)\nrfc = RandomForestRegressor(n_estimators=10)\ngs = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1)\ngs.fit(X.values, y.values)\n')

# In[9]:

scores = gs.grid_scores_
scores[:10]

# In[10]:

def unpack_grid_scores(scores):
    """Unpack a list of grid-search results into a tidy DataFrame
    with one row per parameter combination."""
    rows = []
    params = sorted(scores[0].parameters)
    for row in scores:
        mean = row.mean_validation_score
        std = row.cv_validation_scores.std()
        rows.append([mean, std] + [row.parameters[k] for k in params])
    return pd.DataFrame(rows, columns=['mean_', 'std_'] + params)

# In[11]:

scores = unpack_grid_scores(gs.grid_scores_)
scores

# In[12]:

(scores
 .pipe((sns.factorplot, 'data'), x='max_features', y='mean_',
       hue='max_depth', col='min_samples_split')
)

# In[13]:

s = pd.Series(gs.best_estimator_.feature_importances_, index=X.columns)
(s.sort_values()
  .plot
  .barh(figsize=(5, 8))
)

# # Statsmodels
#
# http://statsmodels.sourceforge.net/

# In[14]:

import statsmodels
import statsmodels.api as sm
statsmodels.__version__

# In[15]:

# created in 4. Tidy Data
df = pd.read_hdf('data/games.hdf', 'df')
df

# In[16]:

df.info()

# In[17]:

df['home_win'] = df.home_win.astype(int)

# In[18]:

f = 'home_win ~ home_strength + away_strength + home_rest + away_rest'
res = (sm
       .Logit
       .from_formula(f, df)
       .fit()
      )

# In[19]:

res.summary()

# In[20]:

df2 = df.assign(rest_difference=df.home_rest - df.away_rest,
                spread=df.home_points - df.away_points)
f = 'spread ~ home_strength + away_strength + rest_difference'
res = (sm
       .OLS
       .from_formula(f, df2)
       .fit()
      )

# In[21]:

res.summary()

# # Numba & Cython
#
# http://pandas.pydata.org/pandas-docs/stable/enhancingperf.html

# In[22]:

from numba import jit
import cython

get_ipython().run_line_magic('load_ext', 'cython')

np.random.seed(1234)
pd.set_option('display.max_rows', 12)
s = pd.Series(np.random.randn(int(1e5)))  # randn needs an integer length
com = 20.0

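# All four implementations below compute the same *adjusted* exponentially
# weighted moving average, with smoothing factor $\alpha = 1 / (1 + com)$:
#
# $$y_t = \frac{x_t + (1 - \alpha)\, x_{t-1} + \cdots + (1 - \alpha)^t\, x_0}
#              {1 + (1 - \alpha) + \cdots + (1 - \alpha)^t}$$
#
# In the loops, ``old_weight`` carries the running sum of decayed weights
# (the denominator so far) and ``new_weight`` is the weight of the incoming
# observation, so the denominator never has to be recomputed from scratch.
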
# In[23]:

def python(s):
    output = pd.Series(index=range(len(s)), dtype='float64')
    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = s[0]
    output[0] = weighted_avg

    for i in range(1, len(s)):
        v = s[i]
        old_weight *= (1 - alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg

    return output

# In[24]:

get_ipython().run_cell_magic('cython', '', 'cimport cython\n\n@cython.wraparound(False)\n@cython.boundscheck(False)\ndef _cython(double[:] arr, double com, double[:] output):\n    cdef:\n        double alpha, old_weight, new_weight, weighted_avg, v\n        int i\n\n    alpha = 1. / (1. + com)\n    old_weight = 1.0\n    new_weight = 1.0\n    weighted_avg = arr[0]\n    output[0] = weighted_avg\n\n    for i in range(1, arr.shape[0]):\n        v = arr[i]\n        old_weight *= (1 - alpha)\n        weighted_avg = ((old_weight * weighted_avg) +\n                        (new_weight * v)) / (old_weight + new_weight)\n        old_weight += new_weight\n        output[i] = weighted_avg\n\n    return output\n')

# In[25]:

def cython1(s):
    output = np.empty(len(s), dtype='float64')
    _cython(s.values, com, output)
    return pd.Series(output)

# In[26]:

def cython2(s):
    # pandas' own (Cython-backed) implementation; pd.ewma was deprecated in
    # pandas 0.18 in favour of s.ewm(com=com, adjust=True).mean()
    return pd.ewma(s, com=com, adjust=True)

# In[27]:

@jit
def _numba(arr, output):
    alpha = 1. / (1. + com)
    old_weight = 1.0
    new_weight = 1.0
    weighted_avg = arr[0]
    output[0] = weighted_avg

    for i in range(1, arr.shape[0]):
        v = arr[i]
        old_weight *= (1 - alpha)
        weighted_avg = ((old_weight * weighted_avg) +
                        (new_weight * v)) / (old_weight + new_weight)
        old_weight += new_weight
        output[i] = weighted_avg

def numba(s):
    output = np.empty(len(s), dtype='float64')
    _numba(s.values, output)
    return pd.Series(output)

# In[28]:

result1 = python(s)
result2 = cython1(s)
result3 = cython2(s)
result4 = numba(s)

result1.equals(result2) and result1.equals(result3) and result1.equals(result4)

# In[29]:

get_ipython().run_line_magic('timeit', 'python(s)')

# In[30]:

get_ipython().run_line_magic('timeit', 'cython1(s)')

# In[31]:

get_ipython().run_line_magic('timeit', 'cython2(s)')

# In[32]:

get_ipython().run_line_magic('timeit', 'numba(s)')

# # Dask
#
# https://dask.readthedocs.org/en/latest/

# In[33]:

import dask.dataframe as dd
from dask import threaded, multiprocessing

# In[34]:

np.random.seed(1234)
N = int(1e7)
df = pd.DataFrame({'key': np.random.randint(0, 1000, size=N),
                   'value': np.random.randn(N)})
ddf = dd.from_pandas(df, npartitions=8)
ddf

# In[35]:

get_ipython().run_line_magic('timeit', "df.groupby('key').value.sum()")

# In[36]:

get_ipython().run_line_magic('timeit', "ddf.groupby('key').value.sum().compute(get=threaded.get)")
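# ``multiprocessing`` was imported above but never used. As a sketch, the same
# aggregation can also be dispatched to dask's process-based scheduler for
# comparison (in newer dask releases the ``get=`` keyword has been replaced by
# ``scheduler='threads'`` / ``scheduler='processes'``). For a groupby like this,
# expect the process pool to be slower than the thread pool, since intermediate
# results have to be serialized between processes.

# In[37]:

get_ipython().run_line_magic('timeit', "ddf.groupby('key').value.sum().compute(get=multiprocessing.get)")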