#!/usr/bin/env python
# coding: utf-8

# # Statistics and Machine Learning
#
# Some operations are especially useful for statistics
#
# - `get_dummies`
# - `from_dummies` ([someday](https://github.com/pydata/pandas/issues/8745)?)
# - Categoricals
# - `sample`

# In[1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10
get_ipython().run_line_magic('matplotlib', 'inline')


# It's quite common to have categorical data (in the statistical sense), which must be transformed before putting it into an algorithm. There are a couple of ways to handle this.
#
# ### Categoricals
#
# This basically creates a mapping between the categories and integers. This sometimes makes sense if you're representing something like responses to a survey where the responses are `bad`, `neutral` and `good`.

# In[2]:

np.random.seed(27)
s = pd.Series(np.random.choice(['bad', 'neutral', 'good'], size=40))
s


# In[3]:

np.random.seed(27)
s = pd.Series(pd.Categorical(np.random.choice(['bad', 'neutral', 'good'], size=40),
                             categories=['bad', 'neutral', 'good'], ordered=True))
s


# Categoricals can be put inside Series or DataFrames just like any other column. They have a special `.cat` namespace.

# In[4]:

s.cat.categories


# In[5]:

s.cat.codes


# In[6]:

df = pd.concat([pd.DataFrame(np.random.randn(40, 3), columns=list('abc')), s], axis=1)
df


# Categoricals are still quite new. Other packages are starting to support them (notably seaborn and patsy).
#
# There is also `get_dummies`.

# In[7]:

pd.get_dummies(s)


# In[8]:

kinds = [
    'A|B',
    'A|B|C',
    'C',
    'B|A',
    'A|B'
]
s = pd.Series(kinds)
s


# In[9]:

s.str.get_dummies(sep='|')


# # Integration with other libraries
#
# The two foundational libraries for stats in Python are statsmodels and scikit-learn.
# Everyone speaks the *lingua franca* of NumPy arrays, but there is some integration at the higher level of DataFrames.
#
# # Statsmodels
#
# Brief primer: you give an **estimator** (OLS, WLS, GLM) a **formula** and a **dataset**. You then fit that model. The integration with pandas here is fantastic.
#
# http://nbviewer.ipython.org/urls/umich.box.com/shared/static/zyl08wsmxwoh6ts70v4o.ipynb?

# In[5]:

import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_rows = 10


# We've got some longitudinal data (repeated measures of the same individual) on children with HIV. There are two treatment arms: each child either receives anti-retroviral therapy or is in the control group. Here's the data:

# In[6]:

df = pd.read_csv('data/cd4.csv', parse_dates=['VDATE'], index_col=['newpid', 'VISIT'])
df = df.dropna()  # the estimator can't handle NaNs, so drop them for now.
df.head()


# Some basic statistics:

# In[19]:

df.groupby(['arv', 'treatmnt']).CD4PCT.count().unstack()


# In[20]:

df.groupby('arv').CD4CNT.mean()


# In[21]:

sns.factorplot(x='arv', y='CD4CNT', col='treatmnt', data=df)


# In[22]:

# Age at each visit = age at baseline + years elapsed since that child's first visit.
years_since = df.groupby(level='newpid').VDATE.apply(lambda x: (x - x.min()).dt.days / 365)
df['age'] = df.baseage + years_since


# In[23]:

sns.lmplot("age", "CD4PCT", data=df, hue="arv")


# These statistics are throwing away information: they don't tie one observation of an individual to the next observation *of the same individual*. We'd expect some people to consistently have higher or lower `CD4PCT` than others. We'll use a GEE (generalized estimating equations) model, with the age at the time of each observation (computed above) as a covariate.
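
# As a quick check on that intuition (an editorial sketch, not part of the original analysis; it only uses the `df` built above), look at how much the per-child mean `CD4PCT` varies from child to child:

# In[ ]:

# Mean CD4PCT per child. A wide spread across children points to a persistent
# subject-level effect that the pooled plots above can't capture.
subject_means = df.groupby(level='newpid').CD4PCT.mean()
subject_means.describe()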
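
# One mechanical detail worth flagging (again an editorial aside): the grouping variable `newpid` lives in the MultiIndex, while `GEE.from_formula` expects the group labels to be available as an ordinary column in the data, which is presumably why the calls below pass `df.reset_index()`:

# In[ ]:

# `newpid` is an index level, not a column, until the index is reset.
'newpid' in df.columns, 'newpid' in df.reset_index().columns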
# In[24]:

mod_basic = sm.GEE.from_formula("CD4PCT ~ age + arv + treatmnt", "newpid", df.reset_index())
res_basic = mod_basic.fit()
res_basic.summary()


# In[25]:

# Same model, but with an exchangeable within-subject correlation structure.
ex = sm.cov_struct.Exchangeable()
mod_exchangeable = sm.GEE.from_formula("CD4PCT ~ age + arv + treatmnt", "newpid",
                                       df.reset_index(), cov_struct=ex)
res_exchangeable = mod_exchangeable.fit()
res_exchangeable.summary()


# In[26]:

# Fitted values vs. residuals for the two covariance structures, side by side.
fig, axes = plt.subplots(figsize=(15, 5), ncols=2, sharey=True)
axes[0].scatter(res_basic.fittedvalues, res_basic.resid)
axes[1].scatter(res_exchangeable.fittedvalues, res_exchangeable.resid)


# # Scikit-Learn
#
# Scikit-Learn's algorithms all deal with NumPy arrays. My workflow is typically
#
# - data munging in pandas
# - pass a NumPy array to an Estimator
# - wrap the result in a DataFrame or Series

# In[9]:

from sklearn.datasets import california_housing

data = california_housing.fetch_california_housing()


# In[10]:

X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)


# In[11]:

X.head()


# In[12]:

y.head()


# In[13]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV


# In[14]:

get_ipython().run_cell_magic('time', '', 'param_grid = dict(\n    max_features=np.arange(2, 8),\n    max_depth=[2, 4],\n    min_samples_split=[5, 10, 15, 20],\n)\nrfc = RandomForestRegressor(n_estimators=10)\ngs = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1)\ngs.fit(X.values, y.values)\n')


# In[15]:

scores = gs.grid_scores_
scores[:10]


# In[16]:

def unpack_grid_scores(scores):
    """Flatten GridSearchCV.grid_scores_ into a tidy DataFrame."""
    rows = []
    params = sorted(scores[0].parameters)
    for row in scores:
        mean = row.mean_validation_score
        std = row.cv_validation_scores.std()
        rows.append([mean, std] + [row.parameters[k] for k in params])
    return pd.DataFrame(rows, columns=['mean_', 'std_'] + params)


# In[17]:

scores = unpack_grid_scores(gs.grid_scores_)
scores.head()


# In[18]:

sns.factorplot(x='max_features', y='mean_', hue='max_depth', col='min_samples_split', data=scores)


# In[37]:

pd.Series(gs.best_estimator_.feature_importances_, index=X.columns).order().plot(kind='barh')
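
# The last step of that workflow, wrapping an estimator's output back in pandas, looks something like this. (An editorial sketch, not part of the original notebook; it reuses the `gs`, `X`, and `y` objects from above. `GridSearchCV` refits the best estimator on the full data by default, so it can predict directly.)

# In[ ]:

# Predictions come back as a bare NumPy array; re-attach the original index.
pred = pd.Series(gs.best_estimator_.predict(X.values), index=X.index, name='predicted')
pd.concat([y.rename('actual'), pred], axis=1).head()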
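
# A version note (an editorial aside, not from the original notebook): on scikit-learn 0.18 and later, the import is `from sklearn.model_selection import GridSearchCV`, and `grid_scores_` is replaced by `cv_results_`, a dict of arrays that drops straight into a DataFrame, so no unpacking helper is needed. Similarly, `Series.order()` became `Series.sort_values()` in later pandas releases.

# In[ ]:

# Only applies when GridSearchCV comes from sklearn.model_selection.
if hasattr(gs, 'cv_results_'):
    results = pd.DataFrame(gs.cv_results_)
    results[['mean_test_score', 'std_test_score']].head()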