#!/usr/bin/env python
# coding: utf-8

# # Statistics and Machine Learning
#
# Some operations are especially useful for statistics
#
# - `get_dummies`
# - `from_dummies` ([someday](https://github.com/pydata/pandas/issues/8745)?)
# - Categoricals
# - `sample`

# In[1]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_rows = 10
get_ipython().run_line_magic('matplotlib', 'inline')


# It's quite common to have categorical data (in the statistical sense), which must be transformed before putting it into an algorithm. There are a couple of ways to handle this.
#
# ### Categoricals
#
# This basically creates a mapping between the categories and integers. This sometimes makes sense if you're representing something like responses to a survey where the responses are `bad`, `neutral` and `good`.

# In[2]:

np.random.seed(27)
s = pd.Series(np.random.choice(['bad', 'neutral', 'good'], size=40))
s


# In[3]:

np.random.seed(27)
s = pd.Series(pd.Categorical(np.random.choice(['bad', 'neutral', 'good'], size=40),
                             categories=['bad', 'neutral', 'good'], ordered=True))
s


# Categoricals can be put inside Series or DataFrames just like any other column. They have a special `.cat` namespace.

# In[4]:

s.cat.categories


# In[5]:

s.cat.codes


# In[6]:

df = pd.concat([pd.DataFrame(np.random.randn(40, 3), columns=list('abc')), s], axis=1)
df


# Categoricals are still quite new. Other packages are starting to support them (notably seaborn and patsy).
#
# There is also `get_dummies`.

# In[7]:

pd.get_dummies(s)


# In[8]:

kinds = [
    'A|B',
    'A|B|C',
    'C',
    'B|A',
    'A|B'
]
s = pd.Series(kinds)
s


# In[9]:

s.str.get_dummies(sep='|')


# # Integration with other libraries
#
# The two foundational libraries for stats in Python are statsmodels and scikit-learn.
# Everyone speaks the *lingua franca* of NumPy arrays, but there is some integration at the higher level of DataFrames.
#
# # Statsmodels
#
# Brief primer: you give an **estimator** (OLS, WLS, GLM) a **formula** and a **dataset**. You then fit that model. The integration with pandas here is fantastic.
#
# http://nbviewer.ipython.org/urls/umich.box.com/shared/static/zyl08wsmxwoh6ts70v4o.ipynb?

# In[5]:

import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
pd.options.display.max_rows = 10


# We've got some longitudinal data (repeated measures of the same individual) on children with HIV. There are two treatment arms: each child either receives anti-retroviral therapy or is in the control group. Here's the data:

# In[6]:

df = pd.read_csv('data/cd4.csv', parse_dates=['VDATE'], index_col=['newpid', 'VISIT'])
df = df.dropna()  # the estimator can't handle NaNs, so drop them for now.
df.head()


# Some basic statistics:

# In[19]:

df.groupby(['arv', 'treatmnt']).CD4PCT.count().unstack()


# In[20]:

df.groupby('arv').CD4CNT.mean()


# In[21]:

sns.factorplot(x='arv', y='CD4CNT', col='treatmnt', data=df)


# In[22]:

# Age at each visit = age at baseline + years elapsed since that child's first visit.
years_since = df.groupby(level='newpid').VDATE.apply(lambda x: (x - x.min()).dt.days / 365)
df['age'] = df.baseage + years_since


# In[23]:

sns.lmplot("age", "CD4PCT", data=df, hue="arv")


# These statistics are throwing away information: they don't tie one observation of an individual to the next observation *of the same individual*. We'd expect some people to consistently have higher or lower `CD4PCT` than others. We'll use a GEE (generalized estimating equations) model, with the age at the time of each observation (computed above) as a covariate.
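
# As a quick check on that intuition (an editorial sketch, not part of the original analysis; it only uses the `df` built above), look at how much the per-child mean `CD4PCT` varies from child to child:

# In[ ]:

# Mean CD4PCT per child. A wide spread across children points to a persistent
# subject-level effect that the pooled plots above can't capture.
subject_means = df.groupby(level='newpid').CD4PCT.mean()
subject_means.describe()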
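
# One mechanical detail worth flagging (again an editorial aside): the grouping variable `newpid` lives in the MultiIndex, while `GEE.from_formula` expects the group labels to be available as an ordinary column in the data, which is presumably why the calls below pass `df.reset_index()`:

# In[ ]:

# `newpid` is an index level, not a column, until the index is reset.
'newpid' in df.columns, 'newpid' in df.reset_index().columns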
# In[24]:

mod_basic = sm.GEE.from_formula("CD4PCT ~ age + arv + treatmnt", "newpid", df.reset_index())
res_basic = mod_basic.fit()
res_basic.summary()


# In[25]:

# Same model, but with an exchangeable within-subject correlation structure.
ex = sm.cov_struct.Exchangeable()
mod_exchangeable = sm.GEE.from_formula("CD4PCT ~ age + arv + treatmnt", "newpid",
                                       df.reset_index(), cov_struct=ex)
res_exchangeable = mod_exchangeable.fit()
res_exchangeable.summary()


# In[26]:

# Fitted values vs. residuals for the two covariance structures, side by side.
fig, axes = plt.subplots(figsize=(15, 5), ncols=2, sharey=True)
axes[0].scatter(res_basic.fittedvalues, res_basic.resid)
axes[1].scatter(res_exchangeable.fittedvalues, res_exchangeable.resid)


# # Scikit-Learn
#
# Scikit-Learn's algorithms all deal with NumPy arrays. My workflow is typically
#
# - data munging in pandas
# - pass a NumPy array to an Estimator
# - wrap the result in a DataFrame or Series

# In[9]:

from sklearn.datasets import california_housing

data = california_housing.fetch_california_housing()


# In[10]:

X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)


# In[11]:

X.head()


# In[12]:

y.head()


# In[13]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV


# In[14]:

get_ipython().run_cell_magic('time', '', 'param_grid = dict(\n    max_features=np.arange(2, 8),\n    max_depth=[2, 4],\n    min_samples_split=[5, 10, 15, 20],\n)\nrfc = RandomForestRegressor(n_estimators=10)\ngs = GridSearchCV(rfc, param_grid, cv=5, n_jobs=-1)\ngs.fit(X.values, y.values)\n')


# In[15]:

scores = gs.grid_scores_
scores[:10]


# In[16]:

def unpack_grid_scores(scores):
    """Flatten GridSearchCV.grid_scores_ into a tidy DataFrame."""
    rows = []
    params = sorted(scores[0].parameters)
    for row in scores:
        mean = row.mean_validation_score
        std = row.cv_validation_scores.std()
        rows.append([mean, std] + [row.parameters[k] for k in params])
    return pd.DataFrame(rows, columns=['mean_', 'std_'] + params)


# In[17]:

scores = unpack_grid_scores(gs.grid_scores_)
scores.head()


# In[18]:

sns.factorplot(x='max_features', y='mean_', hue='max_depth', col='min_samples_split', data=scores)


# In[37]:

pd.Series(gs.best_estimator_.feature_importances_, index=X.columns).order().plot(kind='barh')
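
# The last step of that workflow, wrapping an estimator's output back in pandas, looks something like this. (An editorial sketch, not part of the original notebook; it reuses the `gs`, `X`, and `y` objects from above. `GridSearchCV` refits the best estimator on the full data by default, so it can predict directly.)

# In[ ]:

# Predictions come back as a bare NumPy array; re-attach the original index.
pred = pd.Series(gs.best_estimator_.predict(X.values), index=X.index, name='predicted')
pd.concat([y.rename('actual'), pred], axis=1).head()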
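
# A version note (an editorial aside, not from the original notebook): on scikit-learn 0.18 and later, the import is `from sklearn.model_selection import GridSearchCV`, and `grid_scores_` is replaced by `cv_results_`, a dict of arrays that drops straight into a DataFrame, so no unpacking helper is needed. Similarly, `Series.order()` became `Series.sort_values()` in later pandas releases.

# In[ ]:

# Only applies when GridSearchCV comes from sklearn.model_selection.
if hasattr(gs, 'cv_results_'):
    results = pd.DataFrame(gs.cv_results_)
    results[['mean_test_score', 'std_test_score']].head()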