import pandas as pd import numpy as np import statsmodels.api as sm df_adv = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv', index_col=0) X = df_adv[['TV', 'Radio']] y = df_adv['Sales'] df_adv.head() X = df_adv[['TV', 'Radio']] y = df_adv['Sales'] ## fit a OLS model with intercept on TV and Radio X = sm.add_constant(X) est = sm.OLS(y, X).fit() est.summary() # import formula api as alias smf import statsmodels.formula.api as smf # formula: response ~ predictor + predictor est = smf.ols(formula='Sales ~ TV + Radio', data=df_adv).fit() import pandas as pd df = pd.read_csv('http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data', index_col=0) # copy data and separate predictors and response X = df.copy() y = X.pop('chd') df.head() # compute percentage of chronic heart disease for famhist y.groupby(X.famhist).mean() import statsmodels.formula.api as smf # encode df.famhist as a numeric via pd.Factor df['famhist_ord'] = pd.Categorical(df.famhist).labels est = smf.ols(formula="chd ~ famhist_ord", data=df).fit() # a utility function to only show the coeff section of summary from IPython.core.display import HTML def short_summary(est): return HTML(est.summary().tables[1].as_html()) # fit OLS on categorical variables children and occupation est = smf.ols(formula='chd ~ C(famhist)', data=df).fit() short_summary(est) df = pd.read_csv('https://raw2.github.com/statsmodels/statsmodels/master/' 'statsmodels/datasets/randhie/src/randhie.csv') df["logincome"] = np.log1p(df.income) df[['mdvis', 'logincome', 'hlthp']].tail() plt.scatter(df.logincome, df.mdvis, alpha=0.3) plt.xlabel('Log income') plt.ylabel('Number of visits') income_linspace = np.linspace(df.logincome.min(), df.logincome.max(), 100) est = smf.ols(formula='mdvis ~ logincome + hlthp', data=df).fit() plt.plot(income_linspace, est.params[0] + est.params[1] * income_linspace + est.params[2] * 0, 'r') plt.plot(income_linspace, est.params[0] + est.params[1] * income_linspace + est.params[2] * 1, 'g') short_summary(est) plt.scatter(df.logincome, df.mdvis, alpha=0.3) plt.xlabel('Log income') plt.ylabel('Number of visits') est = smf.ols(formula='mdvis ~ hlthp * logincome', data=df).fit() plt.plot(income_linspace, est.params[0] + est.params[1] * 0 + est.params[2] * income_linspace + est.params[3] * 0 * income_linspace, 'r') plt.plot(income_linspace, est.params[0] + est.params[1] * 1 + est.params[2] * income_linspace + est.params[3] * 1 * income_linspace, 'g') short_summary(est) # load the boston housing dataset - median house values in the Boston area df = pd.read_csv('http://vincentarelbundock.github.io/Rdatasets/csv/MASS/Boston.csv') # plot lstat (% lower status of the population) against median value plt.figure(figsize=(6 * 1.618, 6)) plt.scatter(df.lstat, df.medv, s=10, alpha=0.3) plt.xlabel('lstat') plt.ylabel('medv') # points linearlyd space on lstats x = pd.DataFrame({'lstat': np.linspace(df.lstat.min(), df.lstat.max(), 100)}) # 1-st order polynomial poly_1 = smf.ols(formula='medv ~ 1 + lstat', data=df).fit() plt.plot(x.lstat, poly_1.predict(x), 'b-', label='Poly n=1 $R^2$=%.2f' % poly_1.rsquared, alpha=0.9) # 2-nd order polynomial poly_2 = smf.ols(formula='medv ~ 1 + lstat + I(lstat ** 2.0)', data=df).fit() plt.plot(x.lstat, poly_2.predict(x), 'g-', label='Poly n=2 $R^2$=%.2f' % poly_2.rsquared, alpha=0.9) # 3-rd order polynomial poly_3 = smf.ols(formula='medv ~ 1 + lstat + I(lstat ** 2.0) + I(lstat ** 3.0)', data=df).fit() plt.plot(x.lstat, poly_3.predict(x), 'r-', alpha=0.9, label='Poly n=3 $R^2$=%.2f' % poly_3.rsquared) plt.legend() # TODO add image and put this code into an appendix at the bottom from mpl_toolkits.mplot3d import Axes3D X = df_adv[['TV', 'Radio']] y = df_adv['Sales'] ## fit a OLS model with intercept on TV and Radio X = sm.add_constant(X) est = sm.OLS(y, X).fit() ## Create the 3d plot -- skip reading this # TV/Radio grid for 3d plot xx1, xx2 = np.meshgrid(np.linspace(X.TV.min(), X.TV.max(), 100), np.linspace(X.Radio.min(), X.Radio.max(), 100)) # plot the hyperplane by evaluating the parameters on the grid Z = est.params[0] + est.params[1] * xx1 + est.params[2] * xx2 # create matplotlib 3d axes fig = plt.figure(figsize=(12, 8)) ax = Axes3D(fig, azim=-115, elev=15) # plot hyperplane surf = ax.plot_surface(xx1, xx2, Z, cmap=plt.cm.RdBu_r, alpha=0.6, linewidth=0) # plot data points - points over the HP are white, points below are black resid = y - est.predict(X) ax.scatter(X[resid >= 0].TV, X[resid >= 0].Radio, y[resid >= 0], color='black', alpha=1.0, facecolor='white') ax.scatter(X[resid < 0].TV, X[resid < 0].Radio, y[resid < 0], color='black', alpha=1.0) # set axis labels ax.set_xlabel('TV') ax.set_ylabel('Radio') ax.set_zlabel('Sales')