import numpy as np import pandas as pd import patsy import seaborn as sns import statsmodels.formula.api as smf import statsmodels.stats as sms import matplotlib.pyplot as plt %matplotlib inline sns.set_style("white") # for this demonstration we will use the *planes* example because it's balanced. # read in the data, rename some values: planes = pd.read_csv('../Datasets/planes.txt', sep='\t') # column names to lowercase: planes.columns = [c.lower() for c in planes.columns] # turn "paper" variable into an object, not int64: planes['paper'] = planes['paper'].astype(object) planes.replace(to_replace={'paper': {1: '80', 2: '50'}, 'angle': {1: 'horz', 2: '45deg '}, 'plane': {1: 'sleek', 2: 'simple'}}, inplace=True) # convert distance into meters for ease of SSR: planes['distance'] = planes['distance'] / 1000 # rename "plane" to "design": planes = planes.rename(columns={'plane': 'design'}) # planes is a balanced dataset (same number of cases in each cell): planes.groupby(['paper', 'design']).size() X = patsy.dmatrix('~ design * paper', planes) X # example for coefficients of design and paper: np.sum(X[:, 1] * X[:, 2]) X = patsy.dmatrix('~ C(design, Sum) * C(paper, Sum)', planes) print(X) print('sum of columns = ' + str(X.sum(axis=0))) print('sum of products of design and paper = ' + str((X[:, 1] * X[:, 2]).sum())) print('sum of products of design and interaction = ' + str((X[:, 1] * X[:, 3]).sum())) fit_full = smf.ols('distance ~ C(design, Sum) * C(paper, Sum)', planes).fit() print(fit_full.summary()) np.round(planes['distance'].mean(), decimals=4) # example for design: print(fit_full.params[0] + fit_full.params[1]) # model prediction for simple print(fit_full.params[0] + -1*fit_full.params[1]) # model prediction for sleek print('\n\n Actual Means:') print(planes.groupby('design').distance.mean()) # actual means anova_table = sms.anova.anova_lm(fit_full) # type I SS by default; same answer in this case print(anova_table) # fit all the models in the tree null = smf.ols('distance ~ 1', planes).fit() design = smf.ols('distance ~ C(design, Sum)', planes).fit() paper = smf.ols('distance ~ C(paper, Sum)', planes).fit() additive = smf.ols('distance ~ C(design, Sum) + C(paper, Sum)', planes).fit() full = smf.ols('distance ~ C(design, Sum) * C(paper, Sum)', planes).fit() print(null.summary()) null.mse_total # mean squared error total null.mse_resid # this is the same as the variance in distance: planes['distance'].var() # let's calculate the sum of squared residuals manually: res = planes['distance'] - null.predict() # absolute residuals (difference between data and prediction) res = res ** 2 # squared residuals res.sum() # the ssr is also contained in the model object: null.ssr ## Main effect of design, ignoring paper: print(null.ssr - design.ssr) ## sum of simple effects of design, eliminating paper: print(paper.ssr - additive.ssr) ## main effect from ANOVA table: print(anova_table['sum_sq'][0]) ## Main effect of paper, ignoring design: print(null.ssr - paper.ssr) ## sum of simple effects of paper, eliminating design: print(design.ssr - additive.ssr) ## main effect from ANOVA table: print(anova_table['sum_sq'][1]) # interaction term: print(additive.ssr - full.ssr) ## interaction effect from ANOVA table: print(anova_table['sum_sq'][2]) # e.g. for full model: print(full.summary()) diff = null.ssr - full.ssr # express as a proportion of the total (null model) residual: prop = diff / null.ssr print(prop) # same as R-squared: print(full.rsquared) print(anova_table) full.tvalues[1:] **2 print(anova_table['F']) # plot the explained variances as a pie chart: labels = 'design', 'paper', 'interaction', 'residual' sizes = anova_table['sum_sq'] colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral'] plt.pie(sizes, labels=labels, autopct='%1.0f%%', colors=colors, startangle=90) # Set aspect ratio to be equal so that pie is drawn as a circle. plt.axis('equal') plt.show() # anova table from interaction model: print(anova_table) # summary table from GLM fit: fit_glm = smf.glm('distance ~ C(design, Sum) * C(paper, Sum)', planes).fit() print(fit_glm.deviance) from IPython.core.display import HTML def css_styling(): styles = open("../custom_style.css", "r").read() return HTML(styles) css_styling()