import matplotlib.pyplot as plt
import pandas
import numpy as np

from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot, qqplot
from statsmodels.stats.api import anova_lm

Outcome: S, salaries for IT staff in a corporation
Predictors:
X - experience in years
M - management, 2 levels, 0=non-management, 1=management
E - education, 3 levels, 1=Bachelor's, 2=Master's, 3=Ph.D.

url = 'http://stats191.stanford.edu/data/salary.table'
salary_table = pandas.read_table(url)
salary_table.to_csv('salary.table', index=False)

print(salary_table.head(10))

E = salary_table.E  # Education
M = salary_table.M  # Management
X = salary_table.X  # Experience
S = salary_table.S  # Salary

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary',
                     xlim=(0, 27), ylim=(9600, 28800))
symbols = ['D', '^']
man_label = ["Non-Mgmt", "Mgmt"]
educ_label = ["Bachelors", "Masters", "PhD"]
colors = ['r', 'g', 'blue']
factor_groups = salary_table.groupby(['E', 'M'])
# scatter salary against experience, one marker/color combination per (E, M) group
for values, group in factor_groups:
    i, j = values
    label = "%s - %s" % (man_label[j], educ_label[i - 1])
    ax.scatter(group['X'], group['S'], marker=symbols[j], color=colors[i - 1],
               s=350, label=label)
ax.legend(scatterpoints=1, markerscale=.7, labelspacing=1);

formula = 'S ~ C(E) + C(M) + X'
lm = ols(formula, salary_table).fit()
print(lm.summary())

# design matrix as an array and as the patsy DataFrame it was built from
print(lm.model.exog[:10])
print(lm.model.data.orig_exog.head(10))

# the original data frame is also attached to the model
print(lm.model.data.frame.head(10))

# predicted salary for 12 years of experience, management, Master's degree
print(lm.predict({'X': [12], 'M': [1], 'E': [2]}))

# residuals grouped by the (E, M) factor combinations
resid = lm.resid

fig = plt.figure(figsize=(12, 8))
xticks = []
ax = fig.add_subplot(111, xlabel='Group (E, M)', ylabel='Residuals')
for values, group in factor_groups:
    i, j = values
    xticks.append(str((i, j)))
    group_num = i * 2 + j - 1  # for plotting purposes
    x = [group_num] * len(group)
    ax.scatter(x, resid[group.index], marker=symbols[j], color=colors[i - 1],
               s=144, edgecolors='black')
ax.set_xticks([1, 2, 3, 4, 5, 6])
ax.set_xticklabels(xticks)
ax.axis('tight');

# add an Education * Experience interaction
interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
print(interX_lm.summary())

# compare the two models with an F-test
print(anova_lm(lm, interX_lm))

# the same test via a string constraint or an explicit restriction matrix
print(interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0'))
print(interX_lm.f_test([[0, 0, 0, 0, 0, 1, -1], [0, 0, 0, 0, 0, 0, 1]]))

# patsy can build the restriction matrix from the string constraint
LC = interX_lm.model.data.orig_exog.design_info.linear_constraint(
    'C(E)[T.2]:X = C(E)[T.3]:X = 0')
print(LC.coefs)
print(LC.constants)

# Education * Management interaction instead
interM_lm = ols('S ~ X + C(E)*C(M)', salary_table).fit()
print(interM_lm.summary())
print(anova_lm(lm, interM_lm))

# internally studentized residuals against experience
infl = interM_lm.get_influence()
resid = infl.resid_studentized_internal

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X', ylabel='standardized resids')
for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i - 1],
               s=144, edgecolors='black')
ax.axis('tight');

# outlier test with Benjamini/Hochberg FDR correction
outl = interM_lm.outlier_test('fdr_bh')
outl = outl.sort_values('unadj_p')
print(outl)

# observation 32 stands out, so refit without it
idx = salary_table.index.drop(32)
print(idx)

lm32 = ols('S ~ C(E) + X + C(M)', data=salary_table, subset=idx).fit()
print(lm32.summary())

interX_lm32 = ols('S ~ C(E) * X + C(M)', data=salary_table, subset=idx).fit()
print(interX_lm32.summary())

table3 = anova_lm(lm32, interX_lm32)
print(table3)

interM_lm32 = ols('S ~ X + C(E) * C(M)', data=salary_table, subset=idx).fit()
print(anova_lm(lm32, interM_lm32))
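The model-comparison rows that anova_lm reports come from the standard nested-model F statistic. As a quick check (a minimal sketch, not part of the original analysis, reusing the lm32 and interX_lm32 fits above and adding a scipy.stats import), the F and p-value in table3 can be reproduced from the residual sums of squares and residual degrees of freedom:

import scipy.stats

# F = ((SSR_restricted - SSR_full) / (df_restricted - df_full)) / (SSR_full / df_full)
ssr_r, ssr_f = lm32.ssr, interX_lm32.ssr
df_r, df_f = lm32.df_resid, interX_lm32.df_resid
f_stat = ((ssr_r - ssr_f) / (df_r - df_f)) / (ssr_f / df_f)
p_value = scipy.stats.f.sf(f_stat, df_r - df_f, df_f)
print(f_stat, p_value)  # should agree with the F and Pr(>F) row of table3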
# standardized residuals from the interaction model refit without observation 32
resid = interM_lm32.get_influence().summary_frame()['standard_resid']
# realign to the full table's index; observation 32 becomes NaN and is simply not drawn
resid = resid.reindex(X.index)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X (obs. 32 removed)', ylabel='standardized resids')
for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i - 1],
               s=144, edgecolors='black')
ax.axis('tight');

# final model: experience plus the Education * Management interaction, without observation 32
lm_final = ols('S ~ X + C(E)*C(M)', data=salary_table.drop([32])).fit()
mf = lm_final.model.data.orig_exog
lstyle = ['-', '--']

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary')
for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i - 1],
               s=144, edgecolors='black')
    # observation 32 is not in the final model, so keep only the labels that remain
    keep = idx.intersection(mf.index)
    ax.plot(mf.X[keep], lm_final.fittedvalues[keep], ls=lstyle[j], color=colors[i - 1])
ax.axis('tight');

From our first look at the data, the difference between Master's and Ph.D. in the management group differs from the corresponding difference in the non-management group. This is an interaction between the two qualitative variables, management (M) and education (E). We can visualize this by first removing the effect of experience and then plotting the means within each of the six groups using interaction_plot.

U = S - X * interX_lm32.params['X']
U.name = 'Salary|X'

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
ax = interaction_plot(E, M, U, colors=['red', 'blue'], markers=['^', 'D'],
                      markersize=10, ax=ax)

TEST - Job Aptitude Test Score
ETHN - 1 if minority, 0 otherwise
JPERF - Job performance evaluation

try:
    minority_table = pandas.read_table('minority.table')
except FileNotFoundError:  # don't have the data locally yet
    url = 'http://stats191.stanford.edu/data/minority.table'
    minority_table = pandas.read_table(url)
    minority_table.to_csv('minority.table', sep="\t", index=False)

factor_group = minority_table.groupby('ETHN')

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
colors = ['purple', 'green']
markers = ['o', 'v']
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1)

# one common regression line for both groups
min_lm = ols('JPERF ~ TEST', data=minority_table).fit()
print(min_lm.summary())

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left')
fig = abline_plot(model_results=min_lm, ax=ax)

# common intercept, ETHN-specific slope
min_lm2 = ols('JPERF ~ TEST + TEST:ETHN', data=minority_table).fit()
print(min_lm2.summary())

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

fig = abline_plot(intercept=min_lm2.params['Intercept'],
                  slope=min_lm2.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept=min_lm2.params['Intercept'],
                  slope=min_lm2.params['TEST'] + min_lm2.params['TEST:ETHN'],
                  ax=ax, color='green')
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left');
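The green line in the last plot uses params['TEST'] + params['TEST:ETHN'] as the slope for the ETHN == 1 group. If a standard error or test for that combined slope is wanted, a linear-combination t-test can be used; a minimal sketch (not in the original notebook), reusing min_lm2:

# slope for the ETHN == 1 group: is TEST + TEST:ETHN different from zero?
print(min_lm2.t_test('TEST + TEST:ETHN = 0'))

# the interaction coefficient alone asks whether the two slopes differ
print(min_lm2.t_test('TEST:ETHN = 0'))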
# ETHN-specific intercepts, common slope
min_lm3 = ols('JPERF ~ TEST + ETHN', data=minority_table).fit()
print(min_lm3.summary())

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

fig = abline_plot(intercept=min_lm3.params['Intercept'],
                  slope=min_lm3.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept=min_lm3.params['Intercept'] + min_lm3.params['ETHN'],
                  slope=min_lm3.params['TEST'], ax=ax, color='green')
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left');

# full interaction: ETHN-specific intercepts and slopes
min_lm4 = ols('JPERF ~ TEST * ETHN', data=minority_table).fit()
print(min_lm4.summary())

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, ylabel='JPERF', xlabel='TEST')
for factor, group in factor_group:
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)

fig = abline_plot(intercept=min_lm4.params['Intercept'],
                  slope=min_lm4.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept=min_lm4.params['Intercept'] + min_lm4.params['ETHN'],
                  slope=min_lm4.params['TEST'] + min_lm4.params['TEST:ETHN'],
                  ax=ax, color='green')
ax.legend(['ETHN == 0', 'ETHN == 1'], scatterpoints=1, loc='upper left');

# is there any effect of ETHN on slope or intercept?
table5 = anova_lm(min_lm, min_lm4)
print(table5)

# is there any effect of ETHN on the intercept?
table6 = anova_lm(min_lm, min_lm3)
print(table6)

# is there any effect of ETHN on the slope?
table7 = anova_lm(min_lm, min_lm2)
print(table7)

# is it just the slope, or both slope and intercept?
table8 = anova_lm(min_lm2, min_lm4)
print(table8)

Weight - (1, 2, 3) - Level of weight gain between treatments
Duration - (1, 2) - Level of duration of treatment
Days - Time of stay in hospital

try:
    kidney_table = pandas.read_table('kidney.table')
except FileNotFoundError:
    url = 'http://stats191.stanford.edu/data/kidney.table'
    kidney_table = pandas.read_table(url, sep=r"\s+")
    kidney_table.to_csv("kidney.table", sep="\t", index=False)

# explore the dataset: it's a balanced design
print(kidney_table.groupby(['Weight', 'Duration']).size())

kt = kidney_table
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
fig = interaction_plot(kt['Weight'], kt['Duration'], np.log(kt['Days'] + 1),
                       colors=['red', 'blue'], markers=['D', '^'], ms=10, ax=ax)

help(anova_lm)

# two-way ANOVA on log(Days + 1), with Type I (sequential) and Type II sums of squares
kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=kt).fit()
print(anova_lm(kidney_lm))
print(anova_lm(kidney_lm, typ=2))
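Because the design is balanced (equal cell counts), the two factors are orthogonal and the sequential (Type I) sums of squares should not depend on the order in which the terms enter the formula. A minimal sketch to check this, refitting with the factor order reversed (kidney_lm_rev is just an illustrative name):

# the C(Weight) and C(Duration) rows should match the Type I table printed above
kidney_lm_rev = ols('np.log(Days+1) ~ C(Weight) * C(Duration)', data=kt).fit()
print(anova_lm(kidney_lm_rev))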

ANOVA Type-III Sum of Squares
SS(A|B, AB) for factor A.
SS(B|A, AB) for factor B.
print(anova_lm(ols('np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Poly)', data=kt).fit(), typ=3))
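The Sum and Poly contrasts above are not incidental: Type III sums of squares are conventionally computed with codings that are orthogonal to the intercept, such as sum-to-zero coding; with the default treatment coding the main-effect rows generally come out differently. A minimal sketch of that comparison (sum_lm is an illustrative name, reusing kidney_lm and kt from above):

# Type III with the default treatment coding, for comparison
print(anova_lm(kidney_lm, typ=3))

# sum-to-zero coding for both factors gives the conventional Type III tests
sum_lm = ols('np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Sum)', data=kt).fit()
print(anova_lm(sum_lm, typ=3))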