import matplotlib.pyplot as plt
import pandas
import numpy as np
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot, qqplot
from statsmodels.stats.api import anova_lm
# Outcome: S, salaries for IT staff in a corporation
# Predictors: X, experience in years
#             M, management, 2 levels, 0=non-management, 1=management
#             E, education, 3 levels, 1=Bachelor's, 2=Master's, 3=Ph.D.
url = 'http://stats191.stanford.edu/data/salary.table'
salary_table = pandas.read_table(url) # needs pandas 0.7.3
salary_table.to_csv('salary.table', index=False)
print salary_table.head(10)
E = salary_table.E # Education
M = salary_table.M # Management
X = salary_table.X # Experience
S = salary_table.S # Salary
# Scatter plot of salary vs. experience, one marker/colour per (E, M) cell.
# (Loop-body indentation was lost in the notebook conversion; restored here.)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary',
                     xlim=(0, 27), ylim=(9600, 28800))
symbols = ['D', '^']                       # marker per management level (0, 1)
man_label = ["Non-Mgmt", "Mgmt"]
educ_label = ["Bachelors", "Masters", "PhD"]
colors = ['r', 'g', 'blue']                # colour per education level (1..3)
factor_groups = salary_table.groupby(['E','M'])
for values, group in factor_groups:
    i, j = values                          # i = education (1-based), j = management (0/1)
    label = "%s - %s" % (man_label[j], educ_label[i-1])
    ax.scatter(group['X'], group['S'], marker=symbols[j], color=colors[i-1],
               s=350, label=label)
ax.legend(scatterpoints=1, markerscale=.7, labelspacing=1);
# Fit the additive model: salary on education (categorical), management
# (categorical) and experience (linear), with no interactions.
formula = 'S ~ C(E) + C(M) + X'
lm = ols(formula, salary_table).fit()
print lm.summary()
# NOTE(review): bare expression below only displays inside a notebook; it is
# a no-op when the file is run as a script.
lm.model.exog[:10]
print lm.model.data.orig_exog.head(10)
print lm.model.data.frame.head(10)
# Predicted salary for 12 years' experience, management, Master's degree.
lm.predict({'X' : [12], 'M' : [1], 'E' : [2]})
resid = lm.resid
# Residuals of the additive fit, grouped by the six (E, M) cells --
# systematic within-group structure would suggest a missing interaction.
# (Loop-body indentation restored after the notebook conversion dropped it.)
fig = plt.figure(figsize=(12, 8))
xticks = []
ax = fig.add_subplot(111, xlabel='Group (E, M)', ylabel='Residuals')
for values, group in factor_groups:
    i, j = values
    xticks.append(str((i, j)))
    group_num = i*2 + j - 1  # for plotting purposes: maps (E, M) to slots 1..6
    x = [group_num] * len(group)
    ax.scatter(x, resid[group.index], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')
ax.set_xticks([1,2,3,4,5,6])
ax.set_xticklabels(xticks)
ax.axis('tight');
# Add an education-by-experience interaction and compare with the additive fit.
interX_lm = ols('S ~ C(E)*X + C(M)', salary_table).fit()
print interX_lm.summary()
print anova_lm(lm, interX_lm)
# The same F-test for the two interaction coefficients, once by name ...
print interX_lm.f_test('C(E)[T.2]:X = C(E)[T.3]:X = 0')
# ... and once via an explicit contrast matrix spanning the same constraints.
print interX_lm.f_test([[0,0,0,0,0,1,-1],[0,0,0,0,0,0,1]])
# patsy's design_info can build the constraint matrices from the string form.
LC = interX_lm.model.data.orig_exog.design_info.linear_constraint('C(E)[T.2]:X = C(E)[T.3]:X = 0')
print LC.coefs
print LC.constants
# Education-by-management interaction instead, again compared to the additive fit.
interM_lm = ols('S ~ X + C(E)*C(M)', salary_table).fit()
print interM_lm.summary()
print anova_lm(lm, interM_lm)
# Internally studentized residuals for the diagnostic plot below.
infl = interM_lm.get_influence()
resid = infl.resid_studentized_internal
# Standardized residuals vs. experience, marked by (E, M) group -- a point far
# from the others flags a potential outlier.  (Loop indentation restored.)
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X', ylabel='standardized resids')
for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')
ax.axis('tight');
outl = interM_lm.outlier_test('fdr_bh')
outl.sort('unadj_p', inplace=True)
print outl
# Observation 32 looks aberrant in the residual plot; refit the additive and
# both interaction models without it.
idx = salary_table.index.drop(32)
print idx
lm32 = ols('S ~ C(E) + X + C(M)', data=salary_table, subset=idx).fit()
print lm32.summary()
interX_lm32 = ols('S ~ C(E) * X + C(M)', data=salary_table, subset=idx).fit()
print interX_lm32.summary()
table3 = anova_lm(lm32, interX_lm32)
print table3
interM_lm32 = ols('S ~ X + C(E) * C(M)', data=salary_table, subset=idx).fit()
print anova_lm(lm32, interM_lm32)
# Re-plot the standardized residuals after dropping observation 32.
# (Loop-body indentation restored after the notebook conversion dropped it.)
resid = interM_lm32.get_influence().summary_frame()['standard_resid']
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='X[~[32]]', ylabel='standardized resids')
for values, group in factor_groups:
    i, j = values
    idx = group.index
    # resid has no entry for obs 32; indexing by the full group index yields
    # a missing value there, which scatter simply does not draw.
    ax.scatter(X[idx], resid[idx], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')
ax.axis('tight');
# Final model (observation 32 excluded): experience plus the full
# education-by-management interaction; overlay fitted lines on the data.
lm_final = ols('S ~ X + C(E)*C(M)', data=salary_table.drop([32])).fit()
mf = lm_final.model.data.orig_exog
lstyle = ['-','--']  # line style per management level (0, 1)
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='Experience', ylabel='Salary')
for values, group in factor_groups:
    i, j = values
    idx = group.index
    ax.scatter(X[idx], S[idx], marker=symbols[j], color=colors[i-1],
               s=144, edgecolors='black')
    # drop NA because there is no idx 32 in the final model
    ax.plot(mf.X[idx].dropna(), lm_final.fittedvalues[idx].dropna(),
            ls=lstyle[j], color=colors[i-1])
ax.axis('tight');
# From our first look at the data, the difference between Master's and Ph.D.
# salaries in the management group differs from that in the non-management
# group. This is an interaction between the two qualitative variables,
# management (M) and education (E). We can visualize this by first removing
# the effect of experience, then plotting the means within each of the 6
# groups using interaction_plot.
# Remove the (common) experience effect, then plot the group means of the
# adjusted salaries; non-parallel lines indicate an E-by-M interaction.
U = S - X * interX_lm32.params['X']
U.name = 'Salary|X'
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111)
ax = interaction_plot(E, M, U, colors=['red','blue'], markers=['^','D'],
markersize=10, ax=ax)
# TEST - Job Aptitude Test score
# ETHN - 1 if minority, 0 otherwise
# JPERF - Job performance evaluation
# Load the minority-hiring data, caching a local copy on first run.
try:
    minority_table = pandas.read_table('minority.table')
except IOError:  # don't have data already (narrowed from a bare `except:`)
    url = 'http://stats191.stanford.edu/data/minority.table'
    minority_table = pandas.read_table(url)
    minority_table.to_csv('minority.table', sep="\t", index=False)
factor_group = minority_table.groupby(['ETHN'])
# Scatter JPERF vs. TEST, coloured/marked by ethnicity group.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
colors = ['purple', 'green']
markers = ['o', 'v']
for factor, group in factor_group:
    # NOTE(review): indexing colors/markers by `factor` assumes groupby on a
    # one-element key list yields a scalar key (old pandas); recent pandas
    # yields 1-tuples -- confirm the targeted pandas version.
    ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
               marker=markers[factor], s=12**2)
ax.legend(['ETHN == 1', 'ETHN == 0'], scatterpoints=1)
min_lm = ols('JPERF ~ TEST', data=minority_table).fit()
print min_lm.summary()
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
marker=markers[factor], s=12**2)
ax.legend(['ETHN == 1', 'ETHN == 0'], scatterpoints=1, loc='upper left')
fig = abline_plot(model_results = min_lm, ax=ax)
min_lm2 = ols('JPERF ~ TEST + TEST:ETHN', data=minority_table).fit()
print min_lm2.summary()
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
marker=markers[factor], s=12**2)
fig = abline_plot(intercept = min_lm2.params['Intercept'],
slope = min_lm2.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept = min_lm2.params['Intercept'],
slope = min_lm2.params['TEST'] + min_lm2.params['TEST:ETHN'],
ax=ax, color='green')
ax.legend(['ETHN == 1', 'ETHN == 0'], scatterpoints=1, loc='upper left');
min_lm3 = ols('JPERF ~ TEST + ETHN', data=minority_table).fit()
print min_lm3.summary()
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, xlabel='TEST', ylabel='JPERF')
for factor, group in factor_group:
ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
marker=markers[factor], s=12**2)
fig = abline_plot(intercept = min_lm3.params['Intercept'],
slope = min_lm3.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept = min_lm3.params['Intercept'] + min_lm3.params['ETHN'],
slope = min_lm3.params['TEST'], ax=ax, color='green')
ax.legend(['ETHN == 1', 'ETHN == 0'], scatterpoints=1, loc='upper left');
min_lm4 = ols('JPERF ~ TEST * ETHN', data=minority_table).fit()
print min_lm4.summary()
fig = plt.figure(figsize=(12,8))
ax = fig.add_subplot(111, ylabel='JPERF', xlabel='TEST')
for factor, group in factor_group:
ax.scatter(group['TEST'], group['JPERF'], color=colors[factor],
marker=markers[factor], s=12**2)
fig = abline_plot(intercept = min_lm4.params['Intercept'],
slope = min_lm4.params['TEST'], ax=ax, color='purple')
ax = fig.axes[0]
fig = abline_plot(intercept = min_lm4.params['Intercept'] + min_lm4.params['ETHN'],
slope = min_lm4.params['TEST'] + min_lm4.params['TEST:ETHN'],
ax=ax, color='green')
ax.legend(['ETHN == 1', 'ETHN == 0'], scatterpoints=1, loc='upper left');
# Nested model comparisons against the pooled fit: any ethnicity effect at
# all (table5), intercept shift only (table6), slope shift only (table7),
# and whether the full interaction improves on the slope-only model (table8).
table5 = anova_lm(min_lm, min_lm4)
print table5
table6 = anova_lm(min_lm, min_lm3)
print table6
table7 = anova_lm(min_lm, min_lm2)
print table7
table8 = anova_lm(min_lm2, min_lm4)
print table8
# Weight - (1,2,3) - Level of weight gain between treatments
# Duration - (1,2) - Level of duration of treatment
# Days - Time of stay in hospital
try:
kidney_table = pandas.read_table('kidney.table')
except:
url = 'http://stats191.stanford.edu/data/kidney.table'
kidney_table = pandas.read_table(url, delimiter=" *")
kidney_table.to_csv("kidney.table", sep="\t", index=False)
# Explore the dataset, it's a balanced design
print kidney_table.groupby(['Weight', 'Duration']).size()
kt = kidney_table
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
fig = interaction_plot(kt['Weight'], kt['Duration'], np.log(kt['Days']+1),
colors=['red', 'blue'], markers=['D','^'], ms=10, ax=ax)
help(anova_lm)
kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)', data=kt).fit()
print anova_lm(kidney_lm)
print anova_lm(kidney_lm, typ=2)
# ANOVA Type-III Sums of Squares:
#   SS(A | B, AB) for factor A.
#   SS(B | A, AB) for factor B.
# Type III sums of squares are only meaningful with full-rank orthogonal
# contrast codings (here Sum/Poly), not the default Treatment coding.
print anova_lm(ols('np.log(Days+1) ~ C(Duration, Sum) * C(Weight, Poly)',
data=kt).fit(), typ=3)