# Exploratory analysis of the unemployment data set: plot median income
# against education, fit an OLS regression of income on education, rank the
# rows by normalized residual, and draw a scatter matrix of selected columns.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import patsy
import statsmodels.api as sm
from pandas.plotting import scatter_matrix

# NOTE: the original notebook used the "%pylab inline" magic, which is not
# valid Python outside IPython and star-imports the plotting names. The
# explicit pyplot import above replaces it; inside a notebook, run
# "%matplotlib inline" in a cell of its own if inline figures are wanted.

df = pd.read_table("data/unemployment.tsv")

# Income versus education.
plt.scatter(df.education, df.income)
plt.ylabel("Median Income")
plt.xlabel("% population with bachelor's degree")

# Ordinary least squares fit of income on education (patsy adds the intercept).
y, X = patsy.dmatrices("income ~ education", df)
income_edu_model = sm.OLS(y, X).fit()
print(income_edu_model.summary())

# `resid_pearson` replaces the removed `norm_resid()` method.
df['income_edu_resid'] = income_edu_model.resid_pearson

# `DataFrame.sort` was removed from pandas; `sort_values` returns the same
# sorted copy (df itself is left in its original order, as before).
print(df.sort_values('income_edu_resid'))

cols = ['education', 'income', 'unemployment', 'disability', 'life', 'obesity']
scatter_matrix(df[cols], alpha=0.2, figsize=(6, 6), diagonal='kde')

# Needed to display the figures when run as a plain script; harmless inline.
plt.show()

# NOTE(review): looks like a leftover scratch assignment; kept in case code
# outside this view reads `a`.
a = 1