import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy
%pylab inline

# Load the tab-separated dataset into a DataFrame.
df = pd.read_table("data/unemployment.tsv")


# Scatter plot of the `income` column against the `education` column.
# NOTE(review): scatter/xlabel/ylabel are not imported by name in the visible
# code; presumably they come from the `%pylab inline` namespace -- confirm.
scatter(df["education"], df["income"])
xlabel("% population with bachelor's degree")
ylabel("Median Income")

# Build the response (y) and design (X) matrices from the formula, then fit
# an ordinary least squares model of income on education.
y, X = patsy.dmatrices("income ~ education", data=df)
income_edu_model = sm.OLS(endog=y, exog=X).fit()
# Bare expression: the summary object is built but not stored or printed here
# (presumably it was echoed by the notebook front end).
income_edu_model.summary()

# Attach the fitted model's Pearson residuals to the frame as a new column.
# `norm_resid()` is no longer available in statsmodels; `resid_pearson` is its
# replacement and is an attribute, so it is read without call parentheses.
df['income_edu_resid'] = income_edu_model.resid_pearson

# `DataFrame.sort` was removed from pandas; `sort_values` is the replacement.
# It returns a new, sorted frame and leaves `df` itself unchanged. As in the
# original, the result is not stored.
df.sort_values('income_edu_resid')

# Columns to compare pairwise in the scatter-matrix plot.
cols = ['education', 'income', 'unemployment', 'disability', 'life', 'obesity']

# `scatter_matrix` lives in `pandas.plotting` in current pandas; the old
# `pandas.tools.plotting` module has been removed. Fall back to the legacy
# path so the script still runs on old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the selected columns, with a kernel density
# estimate on the diagonal.
scatter_matrix(df[cols], alpha=0.2, figsize=(6, 6), diagonal='kde')

# NOTE(review): `a` is not referenced anywhere in the visible part of the
# file; kept in case later code relies on it -- confirm before removing.
a = 1