# NOTE(review): this script looks like a flattened IPython notebook. The
# plotting helpers used below (plot, gca, subplots, scatter, hist, xlabel,
# ylabel) are never imported in this file -- presumably they are provided by
# a pylab session (e.g. `%pylab`); confirm before running it as a plain script.

import pandas.rpy.common as com
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.formula.api import rlm  # robust linear model
from statsmodels.stats.anova import anova_lm


# emulates abline function, only possible to set basic line styles and width
def abline(intercept, gradient, *args, **kwargs):
    """Draw the line ``y = intercept + gradient * x`` on the current axes.

    The line is drawn between the current x-limits, and both axis limits are
    restored afterwards so that adding the line does not rescale the plot.

    Parameters
    ----------
    intercept : y-value of the line at ``x = 0``.
    gradient : slope of the line.
    *args : optional; the first positional extra is used as the line style
        format string (defaults to ``'r'``). Further extras are ignored.
    **kwargs : optional; only ``linewidth`` is honoured (defaults to ``5``).
        Other keywords are ignored.
    """
    a = gca()
    # Remember the limits: plotting would otherwise be free to change them.
    xlim = a.get_xlim()
    ylim = a.get_ylim()
    sty = args[0] if args else 'r'
    # .get() avoids a KeyError when keywords other than linewidth are given.
    lw = kwargs.get('linewidth', 5)
    a.plot(xlim, [intercept + gradient * x for x in xlim], sty, linewidth=lw)
    a.set_xlim(xlim)
    a.set_ylim(ylim)


# --- Galton parent/child data -----------------------------------------------
galton = com.load_data('galton', package='UsingR')
plot(galton['parent'], galton['child'], 'ob')

# --- WHO hunger data ----------------------------------------------------------
hunger = pd.read_csv('http://apps.who.int/gho/athena/data/GHO/WHOSIS_000008.csv?profile=text&filter=COUNTRY:;SEX:')
hunger = hunger[hunger['Sex'] != 'Both sexes']
# the last entry is all NaN
hunger = hunger[hunger['Year'].notnull()]
# Normalise column names ("Some Name" -> "some_name"); a real list is built so
# the assignment does not depend on map() returning a list.
hunger.columns = [name.replace(' ', '_').lower() for name in hunger.columns]
hunger.head()

regions = hunger.groupby('who_region')
# Plot colour used for each value of the who_region column.
colors = {
    'Africa': 'k',
    'Americas': 'r',
    'Eastern Mediterranean': 'g',
    'Europe': 'b',
    'South-East Asia': 'c',
    'WHO Non Members': 'y',
    'Western Pacific': 'm',
}

f, (ax1, ax2) = subplots(ncols=2)
f.set_size_inches(8, 4)
for g, v in regions:
    ax1.scatter(v['year'], v['numeric'], c=colors[g])
    # A single dummy point per region on the second axes, used only so that
    # the legend can be drawn there.
    ax2.scatter(0, 0, c=colors[g], label=g)
ax1.set_xlabel('Year')
ax1.set_ylabel('Numeric')
ax2.set_xticklabels('')
ax2.set_yticklabels('')
ax2.legend(loc='center')
f.tight_layout()

anova_lm(ols('year ~ who_region', hunger).fit())
anova_lm(ols('numeric ~ who_region', hunger).fit())

lmRegion = ols('numeric ~ year + who_region + year * who_region', hunger).fit()
#lmRegion.summary()

for g, v in regions:
    scatter(v['year'], v['numeric'], c=colors[g])

# Fitted line for South-East Asia: baseline terms plus that region's offsets.
intercept = lmRegion.params['Intercept']
se_asia = lmRegion.params['who_region[T.South-East Asia]']
year = lmRegion.params['year']
year_se_asia = lmRegion.params['year:who_region[T.South-East Asia]']
abline(intercept + se_asia, year + year_se_asia, 'c', linewidth=3)

# --- UCI adult data -----------------------------------------------------------
incomeData = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)
# last row is all NaNs
incomeData = incomeData.drop(32561)
incomeData.head()

# header=None gives integer column labels, so positional selection with
# .iloc picks the same columns as the former .ix[:, k] lookups.
income = incomeData.iloc[:, 2]
age = incomeData.iloc[:, 0]

f, (ax1, ax2, ax3, ax4) = subplots(ncols=4)
# original age vs income data
ax1.scatter(age, income, marker='.')
# original income data
ax2.hist(income, range=(income.min(), income.max()), bins=100)
# log-transformed income data
log_income = income.apply(lambda x: np.log(x + 1))
ax3.hist(log_income, range=(log_income.min(), log_income.max()), bins=200)
# log-transformed age vs income data
ax4.scatter(age, log_income, marker='.')
f.set_size_inches(10, 5)
f.tight_layout()

# --- Heavy-tailed sample ------------------------------------------------------
np.random.seed(34568)
xVals = np.random.standard_cauchy(size=50)
hist(xVals)

# add Tim Cook, CEO of Apple 2011 income
age = np.append(np.array(age), 52)
income = np.append(np.array(income), 378e6)
scatter(age, income, marker='.')
xlabel('age')
ylabel('income')

# Plain-Python equivalent of the IPython `?rlm` help lookup.
help(rlm)

# http://en.wikipedia.org/wiki/File:Anscombe%27s_quartet_3.svg

# --- BUPA liver-disorders data ------------------------------------------------
bupaData = pd.read_csv('ftp://ftp.ics.uci.edu/pub/machine-learning-databases/liver-disorders/bupa.data', header=None)
ggt = bupaData.iloc[:, 4]
aat = bupaData.iloc[:, 2]
plot(np.log(ggt), aat, 'ob')
xlabel('log(ggt)')
ylabel('aat')

lm1 = ols('aat ~ np.log(ggt)', pd.DataFrame({'aat': aat, 'ggt': ggt})).fit()
plot(ggt.apply(np.log), lm1.resid, 'ob')
xlabel('log(ggt)')
ylabel('lm1.resid')