import pandas as pd
import numpy as np
import pandas.rpy.common as com
from statsmodels.formula.api import ols

# NOTE(review): `subplots` and `plot` are called unqualified below but are not
# imported in this file -- presumably they come from a pylab-style interactive
# namespace (e.g. IPython's `%pylab`). Confirm before running as a plain script.
# NOTE(review): `pandas.rpy` is not available in later pandas releases; confirm
# the pandas version this script is pinned to.


def jitter(series: pd.Series, factor: float) -> pd.Series:
    """Return a copy of *series* with uniform random noise added to each value.

    The noise half-width is ``a = factor * (max - min) / 50`` where ``max`` and
    ``min`` are taken over *series*; every element receives an independent
    draw from ``np.random.uniform(-a, a)``.

    Args:
        series: Numeric series to perturb. It is not modified.
        factor: Scale of the noise relative to 1/50 of the series' range.

    Returns:
        A new series, indexed like *series*, holding the perturbed values.
    """
    spread = float(series.max()) - float(series.min())
    half_width = float(factor) * spread / 50.
    # One vectorised draw of len(series) values consumes the global NumPy
    # random stream in the same order as one call per element would, without
    # the per-element Python lambda overhead.
    noise = np.random.uniform(-half_width, half_width, size=len(series))
    return series + noise


# Load the `galton` data set from the R package `UsingR`.
galton = com.load_data('galton', package='UsingR')
galton.head()

# Side-by-side histograms of the `child` and `parent` columns.
f, (ax1, ax2) = subplots(ncols=2)
ax1.hist(galton['child'], bins=100)
ax2.hist(galton['parent'], bins=100)
f.tight_layout()

# Histogram of `child` with a thick red vertical line at its mean.
f, ax = subplots(ncols=1)
ax.hist(galton['child'], bins=100)
meanChild = galton['child'].mean()
# The line is drawn as 100 points sharing x == meanChild, with y from 0 to 150.
ax.plot(np.repeat(meanChild, 100), np.linspace(0, 150, num=100), 'r',
        linewidth=5)

# Raw scatter plot of child against parent.
plot(galton['parent'], galton['child'], 'ob')

# Same scatter with both axes jittered; the seed makes the noise reproducible.
np.random.seed(1234)
plot(jitter(galton['parent'], factor=1),
     jitter(galton['child'], factor=1),
     'ob', alpha=.5)

# Ordinary least squares fit of child on parent.
lm = ols('child ~ parent', galton).fit()

f, (ax1, ax2) = subplots(ncols=2)

# plot fitted values
ax1.plot(galton['parent'], galton['child'], 'ob')
ax1.plot(galton['parent'], lm.fittedvalues, 'r', linewidth=3)

# plot residuals, with a red reference line at zero
ax2.plot(galton['parent'], lm.resid, 'ob')
ax2.plot(galton['parent'], np.repeat(0, len(galton['parent'])), 'r',
         linewidth=3)
f.tight_layout()