# Example: movie ratings import pandas as pd movies = pd.read_csv('http://www.rossmanchance.com/iscam2/data/movies03RT.txt', sep='\t', names=['X', 'score', 'rating', 'genre', 'box_office', 'running_time'], skiprows=1) movies.head() def jitter(series, factor): z = float(series.max()) - float(series.min()) a = float(factor) * z / 50. return map(lambda x: x + np.random.uniform(-a, a), series) r = pd.Categorical.from_array(movies['rating']) plot(jitter(r.labels, 2.), movies['score'], 'ob') xticks(np.unique(r.labels), r.levels) xlabel('rating') ylabel('score') meanRatings = movies.groupby('rating')['score'].mean() plot(range(4), meanRatings, 'sr', markersize=10); from statsmodels.formula.api import ols lm1 = ols('score ~ rating', movies).fit() lm1.summary() plot(jitter(r.labels, 2.), movies['score'], 'ob') xticks(np.unique(r.labels), r.levels) xlabel('rating') ylabel('score') plot(range(4), lm1.params[0] + np.append(0, lm1.params[1:]), 'sr', markersize=10); lm1.summary() lm1.conf_int() from patsy.contrasts import Treatment lm2 = ols('score ~ C(rating, Treatment(reference="R"))', movies).fit() lm2.summary() lm2.conf_int() from statsmodels.stats.anova import anova_lm anova_lm(lm1) from statsmodels.stats.multicomp import tukeyhsd import statsmodels.sandbox.stats.multicomp as multi mc = multi.MultiComparison(movies['score'], r.labels) res = mc.tukeyhsd() print res[0] print zip(list(r.levels), range(4))