%matplotlib inline import pandas as pd import numpy as np import matplotlib.pyplot as plt from bs4 import BeautifulSoup import requests from pattern import web import scipy.stats as stats from scipy.stats import binom from __future__ import division import re #nice defaults for matplotlib from matplotlib import rcParams dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), (0.4, 0.6509803921568628, 0.11764705882352941), (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), (0.4, 0.4, 0.4)] rcParams['figure.figsize'] = (10, 6) rcParams['figure.dpi'] = 150 rcParams['axes.color_cycle'] = dark2_colors rcParams['lines.linewidth'] = 2 rcParams['axes.grid'] = True rcParams['axes.facecolor'] = '#eeeeee' rcParams['font.size'] = 14 rcParams['patch.edgecolor'] = 'none' p = np.arange(0,1.01,0.01) plt.plot(p, p*(1-p)) plt.xlabel("p") plt.ylabel("p*(1-p)") plt.ylim(0,0.26) plt.show() B = 10000 p = 0.48 N = 5000 result = [np.mean([np.random.binomial(1,p) for i in range(N)]) for j in range(B)] plt.hist(result, bins = 40) plt.vlines(p, 0, 890, alpha=0.8, colors=dark2_colors[1]) plt.ylabel("Frequency") plt.xlabel("Result") plt.show() B = 50 N = 100 p = 0.49 def get_prange(): res = [np.random.binomial(1,p) for i in range(N)] avg = np.mean(res) res = avg + np.sqrt(avg*(1-avg)/N)*np.array([-2,2]) res = list(res) res = [(res[0]<=p and res[1]>=p)*1]+res return res cis = np.array([get_prange() for i in range(B)]) colors = [dark2_colors[1], dark2_colors[0]] cols = [colors[int(i)] for i in cis[:,0]] cis = cis[:,1:3] fig, ax = plt.subplots(1, 1) for i in range(len(cis)): ax.hlines(i+1, cis[i][0], cis[i][1], color=cols[i]) ax.scatter(np.mean(cis[i,:]),i+1, zorder = 2) ax.vlines(p, 0.8,B+0.2, alpha=0.5) plt.ylim(0.8,B+0.2) plt.xlabel("Probabilities") plt.ylabel("Trial") plt.show() URL = "http://www.pollster.com/08USPresGEMvO-2.html" html = requests.get(URL).text dom = web.Element(html) rows = dom.by_tag('tr') table = [] for row in rows: table_row = [] data = row.by_tag('td') for value in data: table_row.append(web.plaintext(value.content)) table.append(table_row) df = pd.DataFrame(table[1:], columns = table[0]) df.head() #convert to number Obama = np.array([float(a) for a in df["Obama"].values]) McCain = np.array([float(a) for a in df["McCain"].values]) df['diff']= (Obama - McCain)/100 #extract the year year = [my_str.split("/")[-1] for my_str in df["Dates"].values] year = [(a=='08')*1 for a in year] full_year = ["2007", "2008"] year = [full_year[i] for i in year] #extract day dates = df["Dates"].values day = [my_str.split("-")[0] for my_str in dates] day = [d[0]+"/"+d[1] for d in [a.split("/") for a in day]] day = [day[i]+"/"+year[i] for i in range(len(day))] day = [(pd.to_datetime(a)-pd.to_datetime("11/4/2008")).days for a in day] df["day"] = day df["week"] = np.array([round(d/7,0) for d in day]) #Sort by polster df = df.sort("Pollster") #Look at the last month new_df=df[df["day"]>-30] new_df.head() def check_NA(n): n = re.findall(r'[+-]?\d+.\d+',n) if len(n)==0: return np.nan else: return float(n[0]) N = np.array([check_NA(n) for n in new_df["N/Pop"].values]) leftover = [re.sub(r"-", "0", s) for s in new_df["Undecided"].values] leftover = np.array([int(s) for s in leftover]) Obama = np.array([float(a) for a in new_df["Obama"].values]) avgs = Obama/100+leftover/100/2 se = np.sqrt(avgs*(1-avgs)/N) se = np.nan_to_num(se) x = list(set(new_df["Pollster"])) x.sort() idx = new_df.groupby(['Pollster']).count()["diff"].values idx = np.array([0]+list(np.cumsum(idx))[:-1]) cols = {} i = 0 for poll in x: cols[poll]=dark2_colors[i] i+=1 if i>len(dark2_colors)-1: i=0 fig, ax = plt.subplots(1, 1) for i in range(len(avgs)): ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i], color = cols[new_df["Pollster"].values[i]]) ax.scatter(i, avgs[i], zorder = 2) ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1) plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01) plt.xlim(-1,len(avgs)) plt.xticks(idx,x, rotation = 90) plt.show() plt.hist(avgs, bins = np.arange(0.46,0.59, 0.01)) plt.xlabel("avgs") plt.ylabel("Frequency") plt.show() z = (avgs-np.mean(avgs))/np.std(avgs) stats.probplot(z, dist="norm", plot=plt) plt.title("Normal Q-Q plot") plt.show() print "Mean:",np.mean(avgs) print "Standard deviation:", np.std(avgs) print "Variance^(1/2):", np.std(avgs)/np.sqrt(len(avgs)) a = np.mean(avgs)+np.array([-2,2])*np.std(avgs)/np.sqrt(len(avgs)) print "95% Confidence Interval:",a keep = [i for i in range(len(avgs)) if avgs[i]>0.48] df2 = new_df.iloc[keep] idx = df2.groupby(['Pollster']).count()["diff"].values idx = np.array([0]+list(np.cumsum(idx))[:-1]) fig, ax = plt.subplots(1, 1) for i in range(len(avgs)): if i in keep: ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i], color = cols[new_df["Pollster"].values[i]]) ax.scatter(i, avgs[i], zorder = 2) ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1) plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01) plt.xlim(-1,len(avgs)) plt.xticks(idx,x, rotation = 90) plt.show() new_df = df[df["day"]>-100] leftover = [re.sub(r"-", "0", s) for s in new_df["Undecided"].values] leftover = np.array([int(s) for s in leftover]) Obama = np.array([float(a) for a in new_df["Obama"].values]) avgs = Obama/100+leftover/100/2 plt.scatter(new_df["day"],avgs) plt.xlim(-100,0) plt.xlabel("Day") plt.ylabel("Avgs") plt.show() new_df["avg"] = avgs new_df.boxplot(column = "avg", by = "week") plt.show() from statsmodels.formula.api import ols new_df = new_df.copy() new_df['week_factor'] = [str(int(w)) for w in new_df['week'].values] lm1 = ols('diff ~ week_factor', new_df).fit() new_df['week_resid'] = lm1.resid new_df['pollster_mean_resid'] = new_df.groupby("Pollster")["week_resid"].transform(lambda x: x.mean()) new_df.sort("pollster_mean_resid", ascending=1, inplace=True) new_df.boxplot(column="week_resid", by="pollster_mean_resid", rot=90) pollster_labels = new_df.groupby("pollster_mean_resid")["Pollster"].agg(lambda x: x.iloc[0]) plt.xticks(np.arange(len(pollster_labels))+1, pollster_labels.values) plt.show() lm2 = ols('diff ~ Pollster', new_df).fit() new_df['pollster_resid'] = lm2.resid new_df['week_mean_resid'] = new_df.groupby("week")["pollster_resid"].transform(lambda x: x.mean()) new_df.sort("week_mean_resid", ascending=1, inplace=True) new_df.boxplot(column="pollster_resid", by="week", rot=90) plt.show() lm3 = ols('diff ~ week_factor + Pollster', new_df).fit() from statsmodels.stats.anova import anova_lm print anova_lm(lm3) plt.scatter(new_df["day"],new_df["Obama"], color='blue', marker='o', facecolors='none') plt.scatter(new_df["day"],new_df["McCain"], color='red', marker='v', facecolors='none') plt.xlim(-100,0) plt.xlabel("Day") plt.ylabel("Avgs") from statsmodels.nonparametric.api import lowess #Smooth using 1/10 of the total data surrounding a point for the local fit. smoothed = lowess(new_df['Obama'], new_df['day'], frac=0.1) plt.plot(smoothed[:,0], smoothed[:,1], color='blue') smoothed = lowess(new_df['McCain'], new_df['day'], frac=0.1) plt.plot(smoothed[:,0], smoothed[:,1], color="red") plt.show() #plt.plot(lowess(new_df['diff'], new_df['week']))