%matplotlib inline 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
from pattern import web
import scipy.stats as stats
from scipy.stats import binom
from __future__ import division
import re

#nice defaults for matplotlib
from matplotlib import rcParams

dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

p = np.arange(0,1.01,0.01)
plt.plot(p, p*(1-p))
plt.xlabel("p")
plt.ylabel("p*(1-p)")
plt.ylim(0,0.26)
plt.show()

B = 10000
p = 0.48
N = 5000
result = [np.mean([np.random.binomial(1,p) for i in range(N)]) 
          for j in range(B)]

plt.hist(result, bins = 40)
plt.vlines(p, 0, 890, alpha=0.8, colors=dark2_colors[1])
plt.ylabel("Frequency")
plt.xlabel("Result")
plt.show()

B = 50
N = 100
p = 0.49
def get_prange():
    res = [np.random.binomial(1,p) for i in range(N)]
    avg = np.mean(res)
    res = avg + np.sqrt(avg*(1-avg)/N)*np.array([-2,2])
    res = list(res)
    res = [(res[0]<=p and res[1]>=p)*1]+res
    return res

cis = np.array([get_prange() for i in range(B)])
colors = [dark2_colors[1], dark2_colors[0]]
cols = [colors[int(i)] for i in cis[:,0]]
cis = cis[:,1:3]

fig, ax = plt.subplots(1, 1)
for i in range(len(cis)):
    ax.hlines(i+1, cis[i][0], cis[i][1], color=cols[i])
    ax.scatter(np.mean(cis[i,:]),i+1, zorder = 2)
ax.vlines(p, 0.8,B+0.2, alpha=0.5)
plt.ylim(0.8,B+0.2)
plt.xlabel("Probabilities")
plt.ylabel("Trial")
plt.show()

URL = "http://www.pollster.com/08USPresGEMvO-2.html"
html = requests.get(URL).text
dom = web.Element(html)
rows = dom.by_tag('tr')

table = []
for row in rows:
    table_row = []
    data = row.by_tag('td')
    for value in data:
        table_row.append(web.plaintext(value.content))
    table.append(table_row)

df = pd.DataFrame(table[1:], columns = table[0])
df.head()

#convert to number
Obama = np.array([float(a) for a in df["Obama"].values])
McCain = np.array([float(a) for a in df["McCain"].values])
df['diff']= (Obama - McCain)/100

#extract the year
year = [my_str.split("/")[-1] for my_str in df["Dates"].values]
year = [(a=='08')*1 for a in year]
full_year = ["2007", "2008"]
year = [full_year[i] for i in year]

#extract day
dates = df["Dates"].values
day = [my_str.split("-")[0] for my_str in dates]
day = [d[0]+"/"+d[1] for d in [a.split("/") for a in day]]
day = [day[i]+"/"+year[i] for i in range(len(day))]
day = [(pd.to_datetime(a)-pd.to_datetime("11/4/2008")).days for a in day]
df["day"] = day
df["week"] = np.array([round(d/7,0) for d in day])

#Sort by polster
df = df.sort("Pollster")

#Look at the last month
new_df=df[df["day"]>-30]

new_df.head()

def check_NA(n):
    n = re.findall(r'[+-]?\d+.\d+',n)
    if len(n)==0:
        return np.nan
    else:
        return float(n[0])
N = np.array([check_NA(n) for n in new_df["N/Pop"].values])

leftover = [re.sub(r"-", "0", s) for s in new_df["Undecided"].values]
leftover = np.array([int(s) for s in leftover])
Obama = np.array([float(a) for a in new_df["Obama"].values])
avgs = Obama/100+leftover/100/2
se = np.sqrt(avgs*(1-avgs)/N)
se = np.nan_to_num(se)

x = list(set(new_df["Pollster"]))
x.sort()
idx = new_df.groupby(['Pollster']).count()["diff"].values
idx = np.array([0]+list(np.cumsum(idx))[:-1])
cols = {}
i = 0
for poll in x:
    cols[poll]=dark2_colors[i]
    i+=1
    if i>len(dark2_colors)-1:
        i=0
fig, ax = plt.subplots(1, 1)
for i in range(len(avgs)):
    ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i], 
              color = cols[new_df["Pollster"].values[i]])
    ax.scatter(i, avgs[i], zorder = 2)
ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1)
plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01)
plt.xlim(-1,len(avgs))
plt.xticks(idx,x, rotation = 90)
plt.show()

plt.hist(avgs, bins = np.arange(0.46,0.59, 0.01))
plt.xlabel("avgs")
plt.ylabel("Frequency")
plt.show()

z = (avgs-np.mean(avgs))/np.std(avgs)
stats.probplot(z, dist="norm", plot=plt)
plt.title("Normal Q-Q plot")
plt.show()

print "Mean:",np.mean(avgs)
print "Standard deviation:", np.std(avgs)
print "Variance^(1/2):", np.std(avgs)/np.sqrt(len(avgs))
a = np.mean(avgs)+np.array([-2,2])*np.std(avgs)/np.sqrt(len(avgs))
print "95% Confidence Interval:",a

keep = [i for i in range(len(avgs)) if avgs[i]>0.48]

df2 = new_df.iloc[keep]
idx = df2.groupby(['Pollster']).count()["diff"].values
idx = np.array([0]+list(np.cumsum(idx))[:-1])
fig, ax = plt.subplots(1, 1)
for i in range(len(avgs)):
    if i in keep:
        ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i], 
              color = cols[new_df["Pollster"].values[i]])
        ax.scatter(i, avgs[i], zorder = 2)
ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1)
plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01)
plt.xlim(-1,len(avgs))
plt.xticks(idx,x, rotation = 90)
plt.show()

new_df = df[df["day"]>-100]
leftover = [re.sub(r"-", "0", s) for s in new_df["Undecided"].values]
leftover = np.array([int(s) for s in leftover])
Obama = np.array([float(a) for a in new_df["Obama"].values])
avgs = Obama/100+leftover/100/2

plt.scatter(new_df["day"],avgs)
plt.xlim(-100,0)
plt.xlabel("Day")
plt.ylabel("Avgs")
plt.show()

new_df["avg"] = avgs
new_df.boxplot(column = "avg", by = "week")
plt.show()

from statsmodels.formula.api import ols
new_df = new_df.copy()
new_df['week_factor'] = [str(int(w)) for w in new_df['week'].values]
lm1 = ols('diff ~ week_factor', new_df).fit()
new_df['week_resid'] = lm1.resid
new_df['pollster_mean_resid'] = new_df.groupby("Pollster")["week_resid"].transform(lambda x: x.mean())
new_df.sort("pollster_mean_resid", ascending=1, inplace=True)
new_df.boxplot(column="week_resid", by="pollster_mean_resid", rot=90)
pollster_labels = new_df.groupby("pollster_mean_resid")["Pollster"].agg(lambda x: x.iloc[0])
plt.xticks(np.arange(len(pollster_labels))+1, pollster_labels.values)
plt.show()

lm2 = ols('diff ~ Pollster', new_df).fit()
new_df['pollster_resid'] = lm2.resid
new_df['week_mean_resid'] = new_df.groupby("week")["pollster_resid"].transform(lambda x: x.mean())
new_df.sort("week_mean_resid", ascending=1, inplace=True)
new_df.boxplot(column="pollster_resid", by="week", rot=90)

plt.show()

lm3 = ols('diff ~ week_factor + Pollster', new_df).fit()
from statsmodels.stats.anova import anova_lm

print anova_lm(lm3)

plt.scatter(new_df["day"],new_df["Obama"], color='blue', marker='o', facecolors='none')
plt.scatter(new_df["day"],new_df["McCain"], color='red', marker='v', facecolors='none')
plt.xlim(-100,0)
plt.xlabel("Day")
plt.ylabel("Avgs")

from statsmodels.nonparametric.api import lowess
#Smooth using 1/10 of the total data surrounding a point for the local fit.
smoothed = lowess(new_df['Obama'], new_df['day'], frac=0.1)
plt.plot(smoothed[:,0], smoothed[:,1], color='blue')

smoothed = lowess(new_df['McCain'], new_df['day'], frac=0.1)
plt.plot(smoothed[:,0], smoothed[:,1], color="red")
plt.show()
#plt.plot(lowess(new_df['diff'], new_df['week']))