# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import numpy as np
import scipy
import pandas as pd # pandas
import matplotlib.pyplot as plt # module for plotting 
from mpl_toolkits.mplot3d import Axes3D #3D plotting
import datetime as dt # module for manipulating dates and times

import requests 
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import binom
from __future__ import division
import re
from StringIO import StringIO
from zipfile import ZipFile 
from pandas import read_csv
from urllib import urlopen
import urllib2
import json

import sklearn
import sklearn.preprocessing
import sklearn.datasets

#nice defaults for matplotlib
from matplotlib import rcParams

dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content
zip_files = StringIO()
zip_files.write(zip_folder)
csv_files = ZipFile(zip_files)

players = csv_files.open('Batting.csv')
players = read_csv(players)
players.head()

dat = players[(players.yearID >= 2010) & (players.yearID <= 2012)]
dat['AVG'] = dat['H']/dat['AB']
dat = dat[dat['AB']>500]

df = dat[['yearID', 'AVG']].dropna()
rcParams['figure.figsize'] = (14, 4)
plt.figure(1)
for i in range(1,4):
    plt.subplot(1,3,i)
    plt.hist(df[df.yearID == (2009+i)].AVG.values*1000, bins = np.arange(200,370,20), normed = True)
    plt.title(str(2009+i))
    plt.xlabel("AVG")
    if i==1:
        plt.ylabel("Frequency")
plt.show()

stream = urllib2.urlopen('http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate')
data = json.load(stream)   

states = [str(poll['state']) for poll in data]

### use google to figure out why 2 candidate for 3 states
## you will have to do this again each time database updated!

import collections
counter=collections.Counter(states)
twopollindex = [key for key in counter.keys() if counter[key]>1]
twopollindex

###NH: there are polls with Bass but he withdrew
for poll in data:
    if str(poll['state']) in twopollindex:
        print poll['title']

## conclusiions after googling
## In NH Bass withdrew take out 6
## In Oklahoma and SC there are actually 2. Both special elections
data.pop(6)

polls = [read_csv(urlopen(poll['url']+'.csv')) for poll in data]

states = [str(poll['state']) for poll in data]

#make a table of candidates
R = np.zeros(len(polls))
R.fill(np.nan)
D = np.zeros(len(polls))
D.fill(np.nan)
incumbent = np.zeros(len(polls))
incumbent.fill(np.nan)
d = {'states':states, 'R':R, 'D':D, 'incumbent':incumbent}
candidates = pd.DataFrame(data = d)
candidates = candidates[['states','R','D','incumbent']]

for i in range(len(data)):
    x = data[i]['estimates'][0:2]
    if not x[0]['last_name']:
        tmp = data[i]['url'].split('-')+['vs']
        j = tmp.index('vs')
        if j!=len(tmp)-1:
            candidates.R[i] = tmp[j-1].capitalize()
            candidates.D[i] = tmp[j+1].capitalize()
            candidates.incumbent[i] = np.nan
            #if no data means race is decided
    else:
        tmp = [x[0]['party'],x[1]['party']]
        candidates.R[i] = x[tmp.index('Rep')]['last_name']
        idx = [k for k in range(len(tmp)) if tmp[k]!='Rep'][0]
        candidates['D'][i] = x[idx]['last_name']
        tmp2 = [x[0]['incumbent'],x[1]['incumbent']]
        tmp2+=[True]
        if tmp2.index(True)!=2:
            candidates.incumbent[i] = tmp[tmp2.index(True)]
        
#remove second last name 
candidates.R = [candidate.split(' ')[-1] for candidate in candidates.R]

candidates.head(5)

diff = []
for i in range(len(polls)):
    tmp = polls[i]
    diff.append(tmp[candidates.R[i]].values-tmp[candidates.D[i]].values)

means = [np.mean(a) for a in diff]

ses = [np.std(a)/np.sqrt(len(a)) for a in diff]

ord_means, o = (list(x) for x in zip(*sorted(zip(means, range(len(means))))))

States = states[:]
for i in range(len(States)):
    if States[i] in States[:i]:
        States[i] +='2'

difference = [means[i] for i in o]
se = [ses[i] for i in o]
States = [States[i] for i in o]
d = {'se':se, 'states':States, 'difference':difference}
df = pd.DataFrame(data=d)

rcParams['figure.figsize'] = (12, 6)
plt.errorbar(range(len(States)), difference, yerr = 2*np.array(se), linestyle="None", zorder = 0)
plt.scatter(range(len(States)), difference, zorder = 1)
plt.hlines(0, -0.5, 35.5, alpha = 0.5, linewidth=1, zorder = 0)
plt.xlim(-0.5,35.5)
plt.ylabel('difference')
plt.xticks(range(len(States)), States, rotation = 50)
plt.show()

ord_diff = [diff[i] for i in o]

plt.boxplot(ord_diff)
plt.xticks(np.arange(1,len(States)+1,1), States, rotation = 50)
plt.show()

def plot_pollsters(state):
    idx = states.index(state)
    pollsters = list(polls[idx]['Pollster'].values)
    unique_polls = [pollsters[i] for i in range(len(pollsters)) if pollsters[i] not in pollsters[:i]]
    xvals = [unique_polls.index(poll) for poll in pollsters]
    
    plt.scatter(xvals,diff[idx], s=50)
    plt.xticks(range(max(xvals)+1),unique_polls, rotation = 90)
    plt.hlines(0, -0.1, max(xvals), alpha = 0.7, linewidth=1, zorder = 0, linestyles = '--')
    plt.xlim(-0.2, max(xvals)+0.2)
    plt.show()
    
plot_pollsters('KS')

plot_pollsters('NH')

plot_pollsters('NC')

i = states.index('NC')
pollster = polls[i]['Pollster'].values
day = [pd.to_datetime(x) for x in polls[i]['Start Date'].values]
dif = polls[i][candidates.R[i]] - polls[i][candidates.D[i]]
m = np.mean(dif)
s = np.std(dif)/np.sqrt(len(dif))
print [round(m-2*s,1),round(m+2*s,1)]

'''
ind <- which(day>"2014-08-01")
m <- mean(dif[ind]) 
s <- sd(dif[ind])/sqrt(length(ind))
cat("[",round(m-2*s,1),round(m+2*s,1),"]")
'''
day0 = pd.to_datetime("2014-08-01")
ind = [i for i in range(len(day)) if day[i]>day0]
m = np.mean(dif[ind])
s = np.std(dif[ind])/np.sqrt(len(ind))
print [round(m-2*s,1),round(m+2*s,1)]