# Exploratory analysis script (originally a notebook):
#   1. downloads the Lahman baseball database and plots batting-average
#      histograms for 2010-2012 (players with more than 500 at-bats);
#   2. downloads the HuffPost Pollster 2014 Senate charts, builds a table of
#      candidates per race and plots the Republican-minus-Democrat poll margins.
#
# The code runs on both Python 2.7 and Python 3.

# __future__ imports must be the first statement of a module.
from __future__ import division, print_function

# special IPython command to prepare the notebook for matplotlib
# (a bare ``%matplotlib inline`` is only valid inside IPython, so call the
# magic through the API and skip it when run by a plain interpreter)
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

import collections
import datetime as dt  # module for manipulating dates and times
import json
import re
from io import BytesIO
from zipfile import ZipFile

try:  # Python 2 module layout
    from StringIO import StringIO
    from urllib import urlopen
    import urllib2
except ImportError:  # Python 3: same names bound from their new locations
    from io import StringIO
    from urllib.request import urlopen
    import urllib.request as urllib2

import numpy as np
import scipy
import scipy.stats as stats
from scipy.stats import binom
import pandas as pd  # pandas
from pandas import read_csv
import matplotlib.pyplot as plt  # module for plotting
from matplotlib import __version__ as mpl_version
from matplotlib import rcParams
from mpl_toolkits.mplot3d import Axes3D  # 3D plotting
import requests
import sklearn
import sklearn.datasets
import sklearn.preprocessing
import statsmodels.api as sm

# nice defaults for matplotlib
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was replaced by 'axes.prop_cycle'; use whichever key
# the installed matplotlib understands.
if 'axes.prop_cycle' in rcParams:
    from matplotlib import cycler
    rcParams['axes.prop_cycle'] = cycler('color', dark2_colors)
else:
    rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

# plt.hist's ``normed`` keyword was superseded by ``density`` in matplotlib 2.1
# and later removed; choose the keyword that matches the installed version.
_mpl_major_minor = tuple(int(part) for part in re.findall(r'\d+', mpl_version)[:2])
HIST_NORM_KWARGS = {'density': True} if _mpl_major_minor >= (2, 1) else {'normed': True}

# ---------------------------------------------------------------------------
# Batting averages
# ---------------------------------------------------------------------------
zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content
# The download is raw bytes, so it needs a binary buffer (a text StringIO
# rejects bytes on Python 3).
zip_files = BytesIO(zip_folder)
csv_files = ZipFile(zip_files)
players = read_csv(csv_files.open('Batting.csv'))
players.head()

# .copy() so that adding the AVG column writes to an independent frame rather
# than to a slice of ``players`` (avoids SettingWithCopy problems).
dat = players[(players.yearID >= 2010) & (players.yearID <= 2012)].copy()
dat['AVG'] = dat['H'] / dat['AB']
dat = dat[dat['AB'] > 500]
df = dat[['yearID', 'AVG']].dropna()

rcParams['figure.figsize'] = (14, 4)
plt.figure(1)
for panel, season in enumerate((2010, 2011, 2012), start=1):
    plt.subplot(1, 3, panel)
    # averages are scaled by 1000 so the bins read as 200, 220, ..., 360
    season_avg = df[df.yearID == season].AVG.values * 1000
    plt.hist(season_avg, bins=np.arange(200, 370, 20), **HIST_NORM_KWARGS)
    plt.title(str(season))
    plt.xlabel("AVG")
    if panel == 1:
        plt.ylabel("Frequency")
plt.show()

# ---------------------------------------------------------------------------
# 2014 Senate polls
# ---------------------------------------------------------------------------
stream = urllib2.urlopen('http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate')
# Decode explicitly: on Python 3 the response body is bytes.
data = json.loads(stream.read().decode('utf-8'))
states = [str(poll['state']) for poll in data]

### use google to figure out why 2 candidates for 3 states
## you will have to do this again each time the database is updated!
counter = collections.Counter(states)
twopollindex = [state for state, count in counter.items() if count > 1]
twopollindex

### NH: there are polls with Bass but he withdrew
for poll in data:
    if str(poll['state']) in twopollindex:
        print(poll['title'])

## conclusions after googling
## In NH Bass withdrew: take out entry 6
## In Oklahoma and SC there are actually 2. Both special elections
# NOTE(review): index 6 is tied to the chart order returned by the API at the
# time of writing -- re-check it whenever the data changes.
data.pop(6)

polls = [read_csv(urlopen(poll['url'] + '.csv')) for poll in data]
states = [str(poll['state']) for poll in data]

# make a table of candidates
R = np.full(len(polls), np.nan)
D = np.full(len(polls), np.nan)
incumbent = np.full(len(polls), np.nan)
d = {'states': states, 'R': R, 'D': D, 'incumbent': incumbent}
candidates = pd.DataFrame(data=d, columns=['states', 'R', 'D', 'incumbent'])
# These columns will hold names / party labels (strings), so store them as
# objects instead of float64 NaN placeholders.
for column in ('R', 'D', 'incumbent'):
    candidates[column] = candidates[column].astype(object)

# Cells are written with .loc[row, column]; chained indexing such as
# ``candidates.R[i] = ...`` may write to a temporary copy.
for i, chart in enumerate(data):
    x = chart['estimates'][0:2]
    if not x[0]['last_name']:
        # No named estimate: take the names from the URL slug. The token
        # before 'vs' is used as the Republican, the one after as the Democrat.
        # The appended 'vs' is a sentinel so .index() always succeeds.
        tmp = chart['url'].split('-') + ['vs']
        j = tmp.index('vs')
        if j != len(tmp) - 1:
            candidates.loc[i, 'R'] = tmp[j - 1].capitalize()
            candidates.loc[i, 'D'] = tmp[j + 1].capitalize()
        candidates.loc[i, 'incumbent'] = np.nan  # if no data means race is decided
    else:
        parties = [x[0]['party'], x[1]['party']]
        candidates.loc[i, 'R'] = x[parties.index('Rep')]['last_name']
        # first estimate that is not the Republican
        idx = [k for k, party in enumerate(parties) if party != 'Rep'][0]
        candidates.loc[i, 'D'] = x[idx]['last_name']
        incumbency = [x[0]['incumbent'], x[1]['incumbent']]
        if True in incumbency:
            # record the *party* of the incumbent
            candidates.loc[i, 'incumbent'] = parties[incumbency.index(True)]

# remove second last name
candidates['R'] = [candidate.split(' ')[-1] for candidate in candidates['R']]
candidates.head(5)

# Republican minus Democrat margin for every poll of every race
diff = [
    poll_table[rep].values - poll_table[dem].values
    for poll_table, rep, dem in zip(polls, candidates['R'], candidates['D'])
]

means = [np.mean(a) for a in diff]
ses = [np.std(a) / np.sqrt(len(a)) for a in diff]
# o: race indices ordered by mean margin (ties broken by original position)
o = sorted(range(len(means)), key=lambda k: (means[k], k))
ord_means = [means[k] for k in o]

# make state labels unique: the second race in a state gets a '2' suffix
States = states[:]
for i in range(len(States)):
    if States[i] in States[:i]:
        States[i] += '2'

difference = [means[k] for k in o]
se = [ses[k] for k in o]
States = [States[k] for k in o]
d = {'se': se, 'states': States, 'difference': difference}
df = pd.DataFrame(data=d)

rcParams['figure.figsize'] = (12, 6)
x_positions = list(range(len(States)))
# right-hand axis limit follows the number of races instead of assuming 36
right_edge = len(States) - 0.5
plt.errorbar(x_positions, difference, yerr=2 * np.array(se), linestyle="None", zorder=0)
plt.scatter(x_positions, difference, zorder=1)
plt.hlines(0, -0.5, right_edge, alpha=0.5, linewidth=1, zorder=0)
plt.xlim(-0.5, right_edge)
plt.ylabel('difference')
plt.xticks(x_positions, States, rotation=50)
plt.show()

ord_diff = [diff[k] for k in o]
plt.boxplot(ord_diff)
plt.xticks(np.arange(1, len(States) + 1, 1), States, rotation=50)
plt.show()


def plot_pollsters(state):
    """Scatter-plot the poll margins of one race, grouped by pollster.

    Reads the module-level ``states``, ``polls`` and ``diff``. The first race
    whose state code equals *state* is used; each distinct pollster gets its
    own x position in order of first appearance.

    Raises ValueError if *state* is not in ``states`` (or the race has no
    polls).
    """
    idx = states.index(state)
    pollsters = list(polls[idx]['Pollster'].values)
    # order-preserving de-duplication: pollster -> x position
    position = {}
    for name in pollsters:
        position.setdefault(name, len(position))
    unique_polls = sorted(position, key=position.get)
    xvals = [position[name] for name in pollsters]
    last = max(xvals)
    plt.scatter(xvals, diff[idx], s=50)
    plt.xticks(range(last + 1), unique_polls, rotation=90)
    plt.hlines(0, -0.1, last, alpha=0.7, linewidth=1, zorder=0, linestyles='--')
    plt.xlim(-0.2, last + 0.2)
    plt.show()


plot_pollsters('KS')
plot_pollsters('NH')
plot_pollsters('NC')

# North Carolina: interval mean +/- 2 standard errors over all polls
i = states.index('NC')
pollster = polls[i]['Pollster'].values
day = [pd.to_datetime(x) for x in polls[i]['Start Date'].values]
dif = polls[i][candidates.loc[i, 'R']] - polls[i][candidates.loc[i, 'D']]
m = np.mean(dif)
s = np.std(dif) / np.sqrt(len(dif))
print([round(m - 2 * s, 1), round(m + 2 * s, 1)])

# Equivalent R code for the restricted interval below:
#   ind <- which(day>"2014-08-01")
#   m <- mean(dif[ind])
#   s <- sd(dif[ind])/sqrt(length(ind))
#   cat("[",round(m-2*s,1),round(m+2*s,1),"]")
day0 = pd.to_datetime("2014-08-01")
# positions of polls started after day0 (a separate loop name keeps ``i``
# pointing at the NC race on Python 2, where comprehension variables leak)
ind = [k for k, start in enumerate(day) if start > day0]
recent = dif.iloc[ind]  # ``ind`` holds positions, so index positionally
m = np.mean(recent)
s = np.std(recent) / np.sqrt(len(ind))
print([round(m - 2 * s, 1), round(m + 2 * s, 1)])