In this project, my goal is to predict the 2012 US Presidential election results based on multiple polls. I used online data for the polls and for Electoral College votes. As long as the links do not change, this code should work on any machine. This project is based on an assignment from Harvard's data science course, although I wrote my own functions and made my own interpretations most of the time.
%matplotlib inline
import pandas as pd
import requests
from pattern import web
import numpy as np
from fnmatch import fnmatch
from __future__ import division
import matplotlib.pyplot as plt
from collections import defaultdict
import json
#this mapping between states and abbreviations will come in handy later
#(it converts the two-letter codes used in the poll tables into the full
# names that the electoral-vote table and the map geometry are keyed by;
# note it also contains territories and an 'NA' -> 'National' pseudo-entry,
# which are skipped when drawing the map)
states_abbrev = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}
from matplotlib import rcParams
import matplotlib.cm as cm
import matplotlib as mpl
#colorbrewer2 Dark2 qualitative color table
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843)]
# global plot styling for the whole notebook
rcParams['figure.figsize'] = (20, 12)
rcParams['figure.dpi'] = 150
# NOTE(review): 'axes.color_cycle' was deprecated in matplotlib 1.5 and
# removed in 2.0 ('axes.prop_cycle' is the replacement); this line assumes
# an old matplotlib -- confirm the target version before upgrading.
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 18
rcParams['patch.edgecolor'] = 'white'
rcParams['patch.facecolor'] = dark2_colors[0]
rcParams['font.family'] = 'StixGeneral'
def remove_border(axes=None, top=False, right=False, left=True, bottom=True):
    """Strip chart junk from a plot: hide unwanted spines and axis ticks.

    Parameters
    ----------
    axes : matplotlib Axes, optional
        Axes to clean up; defaults to the current axes.
    top, right, left, bottom : bool
        Whether the corresponding plot border (and its ticks) stays visible.
    """
    ax = axes or plt.gca()
    # show/hide each spine according to its keyword
    for side, visible in (('top', top), ('right', right),
                          ('left', left), ('bottom', bottom)):
        ax.spines[side].set_visible(visible)
    # wipe every tick first, then re-enable only the requested sides
    ax.yaxis.set_ticks_position('none')
    ax.xaxis.set_ticks_position('none')
    if top:
        ax.xaxis.tick_top()
    if bottom:
        ax.xaxis.tick_bottom()
    if left:
        ax.yaxis.tick_left()
    if right:
        ax.yaxis.tick_right()
# widen pandas console output so the wide poll tables print without wrapping
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
"""
Function
--------
plot_simulation
Inputs
------
simulation: Numpy array with n_sim (see simulate_election) elements
Each element stores the number of electoral college votes Obama wins in each simulation.
Returns
-------
Nothing
"""
def plot_simulation(result):
p05 = np.percentile(result, 5.)
p95 = np.percentile(result, 95.)
diff_quantiles = int(p95 - p05)
probability_win = (result >= 269).mean() * 100
plt.title("Chance of Obama's Victory: %.2f, Spread: %d votes" % (probability_win, diff_quantiles))
plt.hist(result, normed= True, bins = np.arange(200, 450, 1), label = 'Simulations')
plt.axvline(269, 0, 0.5, color = 'k', label = 'Victory Threshold')
plt.axvline(332, 0, 0.5, color = 'r', label = 'Actual Outcome')
plt.legend(frameon = False, loc = 'upper left')
plt.xlabel('Obama Electoral College Votes')
plt.ylabel('Probability')
remove_border()
# fetch the RealClearPolitics electoral-college map page (network required)
url = 'http://www.realclearpolitics.com/epolls/2012/president/2012_elections_electoral_college_map.html'
html = requests.get(url).text
def find_governor_races(html):
    """Extract state-level Obama-vs-Romney poll page URLs from the RCP
    electoral college map page.

    Parameters
    ----------
    html : str
        Raw HTML of the RealClearPolitics 2012 electoral-college map page.

    Returns
    -------
    list of str
        Absolute URLs of per-state poll pages, deduplicated and sorted
        (the original returned arbitrary set order; sorting makes the
        downstream scrape order deterministic).
    """
    dom = web.Element(html)
    links = set()
    for anchor in dom.by_tag('a'):
        if 'href' not in anchor.attributes:
            continue
        href = anchor.attributes['href']
        # state pages look like /epolls/2012/president/<st>/..._romney_vs_obama-<id>.html
        if fnmatch(href, '/epolls/2012/president/??/*_romney_vs_obama-*.html'):
            links.add('http://www.realclearpolitics.com' + href)
    # the original also filled a my_states list that was never used; dropped
    return sorted(links)
#url = 'http://www.realclearpolitics.com/epolls/2012/president/ca/california_romney_vs_obama-2009.html#polls'
def get_2012_polls_per_state(url):
    """Scrape one state's Obama-vs-Romney polling table from RealClearPolitics.

    Parameters
    ----------
    url : str
        A state poll page URL, e.g.
        http://www.realclearpolitics.com/epolls/2012/president/ca/california_romney_vs_obama-2009.html

    Returns
    -------
    DataFrame with one row per poll and columns:
        Pollster, Sample, MoE, 'Obama (D)', 'Romney (R)', obama_spread,
        State (two-letter abbreviation), start_date, end_date
    """
    state = url.split('/')[6].upper()
    xml = requests.get(url).text
    dom = web.Element(xml)
    table = dom.by_tag('div#polling-data-full')
    rows = table[0].by_tag('tr')[3:]  # the first rows are headers/summaries
    pollster = [row.by_tag('a')[0].content for row in rows]
    date_ranges = [row.by_tag('td')[1].content for row in rows]
    start_date_orig = [d.split(' - ')[0] for d in date_ranges]
    end_date_orig = [d.split(' - ')[1] for d in date_ranges]
    # The table lists polls newest-first and the dates carry no year.
    # Within a year the month decreases going down the list, so the first
    # *increase* marks the rollover from 2012 back into 2011.
    months = [int(d.split('/')[0]) for d in end_date_orig]
    # yyyy0000 offsets; int64 instead of the original int8 (20120000 only
    # worked there via numpy's implicit upcast)
    year = 20120000 * np.ones(len(months), dtype='int64')
    rollover = np.where(np.diff(months) > 0)[0]
    if len(rollover) > 0:
        # BUGFIX: the original wrote year[i+1, :], an IndexError on a 1-D
        # array that a bare `except: pass` silently swallowed -- so the
        # 2011 correction never actually applied.
        year[rollover[0] + 1:] = 20110000
    def extract_time(date_strings):
        # 'm/d' strings + yyyy0000 offsets -> yyyymmdd ints -> datetimes
        month = np.array([int(d.split('/')[0]) * 100 for d in date_strings])
        day = np.array([int(d.split('/')[1]) for d in date_strings])
        return pd.to_datetime(list(year + month + day), format='%Y%m%d')
    def sample_LV_issue(x):
        # sample cells usually look like "1234 LV"; occasionally the tag
        # comes first ("LV 1234"), or there is no size at all ("LV"/"RV")
        if x != 'LV' and x != 'RV':
            try:
                return int(x.split(' ')[0])
            except (ValueError, IndexError):
                return int(x.split(' ')[1])
        return np.nan
    def moe_to_float(x):
        # margin-of-error cells may be '--' or otherwise non-numeric
        try:
            return float(x)
        except ValueError:
            return np.nan
    sample = [sample_LV_issue(row.by_tag('td')[2].content) for row in rows]
    moe = [moe_to_float(row.by_tag('td')[3].content) for row in rows]
    obama = [float(row.by_tag('td')[4].content) for row in rows]
    romney = [float(row.by_tag('td')[5].content) for row in rows]
    result = pd.DataFrame(dict(Pollster=pollster, Sample=sample, MoE=moe))
    result['Obama (D)'] = obama
    result['Romney (R)'] = romney
    result['obama_spread'] = list(np.array(obama) - np.array(romney))
    result['State'] = state
    result['start_date'] = extract_time(start_date_orig)
    result['end_date'] = extract_time(end_date_orig)
    return result
#adapted from https://github.com/dataiap/dataiap/blob/master/resources/util/map_util.py
#load in state geometry: full state name -> list of boundary polygons
state2poly = defaultdict(list)
# BUGFIX: file() is a Python-2-only builtin and the handle was never
# closed; open() in a `with` block works on both 2 and 3 and releases it.
with open("data_hmw2/us-states.json") as geo_file:
    data = json.load(geo_file)
for f in data['features']:
    state = states_abbrev[f['id']]
    geo = f['geometry']
    if geo['type'] == 'Polygon':
        # a Polygon is a list of rings; keep each ring separately
        for coords in geo['coordinates']:
            state2poly[state].append(coords)
    elif geo['type'] == 'MultiPolygon':
        # a MultiPolygon is a list of Polygons; flatten their rings
        for polygon in geo['coordinates']:
            state2poly[state].extend(polygon)
def draw_state(plot, stateid, **kwargs):
    """Draw a filled shape representing one state on the given subplot.

    Parameters
    ----------
    plot : matplotlib Axes
        Axes to draw into.
    stateid : str
        Full state name; must be a key of the module-level state2poly map.
    **kwargs
        Forwarded to plot.fill(); use `color=...` to set the fill color.
    """
    # a state may consist of several disjoint polygons (islands etc.)
    for boundary in state2poly[stateid]:
        x_coords, y_coords = zip(*boundary)
        plot.fill(x_coords, y_coords, **kwargs)
def make_map(states, label):
    """Draw a choropleth map of the United States.

    Parameters
    ----------
    states : Series (a DataFrame column), indexed by full state name
        The value for each state, to display on the map.
    label : str
        Label of the color bar.

    Returns
    -------
    The map's Axes object.
    """
    fig = plt.figure(figsize=(12, 9))
    ax = plt.gca()
    if states.max() < 2:  # colormap for election probabilities
        cmap = cm.RdBu
        vmin, vmax = 0, 1
    else:  # colormap for electoral votes
        cmap = cm.binary
        vmin, vmax = 0, states.max()
    norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
    # entries with no drawable geometry (territories, DC, 'National')
    skip = set(['National', 'District of Columbia', 'Guam', 'Puerto Rico',
                'Virgin Islands', 'American Samoa', 'Northern Mariana Islands'])
    for state in states_abbrev.values():
        if state in skip:
            continue
        # FIX: .ix is deprecated (removed in pandas 1.0); .loc is the
        # equivalent label-based lookup for a name-indexed Series
        color = cmap(norm(states.loc[state]))
        draw_state(ax, state, color=color, ec='k')
    # add an inset colorbar
    ax1 = fig.add_axes([0.45, 0.70, 0.4, 0.02])
    cb1 = mpl.colorbar.ColorbarBase(ax1, cmap=cmap,
                                    norm=norm,
                                    orientation='horizontal')
    ax1.set_title(label)
    remove_border(ax, left=False, bottom=False)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlim(-180, -60)
    ax.set_ylim(15, 75)
    return ax
# scrape every state's poll table and stack them into a single frame
links = find_governor_races(html)
results = []
for url in links:
    results.append(get_2012_polls_per_state(url))
# ignore_index=True replaces the original reset_index()/drop('index') dance
merged = pd.concat(results, ignore_index=True)
merged.head()
MoE | Pollster | Sample | Obama (D) | Romney (R) | obama_spread | State | start_date | end_date | |
---|---|---|---|---|---|---|---|---|---|
0 | 3.0 | WeAskAmerica | 1198 | 57 | 41 | 16 | IL | 2012-10-30 | 2012-10-30 |
1 | 3.7 | Chicago Tribune | 700 | 55 | 36 | 19 | IL | 2012-10-04 | 2012-10-08 |
2 | 2.8 | The Simon Poll/SIU | 1261 | 47 | 34 | 13 | IL | 2012-09-04 | 2012-09-10 |
3 | 2.8 | WeAskAmerica | 1382 | 54 | 37 | 17 | IL | 2012-09-05 | 2012-09-05 |
4 | 4.0 | Chicago Tribune | 600 | 56 | 35 | 21 | IL | 2012-02-02 | 2012-02-06 |
# freeze the analysis at an Oct 2, 2012 snapshot; poll ages below are
# measured relative to this date
today =pd.to_datetime('20121002',format='%Y%m%d')
def clean_aggregated_polls(multipoll, today):
    """Clean the merged poll frame for analysis as of a reference date.

    Parameters
    ----------
    multipoll : DataFrame
        Merged polls with State (two-letter), start_date, end_date columns.
        Not modified (the original mutated the caller's frame in place).
    today : Timestamp
        Reference date; polls whose midpoint falls on/after it are dropped.

    Returns
    -------
    DataFrame with State converted to full names, a poll_date midpoint
    column, an age_days column, start/end dates dropped, and rows with
    missing data removed.
    """
    # work on a copy so the caller's frame is left untouched; also avoids
    # the chained inplace=True replace, which breaks under pandas
    # copy-on-write semantics
    multipoll = multipoll.copy()
    #convert state abbreviation to full name
    multipoll['State'] = multipoll['State'].replace(states_abbrev)
    #convert dates from strings to date objects, and compute midpoint
    multipoll['poll_date'] = multipoll.start_date + (multipoll.end_date - multipoll.start_date).values / 2
    #compute the poll age relative to `today`, in days
    multipoll['age_days'] = (today - multipoll['poll_date']).values / np.timedelta64(1, 'D')
    #drop any rows with data from on/after the reference date
    multipoll = multipoll[multipoll.age_days > 0]
    #drop unneeded columns
    multipoll = multipoll.drop(['start_date', 'end_date'], axis=1)
    #drop rows with missing data
    return multipoll.dropna()
# drop post-Oct-2 polls, convert abbreviations, and compute poll ages
merged_clean = clean_aggregated_polls(merged, today)
merged_clean.head()
MoE | Pollster | Sample | Obama (D) | Romney (R) | obama_spread | State | poll_date | age_days | |
---|---|---|---|---|---|---|---|---|---|
2 | 2.8 | The Simon Poll/SIU | 1261 | 47 | 34 | 13 | Illinois | 2012-09-07 | 25 |
3 | 2.8 | WeAskAmerica | 1382 | 54 | 37 | 17 | Illinois | 2012-09-05 | 27 |
4 | 4.0 | Chicago Tribune | 600 | 56 | 35 | 21 | Illinois | 2012-02-04 | 241 |
6 | 2.3 | FOX Chicago/WAA | 1815 | 50 | 35 | 15 | Illinois | 2012-09-28 | 4 |
16 | 2.4 | Quinnipiac | 1696 | 54 | 42 | 12 | Connecticut | 2012-09-30 | 2 |
The U.S. presidential election uses a winner-take-all system. A certain number of Electoral College votes is assigned to each state based on its population. In most states, all of the state's Electoral College votes are awarded to the presidential candidate who receives the majority of the votes in that state. The Electoral College consists of 538 electors, and a majority of 270 votes is required to elect the President outright (a 269-269 split is a tie).
I will use these values to predict the results of 2012 Presidential Elections. The data can be downloaded from this link using the script below.
# scrape the per-state electoral vote allocation from the National Archives
url = 'http://www.archives.gov/federal-register/electoral-college/allocation.html'
html = requests.get(url).text
dom = web.Element(html)
# BUGFIX: the original assigned this to `table` but then indexed `my_table`
# on the next two lines, raising a NameError; the names now agree
my_table = dom.by_tag('tbody')
# rows 7-57 of the page hold the 50 states plus DC
states = [my_table[idx].by_tag('td')[0].content for idx in range(7, 58)]
votes = [int(my_table[idx].by_tag('td')[1].content) for idx in range(7, 58)]
electoral_votes = pd.DataFrame(dict(Votes = votes, State = states))
electoral_votes = electoral_votes.set_index('State')
electoral_votes.head()
Votes | |
---|---|
State | |
Alabama | 9 |
Alaska | 3 |
Arizona | 11 |
Arkansas | 6 |
California | 55 |
Now, let's merge the polls data with electoral votes.
# attach each state's electoral-vote count to its polls
merged_clean = merged_clean.join(electoral_votes, on='State')
merged_clean.head(3)
MoE | Pollster | Sample | Obama (D) | Romney (R) | obama_spread | State | poll_date | age_days | Votes | |
---|---|---|---|---|---|---|---|---|---|---|
2 | 2.8 | The Simon Poll/SIU | 1261 | 47 | 34 | 13 | Illinois | 2012-09-07 | 25 | 20 |
3 | 2.8 | WeAskAmerica | 1382 | 54 | 37 | 17 | Illinois | 2012-09-05 | 27 | 20 |
4 | 4.0 | Chicago Tribune | 600 | 56 | 35 | 21 | Illinois | 2012-02-04 | 241 | 20 |
Since the "importance" of each state is different, we group the data by state. The main feature that might be informative is "obama_spread", but of course there might be some bias and uncertainty. The mean value across polls will reduce the bias of each poll, and the standard deviation will give us an idea about the uncertainty of the polls. The function given below computes the mean and standard deviation across polls.
"""
Function
--------
state_average
Inputs
------
multipoll : DataFrame
The multipoll data above
Returns
-------
averages : DataFrame
A dataframe, indexed by State, with the following columns:
N: Number of polls averaged together
poll_mean: The average value for obama_spread for all polls in this state
poll_std: The standard deviation of obama_spread
Notes
-----
For states where poll_std isn't finite (because N is too small), estimate the
poll_std value as .05 * poll_mean
"""
#your code here
def state_average(multipoll):
N = multipoll.groupby('State')['obama_spread'].count()
poll_mean = multipoll.groupby('State')['obama_spread'].mean()
poll_std = multipoll.groupby('State')['obama_spread'].std().fillna(poll_mean*0.05)
df = pd.DataFrame(dict(N=N,poll_mean = poll_mean, poll_std = poll_std ))
return df
Let's call the function on the merged_clean data frame and join the result with the electoral_votes frame.
# per-state poll summaries, outer-joined so unpolled states keep their Votes
avg = state_average(merged_clean).join(electoral_votes, how='outer')
avg.head()
N | poll_mean | poll_std | Votes | |
---|---|---|---|---|
State | ||||
Alabama | NaN | NaN | NaN | 9 |
Alaska | NaN | NaN | NaN | 3 |
Arizona | 18 | -6.000000 | 4.043877 | 11 |
Arkansas | 3 | -20.333333 | 4.041452 | 6 |
California | 15 | 20.066667 | 5.444088 | 55 |
As you can notice, there are some missing values in this table. The reason is that these states are so strongly Democratic or Republican that pollsters rarely surveyed them. The following code assigns strongly Democratic/Republican values to these states.
def default_missing(results):
    """Fill in poll summaries for safe states that were barely polled.

    These states are so reliably red or blue that pollsters skipped them;
    give them a lopsided mean and a tiny std so the model treats them as
    (near-)certain. Mutates *results* in place; returns nothing.

    Parameters
    ----------
    results : DataFrame indexed by full state name, with poll_mean and
        poll_std columns; all listed states must be present in the index.
    """
    red_states = ["Alabama", "Alaska", "Arkansas", "Idaho", "Wyoming"]
    blue_states = ["Delaware", "District of Columbia", "Hawaii"]
    # FIX: .ix is deprecated (removed in pandas 1.0); .loc performs the
    # same label-based assignment
    results.loc[red_states, "poll_mean"] = -100.0
    results.loc[red_states, "poll_std"] = 0.1
    results.loc[blue_states, "poll_mean"] = 100.0
    results.loc[blue_states, "poll_std"] = 0.1
# back-fill the safe red/blue states that had no polls
default_missing(avg)
avg.head(3)
N | poll_mean | poll_std | Votes | |
---|---|---|---|---|
State | ||||
Alabama | NaN | -100 | 0.100000 | 9 |
Alaska | NaN | -100 | 0.100000 | 3 |
Arizona | 18 | -6 | 4.043877 | 11 |
We now seek the probability that Obama wins a given state. We assume that the probability that Obama wins a state is given by the probability that a draw from a Gaussian with μ=poll_mean and σ=poll_std is positive. We will use the cumulative distribution function defined for Gaussian random variables
$$ CDF(z) = \frac1{2}\left(1 + {\rm erf}\left(\frac{z - \mu}{\sqrt{2 \sigma^2}}\right)\right) $$which yields the probability of the random variable being smaller than $z$. In our case, we need $1-CDF(z)$ where $z$ is $0$. The following function computes these probabilities for a data frame consisting of the poll mean and standard deviation for each state.
"""
Function
--------
aggregated_poll_model
Inputs
------
polls : DataFrame
DataFrame indexed by State, with the following columns:
poll_mean
poll_std
Votes
Returns
-------
A DataFrame indexed by State, with the following columns:
Votes: Electoral votes for that state
Obama: Estimated probability that Obama wins the state
"""
#your code here
from scipy.special import erf
def aggregated_poll_model(avg):
prob = 0.5 * ( 1 + erf( ( avg.poll_mean / np.sqrt( 2 * (avg.poll_std ** 2) ) ) ) )
df = pd.DataFrame(dict(Obama = prob, Votes = avg.Votes), index = avg.index)
return df
# per-state win probabilities from the unweighted poll aggregates
df = aggregated_poll_model(avg)
df.head()
Obama | Votes | |
---|---|---|
State | ||
Alabama | 0.000000 | 9 |
Alaska | 0.000000 | 3 |
Arizona | 0.068941 | 11 |
Arkansas | 0.000000 | 6 |
California | 0.999886 | 55 |
Even when we assume the win probabilities in each state are known, there is still uncertainty left in the election. We will use simulations from a simple probabilistic model to characterize this uncertainty. From these simulations, we will be able to make a prediction about the expected outcome of the election, and make a statement about how sure we are about it. We will assume that the outcome in each state is the result of an independent coin flip whose probability of coming up Obama is given by a Dataframe of state-wise win probabilities. The following function uses this predictive model to simulate the outcome of the election given a Dataframe of probabilities.
"""
Function
--------
simulate_election
Inputs
------
model : DataFrame
A DataFrame summarizing an election forecast. The dataframe has 51 rows -- one for each state and
DC
It has the following columns:
Obama : Forecasted probability that Obama wins the state
Votes : Electoral votes for the state
The DataFrame is indexed by state (i.e., model.index is an array of state names)
n_sim : int
Number of simulations to run
Returns
-------
results : Numpy array with n_sim elements
Each element stores the number of electoral college votes Obama wins in each simulation.
"""
def simulate_election(predictwise, n_sim):
simulations = np.random.uniform(low = 0, high = 1, size = (51, n_sim))
obama_votes = (simulations < predictwise.Obama.values.reshape(51,1)) * predictwise.Votes.values.reshape(51, 1)
return obama_votes.sum(axis = 0)
# run 10,000 simulated elections and plot the distribution of outcomes
prediction = simulate_election(df, 10000)
plot_simulation(prediction)
Plot probabilities on US map.
# choropleth of the per-state win probabilities
make_map(df.Obama, "P(Obama): Poll Aggregation")
plt.show()
You might have noticed that we treated each poll as equally important. However, this might not be the case. Some polls might be old, or some polls might not have enough samples to make accurate predictions. We will now weight each poll using its age and margin of error.
A weighted estimate of Obama's advantage in a given state is given by
$$ \mu = \frac{\sum w_i \times \mu_i}{\sum w_i} $$where $\mu_i$ are individual polling measurements for a state, and $w_i$ are the weights assigned to each poll.
The weights $w_i$ should combine the uncertainties from the margin of error and the age of the forecast. One such combination is:
$$ w_i = \frac1{MoE^2} \times \lambda_{\rm age} $$where
$$ \lambda_{\rm age} = 0.5^{\frac{{\rm age}}{30 ~{\rm days}}} $$This model makes a few ad-hoc assumptions:
We will perform similar analysis using weighted aggregation. The following code computes the weighted mean accross polls for a given state.
"""
Function
--------
weighted_state_average
Inputs
------
multipoll : DataFrame
The multipoll data above
Returns
-------
averages : DataFrame
A dataframe, indexed by State, with the following columns:
N: Number of polls averaged together
poll_mean: The average value for obama_spread for all polls in this state
poll_std: The standard deviation of obama_spread
Notes
-----
For states where poll_std isn't finite (because N is too small), estimate the
poll_std value as .05 * poll_mean
"""
def weighted_state_average(multipoll):
N = multipoll.groupby('State')['obama_spread'].count()
f= lambda x: 0.5 ** (x / 30)
multipoll['weights'] = 1 / (multipoll.MoE ** 2) * multipoll.age_days.apply(f)
multipoll['weighted_obama'] = multipoll.obama_spread * 1 / (multipoll.MoE ** 2) \
* multipoll.age_days.apply(f)
poll_mean = multipoll.groupby('State').weighted_obama.sum() / \
multipoll.groupby('State').weights.sum()
poll_std = multipoll.groupby('State').obama_spread.std().fillna(poll_mean*0.05)
df = pd.DataFrame(dict(N = N, poll_mean = poll_mean, poll_std = poll_std))
return df
# weighted aggregation, again outer-joined and back-filled for safe states
avg = weighted_state_average(merged_clean).join(electoral_votes, how='outer')
default_missing(avg)
avg.head()
N | poll_mean | poll_std | Votes | |
---|---|---|---|---|
State | ||||
Alabama | NaN | -100.000000 | 0.100000 | 9 |
Alaska | NaN | -100.000000 | 0.100000 | 3 |
Arizona | 18 | -6.893589 | 4.043877 | 11 |
Arkansas | 3 | -100.000000 | 0.100000 | 6 |
California | 15 | 18.535493 | 5.444088 | 55 |
# Compute probability of Obama's win
# (same model as before, now fed the weighted per-state averages)
df = aggregated_poll_model(avg)
df.head()
Obama | Votes | |
---|---|---|
State | ||
Alabama | 0.000000 | 9 |
Alaska | 0.000000 | 3 |
Arizona | 0.044125 | 11 |
Arkansas | 0.000000 | 6 |
California | 0.999669 | 55 |
# re-run the simulation and map with the weighted-poll probabilities
prediction = simulate_election(df, 10000)
plot_simulation(prediction)
make_map(df.Obama, "P(Obama): Weighted Polls")
plt.show()
As you can see, the mean of the simulation probabilities moved toward the actual outcome, which indicates better accuracy with weighted aggregation. The spread of the distribution, on the other hand, did not change at all. This is because we did not use a weighted standard deviation across polls. One way to improve these results would be to use a better estimator for the standard deviation of obama_spread.