# special IPython command to prepare the notebook for matplotlib
%matplotlib inline 

import requests 
from StringIO import StringIO
import numpy as np
import pandas as pd # pandas
import matplotlib.pyplot as plt # module for plotting 
import datetime as dt # module for manipulating dates and times
import numpy.linalg as lin # module for performing linear algebra operations

# special matplotlib argument for improved plots
from matplotlib import rcParams

# ColorBrewer2 "Dark2" qualitative palette, stored as (r, g, b) triples
dark2_colors = [
    (0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
    (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
    (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
    (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
    (0.4, 0.6509803921568628, 0.11764705882352941),
    (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
    (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
]

# Plot defaults, assigned to rcParams one entry at a time in the listed order.
for rc_key, rc_value in (
        ('figure.figsize', (10, 6)),
        ('figure.dpi', 150),
        ('axes.color_cycle', dark2_colors),
        ('lines.linewidth', 2),
        ('axes.facecolor', 'white'),
        ('font.size', 14),
        ('patch.edgecolor', 'white'),
        ('patch.facecolor', dark2_colors[0]),  # patches use the first palette color
        ('font.family', 'StixGeneral'),
):
    rcParams[rc_key] = rc_value

# Expression table: the first CSV column is used as the row index.
url_exprs = "https://raw.githubusercontent.com/cs109/2014_data/master/exprs_GSE5859.csv"
exprs = pd.read_csv(url_exprs, index_col=0)

# Sample annotation table, read without designating an index column.
url_sampleinfo = "https://raw.githubusercontent.com/cs109/2014_data/master/sampleinfo_GSE5859.csv"
sampleinfo = pd.read_csv(url_sampleinfo)

# Element-wise comparison of the exprs column labels with sampleinfo.filename
columns_match = exprs.columns == sampleinfo.filename

# True only when every column label equals the file name at the same position
columns_match.all()

# The sampleinfo rows whose file name already sits at the matching position
sampleinfo[columns_match]

# Put the columns of exprs in the same order as the rows of sampleinfo.
# Selecting by label does this in one step: it avoids a list.index() scan
# for every file name and does not depend on integer keys being
# reinterpreted as column positions.
exprs = exprs[list(sampleinfo.filename)]

# check if all the column names match the file names in sampleinfo
(exprs.columns == sampleinfo.filename).all()

exprs.head()

sampleinfo.head()

# Parse the date column, then derive the calendar month and year of each entry.
sampleinfo["date"] = pd.to_datetime(sampleinfo.date)
sampleinfo["month"] = [stamp.month for stamp in sampleinfo.date]
sampleinfo["year"] = [stamp.year for stamp in sampleinfo.date]

# Reference point: midnight on October 31, 2002
oct31 = dt.datetime(2002, 10, 31, 0, 0)
oct31

# Whole days between each date and the reference point
sampleinfo["elapsedInDays"] = [(stamp - oct31).days for stamp in sampleinfo.date]
sampleinfo.head()

# Annotation rows whose ethnicity field equals "CEU"
is_ceu = sampleinfo["ethnicity"] == "CEU"
sampleinfoCEU = sampleinfo[is_ceu]
sampleinfoCEU.head()

# Expression columns for those samples, selected by file name
exprsCEU = exprs[sampleinfoCEU["filename"]]
exprsCEU.head()

# Sanity check: column labels and file names agree position by position
(exprsCEU.columns == sampleinfoCEU["filename"]).all()

# Center every row by subtracting its mean (taken across the columns).
# The row means are computed once and subtracted from all columns together,
# rather than being recomputed inside a per-column apply().
data = exprsCEU.sub(exprsCEU.mean(axis=1), axis=0)
data.head()

# Singular value decomposition of the centered values.  svd() returns the
# transposed right factor, so transpose it to get the vectors as columns of V.
U,s,Vh = lin.svd(data.values)
V = Vh.T

# Histogram of the first column of V (labelled PC1)
plt.hist(V[:,0], bins = 25)
plt.xlabel('PC1')
plt.ylabel('Frequency')
plt.title('Distribution of the values from PC1')

# elapsedInDays is counted from oct31, which is October 31, 2002, so the
# axis label names 2002 (the label used to say 2012).
days_label = 'Date sample was processed (Number of days since Oct 31, 2002)'
pc1_title = 'Relationship between the PC1 and the date the samples were processed'

# PC1 against the number of elapsed days, full range
plt.scatter(sampleinfoCEU.elapsedInDays, V[:,0])
plt.xlabel(days_label)
plt.ylabel('PC1')
plt.title(pc1_title)

# Same scatter limited to days 0-160, with a red vertical line at day 100
plt.scatter(sampleinfoCEU.elapsedInDays, V[:,0])
plt.xlim(0,160)
plt.xlabel(days_label)
plt.ylabel('PC1')
plt.title(pc1_title)
plt.axvline(x=100, color='r')

# Download the poll table as CSV text
url = "http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv"
source = requests.get(url).text

# Wrap the text in a file-like object so it can be parsed as a CSV file
s = StringIO(source)

# Parse without an index column, then ask pandas to convert date-like and
# numeric columns.
election = pd.DataFrame.from_csv(s, index_col=None)
election = election.convert_objects(convert_dates="coerce", convert_numeric=True)

# First rows of the parsed table
election.head()

# Polls whose start date falls in November 2012
in_november = [(start.month == 11) and (start.year == 2012)
               for start in election["Start Date"]]

# Keep one row per pollster (the first one encountered).  The result is
# assigned to a new frame instead of dropping rows in place on the selection.
filtered = election[in_november].drop_duplicates('Pollster')
M = len(filtered)
print("Number of polls in November: %i" % M)

# Median number of observations across the retained polls
N = np.median(filtered["Number of Observations"])
print(N)

# One simulated poll: number of Obama votes out of N with probability p
p = 0.53
"Simulated number of votes for Obama: %i" % np.random.binomial(N, p, size=1)

# Approach 1: draw B binomial counts and convert each to a proportion.
# float(N) keeps this a true division whatever numeric type N has.
p = 0.53
B = 1000
obs = np.random.binomial(N, p, size = B) / float(N)

# Approach 2 (this result replaces the one above): for each of the B
# replicates, average N individual 0/1 draws.  N comes from np.median and is
# a float, so it is converted to int before being used as a sample size.
obs = [np.mean(np.random.binomial(1, p, size=int(N))) for replicate in range(B)]

# Histogram of the simulated single-poll proportions
plt.hist(obs)

import scipy.stats as stats

# Standardize the proportions (sample SD, ddof=1) and draw a probability
# plot against the normal distribution on the current figure.
obs_centered = obs - np.mean(obs)
obs_standardized = obs_centered / np.std(obs, ddof=1)
stats.probplot(obs_standardized, dist="norm", plot=plt)
plt.show()

# Empirical SD (ddof=1) of the simulated single-poll proportions
np.std(obs, ddof=1)

# Analytic counterpart sqrt(p * (1 - p) / N).  It is written with the p and N
# used by the simulation rather than the literals 0.53, 0.47 and 1200, so the
# two numbers stay comparable if either value changes.
np.sqrt(p * (1 - p) / N)

"Number of polls in November: %i" % M

"Median size of polls in November: %i" % N

# Fraction of Obama votes in each of M simulated polls
def simulatePolls(p, N, M):
    """Simulate M polls and return the Obama vote share observed in each.

    Every poll draws N independent 0/1 votes, each equal to 1 with
    probability p, and reports their mean, i.e. a fraction between 0 and 1
    (not a percentage).

    p = Probability of voting for Obama
    N = Sample size of each poll (converted with int(), so a float such as
        the value returned by np.median is accepted)
    M = Number of polls to simulate (converted with int())

    Returns a list of M floats.
    """
    # numpy needs an integer sample size and range() an integer count
    n_obs = int(N)
    n_polls = int(M)

    return [np.mean(np.random.binomial(1, p, size=n_obs)) for _ in range(n_polls)]

# One simulated set of M polls
simulatePolls(p, N, M)

# Repeat B times: simulate M polls and keep the mean of their proportions
p = 0.53
B = 1000
mom = [np.mean(simulatePolls(p, N, M)) for trial in range(B)]

# Histogram of the B means
plt.hist(mom)

# Probability plot of the standardized means against the normal distribution
mom_standardized = (mom - np.mean(mom)) / np.std(mom, ddof=1)
stats.probplot(mom_standardized, dist="norm", plot=plt)
plt.show()

# Sample SD (ddof=1) of the means, shown raw and rounded to 5 decimals
se_of_mean = np.std(mom, ddof=1)
se_of_mean

"The SE of the average of polls is %g" % np.round(se_of_mean, 5)

# SD of the poll average relative to the SD of a single poll
ratio = se_of_mean / np.std(obs, ddof=1)

"The ratio of the SE of the average of polls to the SD of a single poll is %g" % ratio

# Repeat B times: simulate M polls and keep the SD (ddof=0) across them
B = 1000
p = 0.53
sds = [np.std(simulatePolls(p, N, M), ddof=0) for trial in range(B)]

# Histogram of the simulated SDs
plt.hist(sds)
plt.xlabel('Standard deviations across %i polls' % M)
plt.ylabel('Frequency')
plt.title('Histogram of standard deviations across %i polls' % M)

# Probability plot of the standardized SDs against the normal distribution
sds_standardized = (sds - np.mean(sds)) / np.std(sds, ddof=1)
stats.probplot(sds_standardized, dist="norm", plot=plt)
plt.show()

# SD (ddof=0) of the Obama column of the filtered polls, divided by 100
thesd = np.std(filtered["Obama"] / 100, ddof=0)
thesd

# Ratio of that SD to the average simulated SD
thesd / np.mean(sds)

# Share of simulated SDs that are smaller than that SD
np.mean(thesd > sds)

# Variance of one poll proportion under the binomial model
single_poll_variance = p * (1-p)/ N

# Single poll, 2(b): simulated SD, then the analytic value
print("SD from simulations: %g" % np.std(obs, ddof=1))
print("SD using normal approximation %g" % np.sqrt(single_poll_variance))

# Average of M polls, 2(c): simulated SD, then the analytic value
print("SD from simulations: %g" % np.std(mom, ddof=1))
print("SD using normal approximation %g" % np.sqrt(single_poll_variance / M))

# Obama minus Romney, both divided by 100
election["Diff"] = (election.Obama / 100) - (election.Romney / 100)
election.head()

# Polls that started within 5 days of the latest start date, oldest first
last_day = max(election["Start Date"])
filtered = election[map(lambda x: (last_day - x).days <= 5, election["Start Date"]) ]
filtered = filtered.sort(columns=["Start Date"])
days= map(lambda x: (last_day - x).days , filtered["Start Date"])

# One random number per pollster, used as its color value.  A comprehension
# with its own variable is used so the module-level probability `p` is not
# overwritten by a pollster name.
color_map = {pollster: np.random.rand() for pollster in set(filtered.Pollster)}

plt.scatter(days, filtered.Diff, c = map(lambda x: color_map[x], filtered.Pollster),  s=60 )
# NOTE(review): 0.039 is a hard-coded reference level; presumably the actual
# election margin - confirm.
plt.axhline(y=0.039, c = "gray")
# Red line: mean difference over the plotted polls
plt.axhline(y=np.mean(filtered.Diff), c = "red")
plt.xlabel("Days")
plt.ylabel("Difference (Obama - Romney)")
plt.title("Plot of the difference between Obama and Romney colored by different pollsters in the last week")


# Distinct pollsters, and the x-axis slot assigned to each one.  The mapping
# is built with a comprehension so that no loop variable overwrites the
# module-level probability `p`.
polls = list(set(filtered.Pollster))
pollster_map = {pollster: position for position, pollster in enumerate(polls)}

# One point per poll: x is the pollster's slot, y the Obama - Romney difference
plt.scatter(map(lambda x: pollster_map[x],filtered.Pollster), filtered.Diff, \
            c = map(lambda x: color_map[x],filtered.Pollster),s=60)
# Label each slot with the pollster name, rotated to keep the labels readable
plt.xticks(range(len(polls)), polls, rotation = 90)
plt.xlabel("Pollsters")
plt.ylabel("Difference (Obama - Romney)")
plt.title("Plot of the difference between Obama and Romney by different pollsters")
plt.show()

# Average every numeric column within each pollster
aggr = filtered.groupby("Pollster").mean()
pollster_diff = aggr.Diff

print("Average across pollsters: %g" % np.round(np.mean(pollster_diff), 4))
# NOTE(review): the value labelled "Standard error" is the ddof=0 standard
# deviation of the per-pollster means; it is not divided by sqrt(n).
print("Standard error: %g" % np.std(pollster_diff, ddof=0))


# Start of the plotted window: midnight on August 15, 2012
three_months = dt.datetime(2012, 8, 15, 0, 0)

# Polls starting on or after that date, in chronological order
in_window = [start >= three_months for start in election["Start Date"]]
new_data = election[in_window]
new_data = new_data.sort("Start Date")

# Whole days between the window start and each poll's start date
new_data["days"] = [(start - three_months).days for start in new_data["Start Date"]]

# Obama minus Romney (both divided by 100), averaged over polls sharing a day
obama_share = new_data.Obama/100
romney_share = new_data.Romney/100
new_data["Diff"] = obama_share - romney_share
new_data = new_data.groupby(["days"], as_index=False).mean()

plt.figure()
plt.plot(new_data.days, new_data.Diff)
plt.xlabel("Days from three month before the election")
plt.ylabel("Difference (Obama - Romney)")
plt.title("Difference between Obama and Romney across time")
plt.show()

url_str = "http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate"

# One CSV address per entry of the JSON response: the entry's 'url' field
# plus '.csv'.  The loop variable is called `chart` because a Python 2 list
# comprehension leaks its variable, and naming it `election` would overwrite
# the `election` poll table.
election_urls = [chart['url'] + '.csv' for chart in requests.get(url_str).json()]
election_urls

def build_frame(url):
    """
    Download the CSV text served at `url` and parse it into a
    pandas DataFrame.

    The text is parsed without an index column, and the result is passed
    through convert_objects with convert_dates="coerce" and
    convert_numeric=True before being returned.
    """
    # from_csv reads from an object with a .read() method, so the downloaded
    # text is wrapped in a StringIO buffer
    csv_buffer = StringIO(requests.get(url).text)

    frame = pd.DataFrame.from_csv(csv_buffer, index_col=None)
    return frame.convert_objects(convert_dates="coerce", convert_numeric=True)

# Poll tables keyed by election name: the key is the last path component of
# each address with its final four characters (".csv") removed.
dfs = {poll_url.split("/")[-1][:-4]: build_frame(poll_url) for poll_url in election_urls}

# First rows of one of the downloaded tables
dfs['2014-kentucky-senate-mcconnell-vs-grimes'].head()

# For every election: the two candidate column names and their mean difference
x = {}
for keys in dfs:
    dat = dfs[keys]
    # assumes the two candidates' columns sit at positions 7 and 8 of every
    # table - TODO confirm against the downloaded schema
    candidate1 = dat.columns[7]
    candidate2 = dat.columns[8]
    # Keep the difference in a local Series.  Assigning it to `dat.Diff` only
    # set an attribute on the cached frame and did not create a column.
    diff = (dat[candidate1]/100) - (dat[candidate2]/100)
    x[keys] = [candidate1, candidate2, np.round(np.mean(diff), 3)]

# One row per election
predictions = pd.DataFrame(x).T
predictions.columns = ['Candidate1', 'Candidate2', 'Difference']
# Candidate1 is named the winner when the mean difference is non-negative,
# Candidate2 otherwise
predictions['Winner'] = np.where(predictions.Difference >=0,
                                 predictions.Candidate1, predictions.Candidate2)
predictions