import numpy as np
import scipy.stats as stats
import pandas as pd
import pandas.io.sql as psql
import mysql.connector as sql

%matplotlib inline
import matplotlib.pyplot as plt

# from mpld3 import enable_notebook
# enable_notebook()

import pickle
with open('sql.config','rb') as fp:
    config = pickle.load(fp)

con = sql.connect(**config)
df = psql.frame_query('''
    SELECT fgxp.DIST, fgxp.GOOD, games.TEMP, games.STAD FROM fgxp, core, games 
    WHERE core.PID=fgxp.PID AND games.GID = core.GID AND fgxp.FGXP='FG';''', con)

con.close()

df.TEMP = df.TEMP.astype(float)
df.head()

df_elevation = pd.read_csv('stadium_with_altitude.csv', index_col=0)
df_elevation.head()

def getAlt(d):
    c = df_elevation.STAD==d
    if np.any(c): 
        return df_elevation[df_elevation.STAD==d]['alt'].iloc[0]
    return np.nan

df['ALT'] = df.STAD.map(getAlt)
df.head()

convert = lambda x: True if x == 'Y' else False

normal = (df.TEMP >= 40) & (df.DIST > 40) & df.TEMP.notnull()
normal = df[normal].GOOD.apply(convert)

cold = (df.TEMP < 40) & (df.DIST > 40) & df.TEMP.notnull()
cold = df[cold].GOOD.apply(convert)

print normal.mean(), normal.std()
print cold.mean(), cold.std()

import pymc as pm

with pm.Model() as model:
    p_normal = pm.Uniform("p_normal",lower=0.2, upper=0.8)
    p_cold = pm.Uniform("p_cold",lower=0.4, upper=0.8)
    
    # scraped observations
    obs_normal = pm.Bernoulli("obs_normal", p_normal, observed=normal.astype(int))
    obs_cold = pm.Bernoulli("obs_cold", p_cold, observed=cold.astype(int))
    
    start = pm.find_MAP()
    m = pm.NUTS(state=start)
    trace = pm.sample(2000, m, start=start, progressbar=False)

p_normal_samples = trace['p_normal'][:]
p_cold_samples=  trace['p_cold'][:]
delta_samples = trace['p_normal']-trace['p_cold']

plt.figure(figsize=(10, 10))

ax = plt.subplot(311)
plt.hist(p_normal_samples, histtype='stepfilled', bins=50, normed=True, color='#E41A1C', label='Warm')
plt.legend()
plt.vlines(normal.mean(), 0, 300, linestyles='--')
plt.xticks(np.arange(0.5, 0.8, 0.05))
plt.xlim(0.5, 0.8)
plt.ylim(0, 80)
plt.grid(False)

ax = plt.subplot(312)
plt.hist(p_cold_samples, histtype='stepfilled', bins=50, normed=True, color='#4DAF4A', label='Cold')
plt.vlines(cold.mean(), 0, 300, linestyles='--')
plt.legend()
plt.xticks(np.arange(0.5, 0.8, 0.05))
plt.xlim(0.5, 0.8)
plt.ylim(0, 60)
plt.grid(False)

ax = plt.subplot(313)
plt.hist(delta_samples, histtype='stepfilled', bins=50, normed=True, color='#377EB8', label='Delta')
plt.vlines(0, 0, 300, linestyles='--', label='$H_0$ (No difference)')
plt.legend()
plt.xticks(np.arange(-0.1, 0.201, 0.1))
plt.xlim(-0.1, 0.2)
plt.ylim(0, 60)
plt.grid(False);

print ("Field Goals less than 40 yards are less successful in temperatures < 40 degrees "
       "in {0:.1f}% of simulations").format((delta_samples > 0).mean() * 100)

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve

convert = lambda x: 1 if x == 'Y' else 0
data = df;
data = data[data.TEMP.notnull()]
data.TEMP = data.TEMP.astype(float)
data.GOOD = data.GOOD.apply(convert)

X = data[['DIST','TEMP']]
y = data['GOOD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

# Set up the random forest model with parameters found via grid search
clf = RandomForestClassifier(oob_score = True, criterion='gini', n_estimators=100,  min_samples_leaf=20, max_features=2)
clf.fit(X_train, y_train)

def summary():
    print 'Training set score: %.1f%%' % (clf.score(X_train, y_train)*100)
    print 'Test set score: %.1f%%' %(clf.score(X_test, y_test)*100)
    preds = clf.predict(X_test)
    print classification_report(np.array(y_test), preds)
    print 'Confusion matrix: '
    print pd.crosstab(y_test, preds, rownames=["Actual"], colnames=["Predicted"])

# summary()

def roc():
    full_probs = clf.predict_proba(X_test)
    fpr_full, tpr_full, thresholds_full = roc_curve(y_test, full_probs[:, 1])
    roc_auc_full = auc(fpr_full, tpr_full)
    
    plt.figure(figsize=(6, 5))
    plt.plot(fpr_full, tpr_full, label='Full random forest ROC curve (area = %0.2f)' % roc_auc_full)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right");
roc()

h = 5  # step size in the mesh
x_min, x_max = X.DIST.min() - .5, X.DIST.max() + .5
y_min, y_max = X.TEMP.min() - .5, X.TEMP.max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
s = xx.ravel().shape
cm = plt.cm.RdBu

v = None
for estimator in clf.estimators_:
    e = estimator.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]    
    if v is None:
        v = e.reshape((len(e),1))
    else:
        v = np.hstack((v, e.reshape((len(e),1))))
Z = v.mean(axis=1)
p1 = np.percentile(v, 2.5, axis=1)
p2 = np.percentile(v, 97.5, axis=1)
P = p2-p1


fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,4))

plt.subplot(121)
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.title('Prediction')
plt.ylabel('Temperature (${\ }^{\circ} F$)')
plt.xlabel('Yards From Scrimmage')

plt.subplot(122)
P = P.reshape(xx.shape)
co = plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.xlabel('Yards From Scrimmage')
plt.title('Width of Confidence Interval')

fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.81, 0.15, 0.02, 0.7])
fig.colorbar(co, cax=cbar_ax);


print "Distance: %f, Temperature: %f" %(clf.feature_importances_[0],clf.feature_importances_[1])

convert = lambda x: 1 if x == 'Y' else 0
data = df;
data = data[data.TEMP.notnull()]
data.TEMP = data.TEMP.astype(float)
data.GOOD = data.GOOD.apply(convert)

X = data[['DIST','TEMP','ALT']]
y = data['GOOD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

# Set up the random forest model with parameters found via grid search
clf = RandomForestClassifier(oob_score = True, criterion='entropy', n_estimators=100,  min_samples_leaf=15)
clf.fit(X_train, y_train)

roc()

h = 5  # step size in the mesh
x_min, x_max = X.DIST.min() - .5, X.DIST.max() + .5
y_min, y_max = X.TEMP.min() - .5, X.TEMP.max() + .5
z_min, z_max = X.ALT.min() - .5, X.ALT.max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
s = xx.ravel().shape
cm = plt.cm.RdBu

def predict(z):
    v = None
    for estimator in clf.estimators_:
        e = estimator.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(z,s)])[:, 1]    
        if v is None:
            v = e.reshape((len(e),1))
        else:
            v = np.hstack((v, e.reshape((len(e),1))))
    m = v.mean(axis=1)
    p1 = np.percentile(v, 2.5, axis=1)
    p2 = np.percentile(v, 97.5, axis=1)
    p = p2-p1
    return m, p


fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10,8))

plt.subplot(321)
Z,P = predict(5.0)
# Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(5.0,s)])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.title('5 Meters Alititude Prediction')
plt.ylabel('Temperature (${\ }^{\circ} F$)')

plt.subplot(322)
P = P.reshape(xx.shape)
plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.title('5 Meters Alitude Confidence Interval Width')


plt.subplot(323)
Z,P = predict(500.0)
# Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(500.0,s)])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.title('500 Meters Alititude Prediction')
plt.ylabel('Temperature (${\ }^{\circ} F$)')

plt.subplot(324)
P = P.reshape(xx.shape)
plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.title('500 Meters Alitude Confidence Interval Width')


plt.subplot(325)
Z,P = predict(1500.0)
# Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(1500.0,s)])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.title('1500 Meters Alititude Prediction')
plt.xlabel('Yards From Scrimmage')
plt.ylabel('Temperature (${\ }^{\circ} F$)')

plt.subplot(326)
P = P.reshape(xx.shape)
plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1))
plt.title('1500 Meters Alitude Confidence Interval Width')
plt.xlabel('Yards From Scrimmage')

fig.subplots_adjust(right=.8)
cbar_ax = fig.add_axes([1, 0.15, 0.02, 0.7])
fig.colorbar(co, cax=cbar_ax)

plt.tight_layout()


plt.figure(figsize=(12,6))
plt.bar([0,1,2],clf.feature_importances_, align='center')
plt.xticks([0,1,2],['Distance','Temperature','Altitude']);

from IPython.core.display import HTML
def css_styling():
    styles = open("style.css", "r").read()
    return HTML(styles)
css_styling()