import numpy as np import scipy.stats as stats import pandas as pd import pandas.io.sql as psql import mysql.connector as sql %matplotlib inline import matplotlib.pyplot as plt # from mpld3 import enable_notebook # enable_notebook() import pickle with open('sql.config','rb') as fp: config = pickle.load(fp) con = sql.connect(**config) df = psql.frame_query(''' SELECT fgxp.DIST, fgxp.GOOD, games.TEMP, games.STAD FROM fgxp, core, games WHERE core.PID=fgxp.PID AND games.GID = core.GID AND fgxp.FGXP='FG';''', con) con.close() df.TEMP = df.TEMP.astype(float) df.head() df_elevation = pd.read_csv('stadium_with_altitude.csv', index_col=0) df_elevation.head() def getAlt(d): c = df_elevation.STAD==d if np.any(c): return df_elevation[df_elevation.STAD==d]['alt'].iloc[0] return np.nan df['ALT'] = df.STAD.map(getAlt) df.head() convert = lambda x: True if x == 'Y' else False normal = (df.TEMP >= 40) & (df.DIST > 40) & df.TEMP.notnull() normal = df[normal].GOOD.apply(convert) cold = (df.TEMP < 40) & (df.DIST > 40) & df.TEMP.notnull() cold = df[cold].GOOD.apply(convert) print normal.mean(), normal.std() print cold.mean(), cold.std() import pymc as pm with pm.Model() as model: p_normal = pm.Uniform("p_normal",lower=0.2, upper=0.8) p_cold = pm.Uniform("p_cold",lower=0.4, upper=0.8) # scraped observations obs_normal = pm.Bernoulli("obs_normal", p_normal, observed=normal.astype(int)) obs_cold = pm.Bernoulli("obs_cold", p_cold, observed=cold.astype(int)) start = pm.find_MAP() m = pm.NUTS(state=start) trace = pm.sample(2000, m, start=start, progressbar=False) p_normal_samples = trace['p_normal'][:] p_cold_samples= trace['p_cold'][:] delta_samples = trace['p_normal']-trace['p_cold'] plt.figure(figsize=(10, 10)) ax = plt.subplot(311) plt.hist(p_normal_samples, histtype='stepfilled', bins=50, normed=True, color='#E41A1C', label='Warm') plt.legend() plt.vlines(normal.mean(), 0, 300, linestyles='--') plt.xticks(np.arange(0.5, 0.8, 0.05)) plt.xlim(0.5, 0.8) plt.ylim(0, 80) plt.grid(False) ax = plt.subplot(312) plt.hist(p_cold_samples, histtype='stepfilled', bins=50, normed=True, color='#4DAF4A', label='Cold') plt.vlines(cold.mean(), 0, 300, linestyles='--') plt.legend() plt.xticks(np.arange(0.5, 0.8, 0.05)) plt.xlim(0.5, 0.8) plt.ylim(0, 60) plt.grid(False) ax = plt.subplot(313) plt.hist(delta_samples, histtype='stepfilled', bins=50, normed=True, color='#377EB8', label='Delta') plt.vlines(0, 0, 300, linestyles='--', label='$H_0$ (No difference)') plt.legend() plt.xticks(np.arange(-0.1, 0.201, 0.1)) plt.xlim(-0.1, 0.2) plt.ylim(0, 60) plt.grid(False); print ("Field Goals less than 40 yards are less successful in temperatures < 40 degrees " "in {0:.1f}% of simulations").format((delta_samples > 0).mean() * 100) from sklearn.ensemble import RandomForestClassifier from sklearn.cross_validation import train_test_split from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve convert = lambda x: 1 if x == 'Y' else 0 data = df; data = data[data.TEMP.notnull()] data.TEMP = data.TEMP.astype(float) data.GOOD = data.GOOD.apply(convert) X = data[['DIST','TEMP']] y = data['GOOD'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # Set up the random forest model with parameters found via grid search clf = RandomForestClassifier(oob_score = True, criterion='gini', n_estimators=100, min_samples_leaf=20, max_features=2) clf.fit(X_train, y_train) def summary(): print 'Training set score: %.1f%%' % (clf.score(X_train, y_train)*100) print 'Test set score: %.1f%%' %(clf.score(X_test, y_test)*100) preds = clf.predict(X_test) print classification_report(np.array(y_test), preds) print 'Confusion matrix: ' print pd.crosstab(y_test, preds, rownames=["Actual"], colnames=["Predicted"]) # summary() def roc(): full_probs = clf.predict_proba(X_test) fpr_full, tpr_full, thresholds_full = roc_curve(y_test, full_probs[:, 1]) roc_auc_full = auc(fpr_full, tpr_full) plt.figure(figsize=(6, 5)) plt.plot(fpr_full, tpr_full, label='Full random forest ROC curve (area = %0.2f)' % roc_auc_full) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.0]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic curve') plt.legend(loc="lower right"); roc() h = 5 # step size in the mesh x_min, x_max = X.DIST.min() - .5, X.DIST.max() + .5 y_min, y_max = X.TEMP.min() - .5, X.TEMP.max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) s = xx.ravel().shape cm = plt.cm.RdBu v = None for estimator in clf.estimators_: e = estimator.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] if v is None: v = e.reshape((len(e),1)) else: v = np.hstack((v, e.reshape((len(e),1)))) Z = v.mean(axis=1) p1 = np.percentile(v, 2.5, axis=1) p2 = np.percentile(v, 97.5, axis=1) P = p2-p1 fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,4)) plt.subplot(121) Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.title('Prediction') plt.ylabel('Temperature (${\ }^{\circ} F$)') plt.xlabel('Yards From Scrimmage') plt.subplot(122) P = P.reshape(xx.shape) co = plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.xlabel('Yards From Scrimmage') plt.title('Width of Confidence Interval') fig.subplots_adjust(right=0.8) cbar_ax = fig.add_axes([0.81, 0.15, 0.02, 0.7]) fig.colorbar(co, cax=cbar_ax); print "Distance: %f, Temperature: %f" %(clf.feature_importances_[0],clf.feature_importances_[1]) convert = lambda x: 1 if x == 'Y' else 0 data = df; data = data[data.TEMP.notnull()] data.TEMP = data.TEMP.astype(float) data.GOOD = data.GOOD.apply(convert) X = data[['DIST','TEMP','ALT']] y = data['GOOD'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # Set up the random forest model with parameters found via grid search clf = RandomForestClassifier(oob_score = True, criterion='entropy', n_estimators=100, min_samples_leaf=15) clf.fit(X_train, y_train) roc() h = 5 # step size in the mesh x_min, x_max = X.DIST.min() - .5, X.DIST.max() + .5 y_min, y_max = X.TEMP.min() - .5, X.TEMP.max() + .5 z_min, z_max = X.ALT.min() - .5, X.ALT.max() + .5 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) s = xx.ravel().shape cm = plt.cm.RdBu def predict(z): v = None for estimator in clf.estimators_: e = estimator.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(z,s)])[:, 1] if v is None: v = e.reshape((len(e),1)) else: v = np.hstack((v, e.reshape((len(e),1)))) m = v.mean(axis=1) p1 = np.percentile(v, 2.5, axis=1) p2 = np.percentile(v, 97.5, axis=1) p = p2-p1 return m, p fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10,8)) plt.subplot(321) Z,P = predict(5.0) # Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(5.0,s)])[:, 1] Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.title('5 Meters Alititude Prediction') plt.ylabel('Temperature (${\ }^{\circ} F$)') plt.subplot(322) P = P.reshape(xx.shape) plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.title('5 Meters Alitude Confidence Interval Width') plt.subplot(323) Z,P = predict(500.0) # Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(500.0,s)])[:, 1] Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.title('500 Meters Alititude Prediction') plt.ylabel('Temperature (${\ }^{\circ} F$)') plt.subplot(324) P = P.reshape(xx.shape) plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.title('500 Meters Alitude Confidence Interval Width') plt.subplot(325) Z,P = predict(1500.0) # Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel(), np.repeat(1500.0,s)])[:, 1] Z = Z.reshape(xx.shape) plt.contourf(xx, yy, Z, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.title('1500 Meters Alititude Prediction') plt.xlabel('Yards From Scrimmage') plt.ylabel('Temperature (${\ }^{\circ} F$)') plt.subplot(326) P = P.reshape(xx.shape) plt.contourf(xx, yy, P, cmap=cm, alpha=.8, levels=np.arange(0,1.1,0.1)) plt.title('1500 Meters Alitude Confidence Interval Width') plt.xlabel('Yards From Scrimmage') fig.subplots_adjust(right=.8) cbar_ax = fig.add_axes([1, 0.15, 0.02, 0.7]) fig.colorbar(co, cax=cbar_ax) plt.tight_layout() plt.figure(figsize=(12,6)) plt.bar([0,1,2],clf.feature_importances_, align='center') plt.xticks([0,1,2],['Distance','Temperature','Altitude']); from IPython.core.display import HTML def css_styling(): styles = open("style.css", "r").read() return HTML(styles) css_styling()