import numpy as np
import pandas as pd
import statsmodels.api as sm
from IPython.display import YouTubeVideo
from matplotlib import pyplot as plt

# Enable inline figures when running under IPython/Jupyter. `get_ipython` only
# exists inside an IPython session, so a plain `python` run skips this step.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Forward slashes keep the relative path valid on every platform and avoid the
# invalid "\d" / "\c" escape sequences of the backslash spelling.
dfShots = pd.read_csv('../datasets/cl-shots-2012.csv', index_col=0,
                      na_values='N/A')

# Bare expressions below are notebook-style inspections; they only render
# output when evaluated as the last expression of a notebook cell.
dfShots.head()
dfShots.describe()

# Raw shots: distance against the goal column, heavily transparent so that
# dense regions show up darker.
plt.scatter(dfShots.dist, dfShots.goal, alpha=0.01)

# Bucket shots into bins built from 20 evenly spaced edges over the observed
# distance range, then average distance and goal within each bucket.
bins = np.linspace(dfShots.dist.min(), dfShots.dist.max(), 20)
groups = dfShots.groupby(np.digitize(dfShots.dist, bins))
chart = groups[['dist', 'goal']].mean()

plt.plot(chart.dist, chart.goal, 'bo-')
plt.ylim(0, 1)
plt.ylabel('probability')
plt.xlabel('distance (m)')

# Inspect the shots taken from beyond 40 on the distance axis.
dfShots[dfShots.dist > 40]

YouTubeVideo('HhJ84p9KLKY')

# adding a constant column to represent the intercept
dfShots['intercept'] = 1.0

# identify the independent and dependent variables
ind_cols = ['dist', 'intercept']
dep_cols = ['goal']

# training the model
logit = sm.Logit(dfShots[dep_cols], dfShots[ind_cols])
result = logit.fit()

# get the fitted coefficients from the results
coeff = result.params
print(coeff)


def prob(dist, coeff):
    """Evaluate the logistic function of a linear model in ``dist``.

    Args:
        dist: Scalar or array-like of distances.
        coeff: Two coefficients in the order (slope for ``dist``, intercept).
            May be a pandas Series, list, tuple or NumPy array.

    Returns:
        ``1 / (1 + exp(-(coeff[0] * dist + coeff[1])))``, with the same shape
        as ``dist``.
    """
    # Convert to a plain array first: integer-position indexing on a
    # label-indexed pandas Series (e.g. ``result.params``) is deprecated.
    weights = np.asarray(coeff, dtype=float)
    z = weights[0] * dist + weights[1]
    return 1.0 / (1.0 + np.exp(-z))


# Evaluate the fitted curve at integer distances 1..59.
lf = pd.DataFrame({'dist': np.arange(1, 60)})
lf['prob'] = prob(lf['dist'], coeff)
lf.head()

# Compare the binned empirical frequencies against the fitted curve.
plt.scatter(chart.dist, chart.goal, label="frequency")
plt.plot(lf['dist'], lf['prob'], label="regression")
plt.xlim(0, 60)
plt.xlabel("Distance (m)")
plt.ylim(0, 0.8)
plt.ylabel("Probability")
plt.legend()

# Axis extents used for the field drawing.
x_size = 105.0
y_size = 68.0

# set up field
fig = plt.figure()
fig.patch.set_facecolor('green')
# `facecolor` replaces the `axisbg` keyword, which matplotlib has removed.
axes = fig.add_subplot(1, 1, 1, facecolor='green')
axes.xaxis.set_visible(False)
axes.yaxis.set_visible(False)
plt.xlim([0, x_size])
plt.ylim([0, y_size])

# draw shots
# Columns are addressed by position: 9/10 give the arrow origin, 1/2 the arrow
# displacement, and 4 selects the highlight colour.
# NOTE(review): column 4 is presumably the goal flag -- confirm against the CSV.
size = 1.0  # arrow head width and length, identical for every shot
for row in dfShots.values:
    if row[4]:
        color = 'red'
        alpha = 0.4
    else:
        color = 'white'
        alpha = 0.1
    plt.arrow(row[9], row[10], row[1], row[2], fc=color, ec=color,
              head_width=size, head_length=size, alpha=alpha)

plt.show()