import numpy as np
import matplotlib.pyplot as plt
%pylab inline
from scipy import stats
import matplotlib.mlab as mlab
import bisect
colours=['y','c']
Populating the interactive namespace from numpy and matplotlib
Here we use Tom Tango's equation to find the true variance (variance due to skill) of win percentages for each league.
$ var(observed) = var(skill) + var(luck) $
where
$ var(luck)=.5*.5/gp $, where $gp$ is the number of games played
and $ var(observed) $ is found from looking at historical win percentages data for each league. Numbers from:
http://www.insidethebook.com/ee/index.php/site/comments/true_talent_levels_for_sports_leagues/
#actual variances of winning percentage
avNHL=0.0997**2
avNBA=0.1449**2
avNFL=0.1899**2
avMLB=0.0717**2
#random variace (variance due to luck) of winning percentage
rvNHL=.5*.5/82
rvNBA=.5*.5/82
rvNFL=.5*.5/16
rvMLB=.5*.5/162
#true variance (variance due to skill) of winning percentage
tvNHL=avNHL-rvNHL
tvNBA=avNBA-rvNBA
tvNFL=avNFL-rvNFL
tvMLB=avMLB-rvMLB
avs={'NHL':avNHL, 'NBA':avNBA, 'NFL':avNFL, 'MLB':avMLB}
rvs={'NHL':rvNHL, 'NBA':rvNBA, 'NFL':rvNFL, 'MLB':rvMLB}
tvs={'NHL':tvNHL, 'NBA':tvNBA, 'NFL':tvNFL, 'MLB':tvMLB}
leagues=['NHL','NBA','NFL','MLB']
This function takes 6 intputs:
It returns a list of three values:
I chose to model the 'skill points' that a team gets by creating a guassian-ish distribution between 0 and 1. That is; only very few high ranked teams get close to '1 point', and only very few low ranked teams get close to '0 points', and the majority of the teams get close to '0.5 points'.
The 'skill points' of the lower ranked team are subtracted from the 'skill points' of the higher ranked team. The difference is then weighted by the percentage of the actual variance that skill accounts for, given the number of games played. In order for the under dogs to win, the 'luck points' they get, subtracted from the 'luck points' the higher seeded team gets (each weighted by the percentage of the actual variance that luck accounts for), will have to be equal to the difference in 'skill points'.
So for an upset:
$ luck\%*(points(luck)_{lowseed} - points(luck)_{highseed}) = skill\%*(points(skill)_{highseed} - points(skill)_{lowseed}) $
where:
$ luck\% = \frac{.5*.5/gp}{var(skill)+.5*.5/gp} $
$ skill\% = \frac{var(skill)}{var(skill)+.5*.5/gp} $
To solve for the unknown, we rearange:
$ (points(luck)_{lowseed} - points(luck)_{highseed}) = skill\%*(points(skill)_{highseed} - points(skill)_{lowseed})/luck\% $
If we assume luck is normally-ish distributed between 0 and 1, and each team picks their 'luck points' from such a distribution, to find the probability that the difference between the lower seed's 'luck points' and higher seed's 'luck points' is equal to the RHS of the above equation, we can create a new difference distribution.
This distribution will have a mean of $\mu_{1} - \mu_{2} $, or $ 0 $, and the variance will be $2*\sigma^2$. Then we can find the z-score in that distribution corresponding to the LHS of the above equation, and find the corresponding probability.
I'm using normal approximations for the distributions here, so nothing is mathematically perfect. My 0-1 distribution is approximated by a normal distribution with mean $ 0.5 $ and $ \sigma = 0.14 $
#function that calculates the chance that the team that won the most games is the more skilled team
def calcchance(t1rank,t2rank,percentage,numgames,gp,league):
luckperc=(.5*.5/gp)/(tvs[league]+.5*.5/gp)
skillperc=tvs[league]/(tvs[league]+.5*.5/gp)
#find skill points for each team from the skill distribution
pvals=np.arange(1,numteams[league]+1)
pvals=pvals*(1.0/float(numteams[league]+1))
zscores=stats.norm.isf(pvals)
skillscores=.5+zscores/(2*max(zscores))
t1skillpoints=skillscores[t1rank-1]
t2skillpoints=skillscores[t2rank-1]
#find difference of skill points weighted by skill%
skilldiff=t1skillpoints-t2skillpoints
efskilldiff=skilldiff*skillperc
#find the required z-score in the difference distribution
z_score2=(efskilldiff-0)/(np.sqrt(2*(.14**2))*luckperc)
#find the probability of no upset corresponding to that z-score
p_val =1-stats.norm.sf(z_score2)
#create a plot only for the purpose of extracting x/y pairs
datapoints=plt.plot(gp,p_val)
xvalues = datapoints[0].get_xdata()
yvalues = datapoints[0].get_ydata()
#find number of games required for given win probability
for i in range(len(yvalues)):
if yvalues[i] >= percentage:
saveindex=i
break
#reuturn: [probabilities for all games in gp, probability at given games played, games played at given probability]
results=[p_val,yvalues[numgames-1],xvalues[saveindex]]
plt.close()
return results
Here we just set the first 5 function parameters and call the function while cycling through the leagues. If you want to play around with the function parameters, here's the place to do it.
#call function defined above and plot the results for every league
#function parameters (these are the first 5. The 6th, league, is varied in for loop)
t1rank=1
t2rank=15
percentage=.8
numgames=5
gp=np.arange(1,101)
numteams={'MLB':30, 'NHL':30, 'NBA':30, 'NFL':32}
allchances=[] #for the combined plot
for league in leagues:
#call the function
resultarray=calcchance(t1rank,t2rank,percentage,numgames,gp,league)
#plot results for each league
plt.title(league)
plt.xlabel("# of games")
plt.ylabel('chance seed %d wins more games than seed %d' %(t1rank,t2rank))
plt.legend()
datapoints=plt.plot(gp,resultarray[0])
plt.show()
allchances.append(resultarray[0].tolist()) #for the combined plot
print percentage*100,'% chance at', resultarray[2], 'games'
print resultarray[1]*100, '% chance at', numgames, 'games'
#combined plot
for i in range(len(allchances)):
plt.plot(gp,allchances[i],label=leagues[i])
plt.xlabel("# of games")
plt.ylabel('chance seed %d wins more games than seed %d' %(t1rank,t2rank))
plt.legend()
plt.show()
80.0 % chance at 13 games 63.3240914287 % chance at 5 games
80.0 % chance at 5 games 81.2364080839 % chance at 5 games
80.0 % chance at 5 games 83.3829447059 % chance at 5 games
80.0 % chance at 24 games 57.0534503331 % chance at 5 games
This section is like the previous section, but the function parameters are set to simulate an average first round playoff scenario in each league.
#playoffs
#playoff parameters
t1rank=1
t2rankNHL=16
t2rankNBA=16
t2rankNFL=8
t2rankMLB=8
numgamesNHL=7
numgamesNBA=7
numgamesMLB=5
numgamesNFL=1
percentage=.9
gp=np.arange(1,100)
rlplayoffchance=[]
rlplayoffgames=[]
#NHL
resultNHL=calcchance(t1rank,t2rankNHL,percentage,numgamesNHL,gp,'NHL')
rlplayoffchance.append(resultNHL[1])
rlplayoffgames.append(resultNHL[2])
#NBA
resultNBA=calcchance(t1rank,t2rankNBA,percentage,numgamesNBA,gp,'NBA')
rlplayoffchance.append(resultNBA[1])
rlplayoffgames.append(resultNBA[2])
#NFL
resultNFL=calcchance(t1rank,t2rankNFL,percentage,numgamesNFL,gp,'NFL')
rlplayoffchance.append(resultNFL[1])
rlplayoffgames.append(resultNFL[2])
#MLB
resultMLB=calcchance(t1rank,t2rankMLB,percentage,numgamesMLB,gp,'MLB')
rlplayoffchance.append(resultMLB[1])
rlplayoffgames.append(resultMLB[2])
print "bar graph values from left to right"
print rlplayoffchance
#plot chances
N = len( rlplayoffchance )
x = np.arange(1, N+1)
y = rlplayoffchance
labels = leagues
width = .75
bar1 = plt.bar( x, y, width, color='CornflowerBlue')
plt.ylabel( 'chances top seed wins their first playoff series' )
plt.xticks(x + width/2.0, labels )
plt.ylim(0,1)
plt.show()
print "bar graph values from left to right"
print rlplayoffgames
#plot games needed
N = len( rlplayoffgames )
x = np.arange(1, N+1)
y = rlplayoffgames
labels = leagues
width = .75
bar1 = plt.bar( x, y, width, color='CornflowerBlue')
plt.ylabel( 'games needed for .%d chance top seed wins first playoff series' %(percentage*100))
plt.xticks(x + width/2.0, labels )
plt.ylim(0,60)
plt.show()
bar graph values from left to right [0.69074010022432986, 0.90265398096013816, 0.55155467019250737, 0.54692025781761511]
bar graph values from left to right [19, 7, 10, 55]