Importing necessary libraries.
%pylab inline
from collections import defaultdict
import cPickle as pickle
import math
import numpy as np
import pandas as pd
import pylab
import glob
import seaborn
Populating the interactive namespace from numpy and matplotlib
# Global plot styling for the notebook: white figure/axes backgrounds, grey
# axes borders, faint dotted grid lines and a Palatino serif font.
from matplotlib import rc

# Palette returned by seaborn with no arguments, kept for later plotting use.
colors = seaborn.color_palette()

rc("figure", facecolor="white")
rc("axes", facecolor="white", edgecolor="grey")
rc("grid", alpha=0.9, linewidth=0.2, linestyle=":")
rc('font', family='serif', serif=['Palatino'])
# A utility function to compare the results side-by-side
def side_by_side(*objs, **kwds):
    """Print the reprs of ``objs`` next to each other, column by column.

    Each object's ``repr`` is split into lines and the resulting columns are
    joined horizontally with pandas' ``adjoin`` helper.

    Keyword arguments:
        space -- number of blank characters placed between adjacent
                 columns (default 4).
    """
    # ``adjoin`` lives in pandas.io.formats.printing in later pandas
    # releases; fall back to the location this notebook originally used.
    try:
        from pandas.io.formats.printing import adjoin
    except ImportError:
        from pandas.core.common import adjoin

    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    # A parenthesised single argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    print(adjoin(space, *reprs))
# Load the train and test game tables, order each by game_id and renumber
# the rows from zero so that the row index follows game_id order.
# NOTE(review): DataFrame.sort(columns=...) is the legacy pandas spelling;
# newer pandas exposes this as sort_values(by=...) -- confirm target version.
trainDF = (
    pd.read_csv('trainCOL.csv', index_col=0)
    .sort(columns='game_id')
    .reset_index(drop=True)
)
testDF = (
    pd.read_csv('testCOL.csv', index_col=0)
    .sort(columns='game_id')
    .reset_index(drop=True)
)

# Column subset of the training frame used by the unweighted baseline.
trdf = trainDF[[
    'game_id', 'year', 'home_team', 'away_team',
    'home_score', 'away_score', 'winner',
]]
def get_historic_games(team, gameId, trdf):
    """Return the games ``team`` played before ``gameId``, in index order.

    Parameters:
        team   -- value matched against the ``home_team`` and ``away_team``
                  columns.
        gameId -- only rows with ``game_id < gameId`` are considered.
        trdf   -- DataFrame with ``game_id``, ``home_team`` and
                  ``away_team`` columns.

    Returns:
        A DataFrame holding the matching home and away rows sorted by row
        index (possibly empty), or ``False`` if pandas cannot concatenate
        the two selections.
    """
    earlier = trdf[trdf.game_id < gameId]
    as_home = earlier[earlier.home_team == team]
    as_away = earlier[earlier.away_team == team]
    try:
        combined = pd.concat([as_home, as_away])
    except ValueError:
        # Callers distinguish success with isinstance(result, pd.DataFrame).
        return False
    # Return the index-sorted frame. The previous version sorted and then
    # returned the unsorted concat (all home rows followed by all away
    # rows), so a tail slice such as frame[-n:] did not pick the rows with
    # the highest index.
    return combined.sort_index()
def get_points(team, df):
    """Average points scored and conceded by ``team`` over the rows of ``df``.

    Parameters:
        team -- value matched against each row's ``home_team`` / ``away_team``.
        df   -- DataFrame with ``home_team``, ``away_team``, ``home_score``
                and ``away_score`` columns.

    Returns:
        ``(points_for, points_against)``: each total divided by ``len(df)``.
        Rows in which ``team`` is on neither side add nothing to the totals
        but are still counted in the divisor.
    """
    scored_total = 0
    conceded_total = 0
    for _, game in df.iterrows():
        home_name, away_name = game.home_team, game.away_team
        home_pts, away_pts = game.home_score, game.away_score
        if team == home_name:
            scored_total += home_pts
            conceded_total += away_pts
        elif team == away_name:
            scored_total += away_pts
            conceded_total += home_pts
    n_rows = len(df)
    # NOTE(review): with integer scores this is a floor division under
    # Python 2 -- confirm the truncation is intended.
    return (scored_total/n_rows, conceded_total/n_rows)
# For every window size i in 1..49, annotate each training game with the
# home and away teams' average points for/against over their last i earlier
# games, and write the annotated table to baseline2/data_iter_<i>.csv.
# The history lookup itself does not depend on i; only the tail slice does.
for i in range(1, 50):
    lst = []
    for r, v in trdf.iterrows():
        gameId, home, away = v.game_id, v.home_team, v.away_team
        homehGames = get_historic_games(home, gameId, trdf)
        awayhGames = get_historic_games(away, gameId, trdf)

        # Zeros are the "not enough history" sentinel; they are replaced
        # only when both teams have at least i earlier games.
        hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        if (isinstance(homehGames, pd.DataFrame)
                and isinstance(awayhGames, pd.DataFrame)
                and len(homehGames.index) >= i
                and len(awayhGames.index) >= i):
            homehGames = homehGames[-i:]
            awayhGames = awayhGames[-i:]
            hFor, hAgainst = get_points(home, homehGames)
            aFor, aAgainst = get_points(away, awayhGames)

        # Extend the game row with the four features, keeping r as its label.
        features = pd.Series(
            [hFor, hAgainst, aFor, aAgainst],
            index=['hFor', 'hAgainst', 'aFor', 'aAgainst'],
            name=r,
        )
        v = pd.concat([v, features])
        lst.append(pd.DataFrame(v).T)

    trdfn = pd.concat(lst)
    csv_name = "baseline2/data_iter_"+str(i)+".csv"
    trdfn.to_csv(csv_name)
def get_diff(hFor, hAgainst, aFor, aAgainst):
    """Predicted score margin in favour of the home team.

    When the four inputs do not sum to a positive number (the all-zero
    "no history" case), the result is the constant 3. Otherwise it is
    (hFor + aAgainst)/2 - (aFor + hAgainst)/2, plus the same constant 3
    (presumably a home-advantage offset -- confirm).
    """
    total = sum((hFor, hAgainst, aFor, aAgainst))
    if not total > 0:
        return 3
    home_side = (hFor + aAgainst)/2
    away_side = (aFor + hAgainst)/2
    return home_side - away_side + 3
def get_predicted_winner(diff_xy, home_team, away_team):
    """Return ``home_team`` when ``diff_xy`` is zero or positive, else ``away_team``."""
    return home_team if diff_xy >= 0 else away_team
def is_prediction_correct(winner, predicted_winner):
    """Return True when the predicted winner equals the actual winner, else False."""
    return bool(winner == predicted_winner)
# Score the baseline for every window size: read the feature file written
# for that window, predict each game's winner per year, and record the
# percentage of correct predictions. ansB2 has one row per window size and
# one column per year; tmpB2 is the mean accuracy across years per window.
dfs = []
for i in range(1, 50):
    csv_name = "baseline2/data_iter_"+str(i)+".csv"
    trdfn = pd.read_csv(csv_name, index_col=0)
    # A sorted list instead of a bare set: the column order becomes
    # deterministic, and newer pandas refuses a set as an index.
    years = sorted(set(trdfn.year))
    lst = []
    for year in years:
        # Work on an independent copy so the derived columns are not
        # assigned onto a filtered slice of trdfn.
        trdfny = trdfn[trdfn.year == year].copy()
        trdfny['diff_xy'] = trdfny.apply(
            lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']),
            axis=1)
        trdfny['predicted_winner'] = trdfny.apply(
            lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']),
            axis=1)
        trdfny['correct_prediction'] = trdfny.apply(
            lambda x: is_prediction_correct(x['winner'], x['predicted_winner']),
            axis=1)
        # Percentage of this year's games predicted correctly.
        lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
    dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
ansB2 = pd.concat(dfs)
tmpB2 = ansB2.T.mean()
tmpB2
1 60.565057 2 64.275654 3 64.916621 4 66.238334 5 66.206500 6 66.968334 7 67.559769 8 67.898077 9 68.060523 10 68.005607 11 68.275960 12 68.330202 13 68.091012 14 68.034593 15 68.259817 16 68.484057 17 68.407818 18 68.412952 19 68.248700 20 67.857821 21 68.004247 22 68.095474 23 67.652597 24 67.851372 25 67.810244 26 67.703698 27 67.551977 28 67.348082 29 67.630853 30 67.333432 31 67.358957 32 67.165778 33 67.266391 34 67.043527 35 67.114855 36 67.215772 37 66.827336 38 66.691250 39 66.689031 40 66.670639 41 66.503925 42 66.568545 43 66.442503 44 66.516423 45 66.487398 46 66.407171 47 66.231543 48 66.129490 49 66.037590 dtype: float64
# Highest mean accuracy reached across the window sizes held in tmpB2.
max(tmpB2)
68.484057376234418
# Plot the mean prediction accuracy against the number of past games used.
tmpB2.plot(label="Baseline 2")
# Grid switch passed positionally: the keyword was spelled ``b`` in older
# matplotlib and renamed later, so the positional form is the portable one.
plt.grid(True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accuracy')  # label typo ("accurcay") corrected
plt.ylim([0, 69])
plt.legend()
plt.show()
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/font_manager.py:1236: UserWarning: findfont: Font family ['serif'] not found. Falling back to Bitstream Vera Sans (prop.get_family(), self.defaultFamily[fontext]))
# Alias (same object, not a copy) of the full training frame, used as the
# input of the alpha-weighted feature generation.
trdf_alpha = trainDF
def get_historic_games(team, gameId, trdf):
    """Collect the rows of ``trdf`` where ``team`` played before ``gameId``.

    Parameters:
        team   -- value matched against the ``home_team`` and ``away_team``
                  columns.
        gameId -- upper bound (exclusive) on ``game_id``.
        trdf   -- DataFrame with ``game_id``, ``home_team`` and
                  ``away_team`` columns.

    Returns:
        The matching rows as a DataFrame ordered by row index (possibly
        empty), or ``False`` when the two selections cannot be
        concatenated.
    """
    before = trdf[trdf.game_id < gameId]
    home_rows = before[before.home_team == team]
    away_rows = before[before.away_team == team]
    try:
        merged = pd.concat([home_rows, away_rows])
    except ValueError:
        # Non-DataFrame sentinel: callers check isinstance(..., pd.DataFrame).
        return False
    # The sorted frame must be the one returned. Previously the sort result
    # was thrown away and the home-rows-then-away-rows concat came back, so
    # tail slices did not select the highest-index rows.
    return merged.sort_index()
def get_points(team, df, alpha):
    """Alpha-weighted average points scored and conceded by ``team``.

    Parameters:
        team  -- value matched against each row's ``home_team`` / ``away_team``.
        df    -- DataFrame with ``home_team``, ``away_team``, ``home_score``
                 and ``away_score`` columns.
        alpha -- base of the weights; the k-th matching row (counting from
                 1, in the row order of ``df``) gets weight ``alpha**k``.

    Returns:
        ``(points_for, points_against)``: the weighted means, each rounded
        and converted to ``int``.

    NOTE(review): for alpha < 1 the first row of ``df`` carries the largest
    weight; if ``df`` is ordered oldest-first this favours older games --
    confirm that is the intended direction.
    """
    scored = []
    conceded = []
    for _, game in df.iterrows():
        home_name, away_name = game.home_team, game.away_team
        home_pts, away_pts = game.home_score, game.away_score
        if team == home_name:
            scored.append(home_pts)
            conceded.append(away_pts)
        elif team == away_name:
            scored.append(away_pts)
            conceded.append(home_pts)

    weights = [alpha**k for k in range(1, len(scored) + 1)]
    weight_sum = float(sum(weights))
    weighted_for = np.dot(scored, weights)/weight_sum
    weighted_against = np.dot(conceded, weights)/weight_sum
    return (int(round(weighted_for)), int(round(weighted_against)))
# For each alpha in 0.5, 0.6, ..., 1.0 and window size 17, annotate every
# training game with both teams' alpha-weighted points for/against over
# their last 17 earlier games, then write the table to
# baseline2/alpha<alpha>/train_data_iter_17.csv.
# NOTE(review): the directory name comes from str(alpha) of an np.arange
# float, so it depends on how the interpreter prints that float -- verify.
for alpha in np.arange(0.5, 1.01, 0.1):
    for i in range(17, 18):
        lst = []
        for r, v in trdf_alpha.iterrows():
            gameId, home, away = v.game_id, v.home_team, v.away_team
            homehGames = get_historic_games(home, gameId, trdf_alpha)
            awayhGames = get_historic_games(away, gameId, trdf_alpha)

            # Zero sentinels stay in place unless both teams have at least
            # i earlier games.
            hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            if (isinstance(homehGames, pd.DataFrame)
                    and isinstance(awayhGames, pd.DataFrame)
                    and len(homehGames.index) >= i
                    and len(awayhGames.index) >= i):
                homehGames = homehGames[-i:]
                awayhGames = awayhGames[-i:]
                hFor, hAgainst = get_points(home, homehGames, alpha)
                aFor, aAgainst = get_points(away, awayhGames, alpha)

            # Extend the game row with the four features, keeping r as label.
            features = pd.Series(
                [hFor, hAgainst, aFor, aAgainst],
                index=['hFor', 'hAgainst', 'aFor', 'aAgainst'],
                name=r,
            )
            v = pd.concat([v, features])
            lst.append(pd.DataFrame(v).T)

        trdf_alpha_n = pd.concat(lst)
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdf_alpha_n.to_csv(csv_name)
def get_diff(hFor, hAgainst, aFor, aAgainst):
    """Expected home-minus-away margin built from the four point averages.

    Returns the constant 3 when the inputs do not sum to a positive value;
    otherwise (hFor + aAgainst)/2 - (aFor + hAgainst)/2 + 3. The constant
    is presumably a home-advantage offset -- confirm.
    """
    has_history = sum([hFor, hAgainst, aFor, aAgainst]) > 0
    if has_history:
        home_expected = (hFor + aAgainst)/2
        away_expected = (aFor + hAgainst)/2
        return home_expected - away_expected + 3
    return 3
def get_predicted_winner(diff_xy, home_team, away_team):
    """Pick the home team for a non-negative margin, the away team otherwise."""
    if not diff_xy >= 0:
        return away_team
    return home_team
def is_prediction_correct(winner, predicted_winner):
    """Tell whether ``predicted_winner`` matches ``winner`` (True / False)."""
    return True if winner == predicted_winner else False
# Score the alpha-weighted baseline: for each alpha, read the feature file
# for window size 17, compute the per-year percentage of correctly
# predicted winners, and average over years. `ans` ends up with one column
# per alpha; its transpose is displayed as the final expression.
adfs = []
for alpha in np.arange(0.5, 1.01, 0.1):
    dfs = []
    for i in range(17, 18):
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdfn = pd.read_csv(csv_name, index_col=0)
        # A sorted list instead of a bare set: deterministic column order,
        # and newer pandas refuses a set as an index.
        years = sorted(set(trdfn.year))
        lst = []
        for year in years:
            # Independent copy so the derived columns are not assigned onto
            # a filtered slice of trdfn.
            trdfny = trdfn[trdfn.year == year].copy()
            trdfny['diff_xy'] = trdfny.apply(
                lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']),
                axis=1)
            trdfny['predicted_winner'] = trdfny.apply(
                lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']),
                axis=1)
            trdfny['correct_prediction'] = trdfny.apply(
                lambda x: is_prediction_correct(x['winner'], x['predicted_winner']),
                axis=1)
            # Percentage of this year's games predicted correctly.
            lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
        dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
    tmp = pd.concat(dfs)
    tmp = tmp.T.mean()
    adfs.append(pd.DataFrame(tmp, columns=[alpha]).T)
ans = pd.concat(adfs).T
ans.T
17 | |
---|---|
0.5 | 64.177477 |
0.6 | 64.768426 |
0.7 | 65.269568 |
0.8 | 66.582841 |
0.9 | 67.668979 |
1.0 | 68.406462 |
6 rows × 1 columns