Importing necessary libraries.
%pylab inline
from collections import defaultdict
import cPickle as pickle
import math
import numpy as np
import pandas as pd
import pylab
import glob
import seaborn
Populating the interactive namespace from numpy and matplotlib
# Global plot styling: seaborn's current palette plus white backgrounds, grey
# axes borders, faint dotted grid lines and a serif (Palatino) font.
colors = seaborn.color_palette()

from matplotlib import rc

rc("figure", facecolor="white")
rc("axes", facecolor="white", edgecolor="grey")
rc("grid", alpha=0.9, linewidth=0.2, linestyle=":")
rc('font', family='serif', serif=['Palatino'])
# A utility function to compare the results side-by-side
def side_by_side(*objs, **kwds):
    """Print the ``repr`` of several objects next to each other, column-wise.

    Args:
        *objs: Objects to display; each one is rendered with ``repr`` and
            split into lines.
        **kwds: Only ``space`` is read (default 4); it is passed to pandas'
            ``adjoin`` helper as its first argument.
    """
    # ``adjoin`` has lived in different modules across pandas releases, so
    # try the newer location first and fall back to the one used originally.
    try:
        from pandas.io.formats.printing import adjoin
    except ImportError:
        from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    # A parenthesised single argument prints identically whether ``print``
    # is a statement (Python 2) or a function (Python 3).
    print(adjoin(space, *reprs))
# Load the train and test tables, order each by game_id and renumber the rows
# 0..n-1 so that positional index order follows game_id order.
# NOTE(review): DataFrame.sort(columns=...) only exists in old pandas
# releases; sort_values(by='game_id') is the later spelling - confirm the
# pandas version before switching.
trainDF = (
    pd.read_csv('trainCOL.csv', index_col=0)
    .sort(columns='game_id')
    .reset_index(drop=True)
)
testDF = (
    pd.read_csv('testCOL.csv', index_col=0)
    .sort(columns='game_id')
    .reset_index(drop=True)
)

# Narrow view of the training data holding only the columns used by baseline 1.
trdf = trainDF[[
    'game_id',
    'year',
    'home_team',
    'away_team',
    'home_score',
    'away_score',
    'winner',
]]
def get_historic_games(team, gameId, trdf):
    """Return every earlier game that ``team`` played, in index order.

    Args:
        team: Team identifier matched against ``home_team`` and ``away_team``.
        gameId: Only rows with ``game_id`` strictly below this value are kept.
        trdf: DataFrame with ``game_id``, ``home_team`` and ``away_team``
            columns.

    Returns:
        A DataFrame (possibly empty) of the matching rows, sorted by index.
        Callers take the last ``i`` rows as the most recent games, so the
        home and away rows must be interleaved by index rather than left as
        "all home games, then all away games".
    """
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    # Return the sorted frame; previously the sorted copy was built and then
    # thrown away in favour of a fresh, unsorted concat.
    return pd.concat([hdf, adf]).sort_index()
def get_points(team, df):
    """Average points scored and conceded by ``team`` over the games in ``df``.

    Args:
        team: Team identifier matched against ``home_team`` / ``away_team``.
        df: DataFrame with ``home_team``, ``away_team``, ``home_score`` and
            ``away_score`` columns.

    Returns:
        ``(points_for, points_against)``, each the sum over the rows that
        involve ``team`` divided by the total number of rows in ``df``.
        ``(0, 0)`` is returned for an empty frame.
    """
    numGames = len(df)
    if numGames == 0:
        # Nothing to average; avoids a division by zero.
        return (0, 0)
    pFor = []
    pAgainst = []
    for _, game in df.iterrows():
        if team == game.home_team:
            pFor.append(game.home_score)
            pAgainst.append(game.away_score)
        elif team == game.away_team:
            pFor.append(game.away_score)
            pAgainst.append(game.home_score)
    # Explicit float divisor so integer scores are not floor-divided under
    # Python 2.
    divisor = float(numGames)
    return (sum(pFor) / divisor, sum(pAgainst) / divisor)
# Baseline 1 feature generation: for every look-back window size i, extend
# each training game with the home and away teams' average points for/against
# over their last i earlier games, then write one CSV per window size.
for i in xrange(1, 50):
    lst = []
    for r, v in trdf.iterrows():
        gameId = v.game_id
        home = v.home_team
        away = v.away_team
        homehGames = get_historic_games(home, gameId, trdf)
        awayhGames = get_historic_games(away, gameId, trdf)

        # Features are only computed when both lookups produced a frame and
        # both teams have at least i earlier games; otherwise all four
        # features stay at 0.
        hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        if (isinstance(homehGames, pd.DataFrame)
                and isinstance(awayhGames, pd.DataFrame)
                and len(homehGames.index) >= i
                and len(awayhGames.index) >= i):
            homehGames = homehGames[-i:]
            awayhGames = awayhGames[-i:]
            hFor, hAgainst = get_points(home, homehGames)
            aFor, aAgainst = get_points(away, awayhGames)

        # Append the four features to the game's row and keep it as a
        # one-row frame so the rows can be concatenated afterwards.
        v = v.append(pd.Series(
            [hFor, hAgainst, aFor, aAgainst],
            index=['hFor', 'hAgainst', 'aFor', 'aAgainst'],
            name=r,
        ))
        lst.append(pd.DataFrame(v).T)

    trdfn = pd.concat(lst)
    csv_name = "baseline1/data_iter_" + str(i) + ".csv"
    trdfn.to_csv(csv_name)
def get_diff(hFor, hAgainst, aFor, aAgainst):
    """Expected score margin of the home team over the away team.

    The home side's estimate is the mean of its points scored and the away
    team's points conceded; the away side's estimate is the mirror image.

    Args:
        hFor: Home team's points scored.
        hAgainst: Home team's points conceded.
        aFor: Away team's points scored.
        aAgainst: Away team's points conceded.

    Returns:
        Home estimate minus away estimate, or 0 when the four inputs do not
        sum to a positive number (no history available).
    """
    if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
        # Divide by 2.0 so integer inputs are not floored separately under
        # Python 2; flooring each half can turn a half-point away advantage
        # into 0, which the caller treats as a home win.
        return (hFor + aAgainst) / 2.0 - (aFor + hAgainst) / 2.0
    return 0
def get_predicted_winner(diff_xy, home_team, away_team):
    """Pick the predicted winner from the expected margin.

    A margin of zero or more goes to ``home_team``; anything else goes to
    ``away_team``.
    """
    return home_team if diff_xy >= 0 else away_team
def is_prediction_correct(winner, predicted_winner):
    """Return True when the predicted winner equals the actual winner."""
    # bool() keeps the result a plain True/False, as before.
    return bool(winner == predicted_winner)
# Baseline 1 scoring: for every window size i, read the generated feature
# table back and compute the percentage of correctly predicted games per year.
dfs = []
for i in xrange(1, 50):
    csv_name = "baseline1/data_iter_"+str(i)+".csv"
    trdfn = pd.read_csv(csv_name, index_col=0)
    # A sorted list instead of a raw set: gives a deterministic year order
    # and a valid index for the per-year result frame below.
    years = sorted(set(trdfn.year))
    lst = []
    for year in years:
        # Work on an independent copy; adding columns to a filtered slice of
        # trdfn is chained assignment and triggers SettingWithCopyWarning.
        trdfny = trdfn[trdfn.year == year].copy()
        trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
        trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
        trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
        # Accuracy for this year, in percent.
        lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
    # One row per window size, one column per year.
    dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
ansB1 = pd.concat(dfs)
# Mean accuracy across years for each window size.
tmpB1 = ansB1.T.mean()
tmpB1
1 58.679652 2 62.370012 3 63.960777 4 64.882653 5 65.617572 6 65.579237 7 65.843909 8 66.775720 9 66.865880 10 67.329603 11 67.123369 12 67.328187 13 67.439274 14 67.169722 15 67.578752 16 67.620286 17 67.446574 18 67.743059 19 67.651776 20 67.725993 21 67.773268 22 67.797476 23 67.791740 24 67.383125 25 67.435805 26 67.412654 27 67.730099 28 67.711859 29 67.205141 30 67.172648 31 67.321478 32 67.328854 33 67.320027 34 67.153167 35 67.389960 36 67.068088 37 66.746560 38 66.537536 39 66.470723 40 66.521872 41 66.614384 42 66.602449 43 66.398296 44 66.325003 45 66.336004 46 66.121544 47 66.256656 48 66.173984 49 66.148152 dtype: float64
# Highest year-averaged accuracy over all look-back window sizes.
max(tmpB1)
67.797475992742307
# Plot year-averaged accuracy against the number of past games considered.
tmpB1.plot(label="Baseline 1")
# The on/off flag is passed positionally because its keyword name differs
# between matplotlib releases.
plt.grid(True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accuracy')
plt.ylim([0, 70])
plt.legend()
plt.show()
# Baseline 2 works on the training frame itself: this binds a second name to
# the same DataFrame object as trainDF, no copy is made.
trdf_alpha = trainDF
def get_historic_games(team, gameId, trdf):
    """Return every earlier game that ``team`` played, in index order.

    Args:
        team: Team identifier matched against ``home_team`` and ``away_team``.
        gameId: Only rows with ``game_id`` strictly below this value are kept.
        trdf: DataFrame with ``game_id``, ``home_team`` and ``away_team``
            columns.

    Returns:
        A DataFrame (possibly empty) of the matching rows, sorted by index,
        so that a trailing slice such as ``[-i:]`` mixes home and away games
        by index position instead of yielding mostly away games.
    """
    earlier = trdf[trdf.game_id < gameId]
    as_home = earlier[earlier.home_team == team]
    as_away = earlier[earlier.away_team == team]
    # The sorted frame is what gets returned; the original sorted a copy and
    # then returned a new, unsorted concat.
    return pd.concat([as_home, as_away]).sort_index()
def get_points(team, df, alpha):
    """Weighted average points scored and conceded by ``team`` in ``df``.

    The k-th row involving ``team`` (counting from 1, in frame order) gets
    weight ``alpha ** k``; both averages are rounded to the nearest integer.

    NOTE(review): for ``alpha < 1`` the weights shrink towards the end of the
    frame. If rows are in chronological order this gives the most recent
    games the least influence - confirm that this is the intended direction.

    Args:
        team: Team identifier matched against ``home_team`` / ``away_team``.
        df: DataFrame with ``home_team``, ``away_team``, ``home_score`` and
            ``away_score`` columns.
        alpha: Base of the geometric weights.

    Returns:
        ``(points_for, points_against)`` as ints; ``(0, 0)`` when no row in
        ``df`` involves ``team``.
    """
    pFor = []
    pAgainst = []
    for _, game in df.iterrows():
        if team == game.home_team:
            pFor.append(game.home_score)
            pAgainst.append(game.away_score)
        elif team == game.away_team:
            pFor.append(game.away_score)
            pAgainst.append(game.home_score)
    if not pFor:
        # No games to weight; the weight sum would be zero.
        return (0, 0)
    weights = [alpha ** k for k in range(1, len(pFor) + 1)]
    total = float(sum(weights))
    return (
        int(round(np.dot(pFor, weights) / total)),
        int(round(np.dot(pAgainst, weights) / total)),
    )
# Baseline 2 feature generation: same as baseline 1, but the per-team averages
# are alpha-weighted. One CSV is written per (alpha, window size) pair; the
# target directories are assumed to exist already.
for alpha in np.arange(0.5, 1.01, 0.1):
    # arange yields floats that need not be exact tenths, and str() of such a
    # float differs between numpy versions. Formatting to one decimal keeps
    # the directory names stable (alpha0.5 ... alpha1.0).
    alpha_tag = "%.1f" % alpha
    for i in xrange(17, 18):
        lst = []
        for r, v in trdf_alpha.iterrows():
            gameId = v.game_id
            home = v.home_team
            away = v.away_team
            homehGames = get_historic_games(home, gameId, trdf_alpha)
            awayhGames = get_historic_games(away, gameId, trdf_alpha)
            if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
                if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
                    # Keep only the last i earlier games of each team.
                    homehGames = homehGames[-i:]
                    awayhGames = awayhGames[-i:]
                    hFor, hAgainst = get_points(home, homehGames, alpha)
                    aFor, aAgainst = get_points(away, awayhGames, alpha)
                else:
                    # Not enough history for this window size.
                    hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            else:
                hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            # Extend the game's row with the four features.
            v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
            lst.append(pd.DataFrame(v).T)
        trdf_alpha_n = pd.concat(lst)
        csv_name = "baseline2/alpha" + alpha_tag + "/train_data_iter_"+str(i)+".csv"
        trdf_alpha_n.to_csv(csv_name)
def get_diff(hFor, hAgainst, aFor, aAgainst):
    """Expected score margin of the home team over the away team.

    Args:
        hFor: Home team's points scored.
        hAgainst: Home team's points conceded.
        aFor: Away team's points scored.
        aAgainst: Away team's points conceded.

    Returns:
        ``mean(hFor, aAgainst) - mean(aFor, hAgainst)``, or 0 when the four
        inputs do not sum to a positive number.
    """
    if sum([hFor, hAgainst, aFor, aAgainst]) <= 0:
        return 0
    home_estimate = (hFor + aAgainst) / 2.0
    away_estimate = (aFor + hAgainst) / 2.0
    # Float halves: with integer inputs Python 2 would floor each half, so a
    # -0.5 margin would become 0 and be scored as a home win.
    return home_estimate - away_estimate
def get_predicted_winner(diff_xy, home_team, away_team):
    """Return ``home_team`` for a non-negative margin, else ``away_team``."""
    if not diff_xy >= 0:
        return away_team
    return home_team
def is_prediction_correct(winner, predicted_winner):
    """Tell whether the prediction matches the recorded winner."""
    if winner != predicted_winner:
        return False
    return True
# Baseline 2 scoring: for every alpha (and window size), read the feature
# table back, compute per-year accuracy in percent, and average over years.
adfs = []
for alpha in np.arange(0.5, 1.01, 0.1):
    # One-decimal formatting yields the directory names alpha0.5 ... alpha1.0
    # regardless of how str() renders the arange float.
    alpha_tag = "%.1f" % alpha
    dfs = []
    for i in xrange(17, 18):
        csv_name = "baseline2/alpha" + alpha_tag + "/train_data_iter_"+str(i)+".csv"
        trdfn = pd.read_csv(csv_name, index_col=0)
        # Sorted list: deterministic order and a valid index for the frame
        # built below (a raw set is unordered).
        years = sorted(set(trdfn.year))
        lst = []
        for year in years:
            # Independent copy so adding columns is not chained assignment
            # on a slice of trdfn.
            trdfny = trdfn[trdfn.year == year].copy()
            trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
            trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
            trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
            lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
        # One row per window size, one column per year.
        dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
    tmp = pd.concat(dfs)
    # Mean accuracy across years for each window size.
    tmp = tmp.T.mean()
    adfs.append(pd.DataFrame(tmp, columns=[alpha]).T)
ans = pd.concat(adfs).T
# Rows: alpha values; columns: window sizes.
ans.T
17 | |
---|---|
0.5 | 63.453913 |
0.6 | 64.496069 |
0.7 | 65.155919 |
0.8 | 66.047140 |
0.9 | 67.005708 |
1.0 | 67.595777 |
6 rows × 1 columns