College Foorball Baseline 2¶

Importing necessary libraries.

In [1]:

%pylab inline

from collections import defaultdict

import cPickle as pickle
import math
import numpy as np
import pandas as pd
import pylab
import glob
import seaborn

Populating the interactive namespace from numpy and matplotlib

In [2]:

colors = seaborn.color_palette()

from matplotlib import rc

rc("figure", facecolor="white")
rc("axes", facecolor="white")
rc("axes", edgecolor="grey")


rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
rc("grid", linestyle=":")


rc('font',**{'family':'serif','serif':['Palatino']})

In [3]:

# A utility function to compare the results side-by-side
def side_by_side(*objs, **kwds):
    from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print adjoin(space, *reprs)

Creating Train and Test Set¶

In [4]:

trainDF = pd.read_csv('trainCOL.csv', index_col=0)
testDF = pd.read_csv('testCOL.csv', index_col=0)

In [5]:

trainDF = trainDF.sort(columns='game_id').reset_index(drop=True)
testDF = testDF.sort(columns='game_id').reset_index(drop=True)

Train Data operations¶

In [6]:

trdf = trainDF[['game_id','year','home_team','away_team','home_score','away_score','winner']]

In [7]:

def get_historic_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [8]:

def get_points(team, df):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    return (sum(pFor)/len(df), sum(pAgainst)/len(df))

In [ ]:

for i in xrange(1, 50):
    lst = []
    for r, v in trdf.iterrows():
        gameId = v.game_id
        home = v.home_team
        away = v.away_team
        homehGames = get_historic_games(home, gameId, trdf)
        awayhGames = get_historic_games(away, gameId, trdf)
        if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
            if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
                homehGames = homehGames[-i:]
                awayhGames = awayhGames[-i:]
                hFor, hAgainst = get_points(home, homehGames)
                aFor, aAgainst = get_points(away, awayhGames)
            else:
                hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        else:
            hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
        lst.append(pd.DataFrame(v).T)
    trdfn = pd.concat(lst)
    csv_name = "baseline2/data_iter_"+str(i)+".csv"
    trdfn.to_csv(csv_name)

In [9]:

def get_diff(hFor, hAgainst, aFor, aAgainst):
    if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
        return (hFor + aAgainst)/2  - (aFor + hAgainst)/2 + 3
    else:
        return 3

In [10]:

def get_predicted_winner(diff_xy, home_team, away_team):
    if diff_xy >= 0:
        return home_team
    else:
        return away_team

In [11]:

def is_prediction_correct(winner, predicted_winner):
    if winner == predicted_winner:
        return True
    else:
        return False

In [31]:

dfs = []
for i in xrange(1, 50):
    csv_name = "baseline2/data_iter_"+str(i)+".csv"
    trdfn = pd.read_csv(csv_name, index_col=0)
    years = set(trdfn.year)
    lst = []
    for year in years:
        trdfny = trdfn[trdfn.year == year]
        trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
        trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
        trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
        lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
    dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
ansB2 = pd.concat(dfs)    

In [32]:

tmpB2 = ansB2.T.mean()

In [33]:

tmpB2

Out[33]:

1     60.565057
2     64.275654
3     64.916621
4     66.238334
5     66.206500
6     66.968334
7     67.559769
8     67.898077
9     68.060523
10    68.005607
11    68.275960
12    68.330202
13    68.091012
14    68.034593
15    68.259817
16    68.484057
17    68.407818
18    68.412952
19    68.248700
20    67.857821
21    68.004247
22    68.095474
23    67.652597
24    67.851372
25    67.810244
26    67.703698
27    67.551977
28    67.348082
29    67.630853
30    67.333432
31    67.358957
32    67.165778
33    67.266391
34    67.043527
35    67.114855
36    67.215772
37    66.827336
38    66.691250
39    66.689031
40    66.670639
41    66.503925
42    66.568545
43    66.442503
44    66.516423
45    66.487398
46    66.407171
47    66.231543
48    66.129490
49    66.037590
dtype: float64

In [34]:

max(tmpB2)

Out[34]:

68.484057376234418

In [35]:

tmpB2.plot(label="Baseline 2")
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 69])
plt.legend()
plt.show()

/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/matplotlib/font_manager.py:1236: UserWarning: findfont: Font family ['serif'] not found. Falling back to Bitstream Vera Sans
  (prop.get_family(), self.defaultFamily[fontext]))

Test Data Operations¶

In [12]:

trdf_alpha = trainDF

In [13]:

def get_historic_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [14]:

def get_points(team, df, alpha):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    numGames = len(pFor)
    weights = []
    for i in xrange(1, numGames+1):
        weights.append(alpha**i)
    return (int(round(np.dot(pFor, weights)/float(sum(weights)))), int(round(np.dot(pAgainst, weights)/float(sum(weights)))))

In [21]:

for alpha in np.arange(0.5, 1.01, 0.1):
    for i in xrange(17, 18):
        lst = []
        for r, v in trdf_alpha.iterrows():
            gameId = v.game_id
            home = v.home_team
            away = v.away_team
            homehGames = get_historic_games(home, gameId, trdf_alpha)
            awayhGames = get_historic_games(away, gameId, trdf_alpha)
            if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
                if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
                    homehGames = homehGames[-i:]
                    awayhGames = awayhGames[-i:]
                    hFor, hAgainst = get_points(home, homehGames, alpha)
                    aFor, aAgainst = get_points(away, awayhGames, alpha)
                else:
                    hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            else:
                hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
            lst.append(pd.DataFrame(v).T)
        trdf_alpha_n = pd.concat(lst)
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdf_alpha_n.to_csv(csv_name)

In [22]:

def get_diff(hFor, hAgainst, aFor, aAgainst):
    if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
        return (hFor + aAgainst)/2  - (aFor + hAgainst)/2 + 3
    else:
        return 3

In [23]:

def get_predicted_winner(diff_xy, home_team, away_team):
    if diff_xy >= 0:
        return home_team
    else:
        return away_team

In [24]:

def is_prediction_correct(winner, predicted_winner):
    if winner == predicted_winner:
        return True
    else:
        return False

In [25]:

adfs = []
for alpha in np.arange(0.5, 1.01, 0.1):
    dfs = []
    for i in xrange(17, 18):
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdfn = pd.read_csv(csv_name, index_col=0)
        years = set(trdfn.year)
        lst = []
        for year in years:
            trdfny = trdfn[trdfn.year == year]
            trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
            trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
            trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
            lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
        dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
    tmp = pd.concat(dfs)
    tmp = tmp.T.mean()
    adfs.append(pd.DataFrame(tmp, columns=[alpha]).T)
ans = pd.concat(adfs).T

In [26]:

ans.T

Out[26]:

	17
0.5	64.177477
0.6	64.768426
0.7	65.269568
0.8	66.582841
0.9	67.668979
1.0	68.406462

6 rows × 1 columns