College Football Baseline 1¶

Importing necessary libraries.

In [1]:

%pylab inline

from collections import defaultdict

import cPickle as pickle
import math
import numpy as np
import pandas as pd
import pylab
import glob
import seaborn

Populating the interactive namespace from numpy and matplotlib

In [2]:

colors = seaborn.color_palette()

from matplotlib import rc

rc("figure", facecolor="white")
rc("axes", facecolor="white")
rc("axes", edgecolor="grey")


rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
rc("grid", linestyle=":")


rc('font',**{'family':'serif','serif':['Palatino']})

In [3]:

# A utility function to compare the results side-by-side
def side_by_side(*objs, **kwds):
    from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print adjoin(space, *reprs)

Creating Train and Test Set¶

In [4]:

trainDF = pd.read_csv('trainCOL.csv', index_col=0)
testDF = pd.read_csv('testCOL.csv', index_col=0)

In [5]:

trainDF = trainDF.sort(columns='game_id').reset_index(drop=True)
testDF = testDF.sort(columns='game_id').reset_index(drop=True)

Train Data operations¶

In [6]:

trdf = trainDF[['game_id','year','home_team','away_team','home_score','away_score','winner']]

In [7]:

def get_historic_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [8]:

def get_points(team, df):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    return (sum(pFor)/len(df), sum(pAgainst)/len(df))

In [36]:

for i in xrange(1, 50):
    lst = []
    for r, v in trdf.iterrows():
        gameId = v.game_id
        home = v.home_team
        away = v.away_team
        homehGames = get_historic_games(home, gameId, trdf)
        awayhGames = get_historic_games(away, gameId, trdf)
        if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
            if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
                homehGames = homehGames[-i:]
                awayhGames = awayhGames[-i:]
                hFor, hAgainst = get_points(home, homehGames)
                aFor, aAgainst = get_points(away, awayhGames)
            else:
                hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        else:
            hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
        v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
        lst.append(pd.DataFrame(v).T)
    trdfn = pd.concat(lst)
    csv_name = "baseline1/data_iter_"+str(i)+".csv"
    trdfn.to_csv(csv_name)

In [9]:

def get_diff(hFor, hAgainst, aFor, aAgainst):
    if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
        return (hFor + aAgainst)/2  - (aFor + hAgainst)/2
    else:
        return 0

In [10]:

def get_predicted_winner(diff_xy, home_team, away_team):
    if diff_xy >= 0:
        return home_team
    else:
        return away_team

In [11]:

def is_prediction_correct(winner, predicted_winner):
    if winner == predicted_winner:
        return True
    else:
        return False

In [40]:

dfs = []
for i in xrange(1, 50):
    csv_name = "baseline1/data_iter_"+str(i)+".csv"
    trdfn = pd.read_csv(csv_name, index_col=0)
    years = set(trdfn.year)
    lst = []
    for year in years:
        trdfny = trdfn[trdfn.year == year]
        trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
        trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
        trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
        lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
    dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
ansB1 = pd.concat(dfs)   

In [41]:

tmpB1 = ansB1.T.mean()

In [42]:

tmpB1

Out[42]:

1     58.679652
2     62.370012
3     63.960777
4     64.882653
5     65.617572
6     65.579237
7     65.843909
8     66.775720
9     66.865880
10    67.329603
11    67.123369
12    67.328187
13    67.439274
14    67.169722
15    67.578752
16    67.620286
17    67.446574
18    67.743059
19    67.651776
20    67.725993
21    67.773268
22    67.797476
23    67.791740
24    67.383125
25    67.435805
26    67.412654
27    67.730099
28    67.711859
29    67.205141
30    67.172648
31    67.321478
32    67.328854
33    67.320027
34    67.153167
35    67.389960
36    67.068088
37    66.746560
38    66.537536
39    66.470723
40    66.521872
41    66.614384
42    66.602449
43    66.398296
44    66.325003
45    66.336004
46    66.121544
47    66.256656
48    66.173984
49    66.148152
dtype: float64

In [43]:

max(tmpB1)

Out[43]:

67.797475992742307

In [53]:

tmpB1.plot(label="Baseline 1")
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Number of games considered')
plt.ylabel('Prediction accurcay')
plt.ylim([0, 70])
plt.legend()
plt.show()

Test Data Operations¶

In [12]:

trdf_alpha = trainDF

In [13]:

def get_historic_games(team, gameId, trdf):
    df = trdf[trdf.game_id < gameId]
    hdf = df[df.home_team == team]
    adf = df[df.away_team == team]
    newdf = pd.concat([hdf, adf])
    newdf = newdf.sort_index()
    try:
        return pd.concat([hdf, adf])
    except:
        return False

In [14]:

def get_points(team, df, alpha):
    pFor = []
    pAgainst = []
    for r, v in df.iterrows():
        hTeam = v.home_team
        aTeam = v.away_team
        hScore = v.home_score
        aScore = v.away_score
        if (team == hTeam):
            pFor.append(hScore)
            pAgainst.append(aScore)
        elif (team == aTeam):
            pFor.append(aScore)
            pAgainst.append(hScore)
    numGames = len(pFor)
    weights = []
    for i in xrange(1, numGames+1):
        weights.append(alpha**i)
    return (int(round(np.dot(pFor, weights)/float(sum(weights)))), int(round(np.dot(pAgainst, weights)/float(sum(weights)))))

In [15]:

for alpha in np.arange(0.5, 1.01, 0.1):
    for i in xrange(17, 18):
        lst = []
        for r, v in trdf_alpha.iterrows():
            gameId = v.game_id
            home = v.home_team
            away = v.away_team
            homehGames = get_historic_games(home, gameId, trdf_alpha)
            awayhGames = get_historic_games(away, gameId, trdf_alpha)
            if (isinstance(homehGames, pd.DataFrame) and isinstance(awayhGames, pd.DataFrame)):
                if (len(homehGames.index) >= i and len(awayhGames.index) >= i):
                    homehGames = homehGames[-i:]
                    awayhGames = awayhGames[-i:]
                    hFor, hAgainst = get_points(home, homehGames, alpha)
                    aFor, aAgainst = get_points(away, awayhGames, alpha)
                else:
                    hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            else:
                hFor, hAgainst, aFor, aAgainst = 0, 0, 0, 0
            v = v.append(pd.Series([hFor, hAgainst, aFor, aAgainst], index=['hFor', 'hAgainst', 'aFor', 'aAgainst'], name=r))
            lst.append(pd.DataFrame(v).T)
        trdf_alpha_n = pd.concat(lst)
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdf_alpha_n.to_csv(csv_name)

In [16]:

def get_diff(hFor, hAgainst, aFor, aAgainst):
    if sum([hFor, hAgainst, aFor, aAgainst]) > 0:
        return (hFor + aAgainst)/2  - (aFor + hAgainst)/2
    else:
        return 0

In [17]:

def get_predicted_winner(diff_xy, home_team, away_team):
    if diff_xy >= 0:
        return home_team
    else:
        return away_team

In [18]:

def is_prediction_correct(winner, predicted_winner):
    if winner == predicted_winner:
        return True
    else:
        return False

In [19]:

adfs = []
for alpha in np.arange(0.5, 1.01, 0.1):
    dfs = []
    for i in xrange(17, 18):
        csv_name = "baseline2/alpha" + str(alpha) + "/train_data_iter_"+str(i)+".csv"
        trdfn = pd.read_csv(csv_name, index_col=0)
        years = set(trdfn.year)
        lst = []
        for year in years:
            trdfny = trdfn[trdfn.year == year]
            trdfny['diff_xy'] = trdfny.apply(lambda x: get_diff(x['hFor'], x['hAgainst'], x['aFor'], x['aAgainst']), axis=1)
            trdfny['predicted_winner'] = trdfny.apply(lambda x: get_predicted_winner(x['diff_xy'], x['home_team'], x['away_team']), axis=1)
            trdfny['correct_prediction'] = trdfny.apply(lambda x: is_prediction_correct(x['winner'], x['predicted_winner']), axis=1)
            lst.append(float(sum(trdfny.correct_prediction))*100/len(trdfny.index))
        dfs.append(pd.DataFrame(lst, columns=[i], index=years).T)
    tmp = pd.concat(dfs)
    tmp = tmp.T.mean()
    adfs.append(pd.DataFrame(tmp, columns=[alpha]).T)
ans = pd.concat(adfs).T

In [20]:

ans.T

Out[20]:

	17
0.5	63.453913
0.6	64.496069
0.7	65.155919
0.8	66.047140
0.9	67.005708
1.0	67.595777

6 rows × 1 columns