%pylab inline
Populating the interactive namespace from numpy and matplotlib
from collections import defaultdict
import cPickle as pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import seaborn
import statsmodels.api as sm
from matplotlib import rc
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
colors = seaborn.color_palette()
from matplotlib import rc
rc("figure", facecolor="white")
rc("axes", facecolor="white")
rc("axes", edgecolor="grey")
rc("grid", alpha=0.9)
rc("grid", linewidth=0.2)
rc("grid", linestyle=":")
rc('font',**{'family':'serif','serif':['Palatino']})
# Load the pre-built train/test splits; column 0 of each CSV is the row index.
trainDF = pd.read_csv('trainNFL.csv', index_col=0)
testDF = pd.read_csv('testNFL.csv', index_col=0)
# Regression target: positive means the home team won by that many points.
trainDF['score_diff'] = trainDF.home_score - trainDF.away_score
testDF['score_diff'] = testDF.home_score - testDF.away_score
# sort_values(by=...) replaces the removed DataFrame.sort(column=...) call
# (the one that triggered the FutureWarning below); games stay in
# chronological order by game_id with a fresh 0..n-1 index.
trainDF = trainDF.sort_values(by='game_id').reset_index(drop=True)
testDF = testDF.sort_values(by='game_id').reset_index(drop=True)
/Library/Python/2.7/site-packages/pandas-0.13.1_213_gc174c3d-py2.7-macosx-10.9-intel.egg/pandas/core/frame.py:2542: FutureWarning: column is deprecated, use columns warnings.warn("column is deprecated, use columns", FutureWarning)
def get_correct(score_diff, predicted_score):
    """Return 1 when the predicted margin picks the actual winner, else 0.

    A prediction counts as correct only when both margins are non-zero and
    share the same sign; a tie or a zero prediction scores 0.
    """
    return 1 if predicted_score * score_diff > 0 else 0
def check_accuracy(allStats):
    """Summarise prediction accuracy per year.

    Parameters
    ----------
    allStats : dict
        Maps year -> DataFrame carrying a 0/1 ``correct_prediction`` column.

    Returns
    -------
    dict mapping year -> [num_games, num_correct, pct_correct].
    """
    accuracy = dict()
    # .items() instead of the Python-2-only .iteritems() so this also runs on Python 3.
    for year, df in allStats.items():
        numGames = len(df.correct_prediction)
        numCorrect = sum(df.correct_prediction)
        pctCorrect = numCorrect * 100 / float(numGames)
        accuracy[year] = [numGames, numCorrect, pctCorrect]
    return accuracy
def initialize(data):
    """Fit a normalized OLS regression of ``score_diff`` on the module-level
    ``features`` columns of ``data`` and return the fitted model.
    """
    # .loc replaces the long-deprecated (and since removed) DataFrame.ix indexer.
    X = data.loc[:, features]
    y = data.loc[:, 'score_diff']
    # NOTE(review): the `normalize=` kwarg was removed from sklearn >= 1.2;
    # wrap X in a StandardScaler pipeline if running on a modern sklearn.
    reg = LinearRegression(fit_intercept=True, normalize=True)
    model = reg.fit(X, y)
    return model
def get_games_till_now(game, df):
    """Return every row of ``df`` played strictly before ``game`` (by game_id)."""
    cutoff = int(game.game_id)
    return df[df.game_id < cutoff]
def get_game_features(g, gamesTillNow):
    """Average the ``features`` columns over the last 15 prior games in which
    either team of matchup ``g`` played.

    Parameters
    ----------
    g : row with ``home_team`` / ``away_team`` attributes.
    gamesTillNow : DataFrame of games played before ``g``.

    Returns
    -------
    (numGames, average) where ``average`` is a one-row DataFrame of rounded
    per-feature means over at most the 15 most recent relevant games.
    """
    teams = [g.home_team, g.away_team]
    # Vectorized filter replaces the original per-row iterrows() loop (which
    # also coerced every row to object dtype via DataFrame(game).T): keep any
    # game in which either side of the upcoming matchup participated.
    mask = gamesTillNow.home_team.isin(teams) | gamesTillNow.away_team.isin(teams)
    recent = gamesTillNow[mask].tail(15)
    numGames = len(recent)
    # Rounded column means; with no relevant history this yields a NaN row
    # instead of the ValueError the concat-based version raised.
    average = pd.DataFrame(recent[features].mean().round()).T
    return (numGames, average[features])
def get_updated_data(game, df, g):
    """Return the most recent ``g`` rows of ``df`` up to and including ``game``."""
    cutoff = int(game.game_id)
    window = df[df.game_id <= cutoff]
    return window.iloc[-g:]
def train_updated_model(data):
    """Fit a normalized OLS regression of ``score_diff`` on ``features``.

    This was a line-for-line duplicate of ``initialize``; delegate to it so
    there is a single definition of the model-fitting recipe.
    """
    return initialize(data)
def run_model(model, gameFeatures):
    """Predict the rounded integer score differential for one feature row.

    Positive = home team predicted to win by that margin.
    """
    # Extract the scalar from the 1-element prediction array explicitly:
    # int() applied to an ndarray is deprecated (now an error) in recent NumPy.
    prediction = model.predict(gameFeatures[features])[0]
    # np.round keeps the original (half-to-even) rounding behaviour.
    return int(np.round(prediction))
# Per-team box-score stat codes; each appears once with a '_home' and once
# with an '_away' suffix in `features`.  Rebuilding the list programmatically
# fixes the original source line, which had been wrapped mid-token
# ('RZA_away','R / ZC_away') and was therefore a syntax error, and removes
# the duplicated home/away transcription.
_TEAM_STATS = [
    '1QP', '2QP', '3QP', '4QP', 'RFD', 'PFD', 'IFD', 'RY', 'RA', 'PY', 'PA',
    'PC', 'SK', 'INT', 'FUM', 'PU', 'GPY', 'PR', 'PRY', 'KR', 'KRY', 'IR',
    'IRY', 'PEN', 'TOP', 'TD', 'TDR', 'TDP', 'TDT', 'FGM', 'FGAT', 'FGY',
    'RZA', 'RZC', 'BRY', 'BPY', 'SRP', 'S1RP', 'S2RP', 'S3RP', 'SPP', 'S1PP',
    'S2PP', 'S3PP', 'LEA', 'LEY', 'LTA', 'LTY', 'LGA', 'LGY', 'MDA', 'MDY',
    'RGA', 'RGY', 'RTA', 'RTY', 'REA', 'REY', 'R1A', 'R1Y', 'R2A', 'R2Y',
    'R3A', 'R3Y', 'QBA', 'QBY', 'SLA', 'SLY', 'SMA', 'SMY', 'SRA', 'SRY',
    'DLA', 'DLY', 'DMA', 'DMY', 'DRA', 'DRY', 'WR1A', 'WR1Y', 'WR3A', 'WR3Y',
    'TEA', 'TEY', 'RBA', 'RBY', 'SGA', 'SGY', 'P1A', 'P1Y', 'P2A', 'P2Y',
    'P3A', 'P3Y', 'SPC', 'MPC', 'LPC', 'Q1RA', 'Q1RY', 'Q1PA', 'Q1PY',
    'LCRA', 'LCRY', 'LCPA', 'LCPY', 'RZRA', 'RZRY', 'RZPA', 'RZPY', 'SKY',
    'LBS', 'DBS', 'SFPY', 'DRV', 'NPY', 'TB', 'I20', 'RTD', 'LNR', 'LNP',
    'LBR', 'LBP', 'DBR', 'DBP', 'NHA', 'S3A', 'S3C', 'L3A', 'L3C', 'STF',
    'DP', 'FSP', 'OHP', 'PBEP', 'DLP', 'DSP', 'DMP', 'PFN',
]

# Game-level context columns that are not team stats.
# NOTE(review): 'home_score' and 'away_score' are included as features while
# the target is score_diff = home_score - away_score — this target leakage is
# what produces the ~100% accuracies in the first result tables. Confirm
# whether that is intentional before trusting those numbers.
_GAME_CONTEXT = ['temperature', 'humidity', 'wind_speed', 'over_or_under',
                 'visitor_point_spread', 'away_score', 'home_score']

features = (_GAME_CONTEXT
            + [s + '_home' for s in _TEAM_STATS]
            + [s + '_away' for s in _TEAM_STATS])
# (display name, estimator) pairs keep each model permanently aligned with
# its label; the two parallel lists below are derived from this single table.
_model_specs = [
    ('Linear Regression', LinearRegression(fit_intercept=True)),
    ('Linear Regression Normalized', LinearRegression(fit_intercept=True, normalize=True)),
    ('Ridge Regression', Ridge(fit_intercept=True)),
    ('Ridge Regression Normalized', Ridge(fit_intercept=True, normalize=True)),
    ('Lasso Regression', Lasso(fit_intercept=True)),
    ('Lasso Regression Normalized', Lasso(fit_intercept=True, normalize=True)),
]
clfs = [estimator for _, estimator in _model_specs]
clf_names = [name for name, _ in _model_specs]
# Fit each candidate regressor on seasons 2001+ and tabulate its per-year
# accuracy on the SAME data it was trained on (in-sample sanity check).
train_results = []
models = {}
for (i, clf_) in enumerate(clfs):
    # .copy() so the prediction columns added below don't write into a view
    # of trainDF (pandas SettingWithCopy ambiguity).
    data = trainDF[trainDF.year >= 2001].copy()
    # .loc replaces the removed DataFrame.ix indexer.
    X = data.loc[:, features]
    y = data.loc[:, 'score_diff']
    clf = clf_.fit(X, y)
    models[clf_names[i]] = clf
    predicted_train = clf.predict(X)
    p = [int(round(val)) for val in predicted_train]
    data['predicted_score_diff'] = pd.Series(p, index=data.index)
    data['correct_prediction'] = data.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(data.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = data[data.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [clf_names[i]]
    train_results.append(accuracy.T)
train_resultdf = pd.concat(train_results).T
train_resultdf  # notebook echo of the per-year accuracy table
Linear Regression | Linear Regression Normalized | Ridge Regression | Ridge Regression Normalized | Lasso Regression | Lasso Regression Normalized | |
---|---|---|---|---|---|---|
2001 | 100.000 | 100.000 | 100.000 | 94.83871 | 100.000 | 54.193548 |
2002 | 100.000 | 100.000 | 100.000 | 93.75000 | 100.000 | 59.375000 |
2003 | 100.000 | 100.000 | 100.000 | 95.00000 | 100.000 | 61.875000 |
2004 | 100.000 | 100.000 | 100.000 | 95.00000 | 100.000 | 55.000000 |
2005 | 100.000 | 100.000 | 100.000 | 93.12500 | 100.000 | 56.875000 |
2006 | 100.000 | 100.000 | 100.000 | 93.75000 | 100.000 | 53.125000 |
2007 | 100.000 | 100.000 | 100.000 | 98.12500 | 100.000 | 54.375000 |
2008 | 100.000 | 100.000 | 100.000 | 94.37500 | 100.000 | 56.250000 |
2009 | 100.000 | 100.000 | 100.000 | 93.75000 | 100.000 | 58.125000 |
2010 | 100.000 | 100.000 | 100.000 | 96.87500 | 100.000 | 57.500000 |
2011 | 100.000 | 100.000 | 100.000 | 92.50000 | 100.000 | 54.375000 |
2012 | 100.000 | 100.000 | 100.000 | 92.50000 | 100.000 | 52.500000 |
2013 | 99.375 | 99.375 | 99.375 | 91.87500 | 99.375 | 57.500000 |
13 rows × 6 columns
# Bar chart of per-year in-sample accuracy, one bar group per season.
# `plt` is matplotlib.pyplot (injected by %pylab in the original notebook).
train_resultdf.plot(kind='bar', figsize=(17, 8))
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Years')
plt.ylabel('Prediction accuracy')  # fixed typo: was "accurcay"
plt.ylim([0, 100])
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=3)
plt.show()
/Library/Python/2.7/site-packages/matplotlib-1.4.x-py2.7-macosx-10.9-intel.egg/matplotlib/font_manager.py:1240: UserWarning: findfont: Font family [u'serif'] not found. Falling back to Bitstream Vera Sans (prop.get_family(), self.defaultFamily[fontext]))
# Score every trained model on the held-out games (seasons 2001+) using the
# true per-game feature rows, and tabulate per-year accuracy.
test_results = []
# .items() instead of the Python-2-only .iteritems().
for model_name, model in models.items():
    # .copy() so the column inserts below don't write into a view of testDF.
    data_test = testDF[testDF.year > 2000].copy()
    X_test = data_test.loc[:, features]  # .loc replaces the removed .ix
    predicted_test = model.predict(X_test)
    pt = [int(round(val)) for val in predicted_test]
    data_test['predicted_score_diff'] = pd.Series(pt, index=data_test.index)
    data_test['correct_prediction'] = data_test.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(data_test.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = data_test[data_test.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [model_name]
    test_results.append(accuracy.T)
test_resultdf = pd.concat(test_results).T
test_resultdf  # notebook echo of the per-year accuracy table
Ridge Regression Normalized | Lasso Regression Normalized | Lasso Regression | Ridge Regression | Linear Regression | Linear Regression Normalized | |
---|---|---|---|---|---|---|
2001 | 96.153846 | 57.692308 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2002 | 94.392523 | 57.943925 | 99.065421 | 99.065421 | 99.065421 | 99.065421 |
2003 | 94.392523 | 60.747664 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2004 | 93.457944 | 58.878505 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2005 | 94.392523 | 60.747664 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2006 | 97.196262 | 55.140187 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2007 | 95.327103 | 60.747664 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2008 | 92.523364 | 57.009346 | 99.065421 | 99.065421 | 99.065421 | 99.065421 |
2009 | 93.457944 | 56.074766 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2010 | 96.261682 | 52.336449 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2011 | 97.196262 | 61.682243 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
2012 | 92.523364 | 63.551402 | 99.065421 | 99.065421 | 99.065421 | 99.065421 |
2013 | 96.261682 | 62.616822 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
13 rows × 6 columns
# Bar chart of per-year held-out accuracy, one bar group per season.
test_resultdf.plot(kind='bar', figsize=(17, 8))
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Years')
plt.ylabel('Prediction accuracy')  # fixed typo: was "accurcay"
plt.ylim([0, 100])
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=3)
plt.show()
def get_game_features(g, gamesTillNow, itr, alpha):
    """Exponentially-weighted average of the ``features`` columns over recent
    games involving either team of matchup ``g``.

    Replaces the earlier flat 15-game average: keeps the last ``itr`` games
    in which either team played, and weights game j (j = 1 for the most
    recent) by alpha**j before averaging, so newer games count more.

    Returns (numGames, one-row DataFrame of rounded weighted means).
    """
    teams = [g.home_team, g.away_team]
    # Vectorized filter replaces the original per-row iterrows() loop.
    mask = gamesTillNow.home_team.isin(teams) | gamesTillNow.away_team.isin(teams)
    recent = gamesTillNow[mask].tail(itr)[features]
    # numGames was computed twice in the original (identical values); once is enough.
    numGames = len(recent.index)
    # weights = [alpha**1, ..., alpha**numGames]; reversed below so the most
    # recent game (last row) receives the largest weight alpha**1.
    weights = alpha ** np.arange(1, numGames + 1)
    average = recent.mul(weights[::-1], axis=0) / float(weights.sum())
    average = pd.DataFrame(average.sum()).T.apply(np.round)
    return (numGames, average[features])
test2_results = []
# History pool for feature averaging: all training games from 2001 on.
# (The original relied on `data` leaking out of the training loop above;
# make the dependency explicit.)
data = trainDF[trainDF.year >= 2001]
# .items() instead of the Python-2-only .iteritems().
for model_name, model in models.items():
    itr = 15      # window: how many recent relevant games to average over
    alpha = 0.9   # exponential decay applied to older games
    mtemp = []
    # .copy() so the column insert below doesn't write into a view of testDF.
    data_test = testDF[testDF.year > 2007].copy()
    # Predict each test game from the weighted average of its teams' history
    # (no peeking at the game's own box score, unlike the previous section).
    for ix, game in data_test.iterrows():
        gamesTillNow = get_games_till_now(game, data)
        numGames, gameFeatures = get_game_features(game, gamesTillNow, itr, alpha)
        gameResult = run_model(model, gameFeatures)
        mtemp.append(gameResult)
    data_test['predicted_score_diff'] = pd.Series(mtemp, index=data_test.index)
    cols = ['year', 'week', 'home_team', 'away_team', 'home_score', 'away_score',
            'winner', 'score_diff', 'predicted_score_diff']
    process = data_test[cols].copy()
    process['error'] = process.score_diff - process.predicted_score_diff
    process['correct_prediction'] = process.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(process.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = process[process.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [model_name]
    test2_results.append(accuracy.T)
test2_resultdf = pd.concat(test2_results).T
test2_resultdf  # notebook echo of the per-year accuracy table
Ridge Regression Normalized | Lasso Regression Normalized | Lasso Regression | Ridge Regression | Linear Regression | Linear Regression Normalized | |
---|---|---|---|---|---|---|
2008 | 46.728972 | 57.009346 | 44.859813 | 44.859813 | 44.859813 | 44.859813 |
2009 | 46.728972 | 56.074766 | 46.728972 | 46.728972 | 46.728972 | 46.728972 |
2010 | 44.859813 | 52.336449 | 44.859813 | 44.859813 | 44.859813 | 44.859813 |
2011 | 40.186916 | 61.682243 | 42.056075 | 42.056075 | 42.056075 | 42.056075 |
2012 | 53.271028 | 63.551402 | 51.401869 | 51.401869 | 51.401869 | 51.401869 |
2013 | 50.467290 | 62.616822 | 52.336449 | 52.336449 | 52.336449 | 52.336449 |
6 rows × 6 columns
# Bar chart of per-year accuracy when predicting from team-history averages.
test2_resultdf.plot(kind='bar', figsize=(17, 8))
plt.grid(b=True, which='major', color='gray', linestyle=':')
plt.xlabel('Years')
plt.ylabel('Prediction accuracy')  # fixed typo: was "accurcay"
plt.ylim([0, 100])
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True, shadow=True, ncol=3)
plt.show()
def get_game_features(g, gamesTillNow, itr, alpha):
    """Exponentially-weighted average of the ``features`` columns over the
    most recent games involving either team of matchup ``g``.

    NOTE(review): this is a byte-identical redefinition of the function
    defined earlier in the file; only this last definition is in effect.

    g : row with home_team / away_team / year attributes.
    gamesTillNow : DataFrame of games played before ``g``.
    itr : window size (number of recent relevant games kept).
    alpha : decay factor (< 1) so newer games weigh more.
    Returns (numGames, one-row DataFrame of rounded weighted means).
    """
    hTeam = g.home_team
    aTeam = g.away_team
    gameYear = int(g.year)  # unused in the body
    games = []
    # Keep every prior game in which either team of this matchup played.
    for row, game in gamesTillNow.iterrows():
        rhTeam = game.home_team
        raTeam = game.away_team
        if ((rhTeam == hTeam) or (rhTeam == aTeam) or (raTeam == hTeam) or (raTeam == aTeam)):
            games.append(pd.DataFrame(game).T)
    games = games[-itr:]  # last `itr` relevant games only
    numGames = len(games)
    games = pd.concat(games)  # raises ValueError if no relevant history exists
    games = games[features]
    numGames = len(games.index)  # recomputed; same value as three lines above
    weights = []
    # weights = [alpha**1, ..., alpha**numGames]
    for i in xrange(1, numGames+1):
        weights.append(alpha**i)
    weights = np.array(weights)
    # Reversed so the most recent game (last row) gets the largest weight
    # alpha**1, then normalised by the weight total before summing rows.
    average = (games.mul(weights[::-1], axis=0)/float(sum(weights)))
    average = pd.DataFrame(average.sum()).T.apply(np.round)
    return (numGames, average[features])
# Repeat the history-average experiment on TRAINING-set games from 2008 on,
# to compare against the held-out numbers in the previous section.
# History pool for feature averaging: all training games from 2001 on.
data = trainDF[trainDF.year >= 2001]
train2_results = []
# .items() instead of the Python-2-only .iteritems().
for model_name, model in models.items():
    itr = 15      # window: how many recent relevant games to average over
    alpha = 0.9   # exponential decay applied to older games
    mtemp = []
    # .copy() so the column insert below doesn't write into a view of trainDF.
    data_train = trainDF[trainDF.year > 2007].copy()
    for ix, game in data_train.iterrows():
        gamesTillNow = get_games_till_now(game, data)
        numGames, gameFeatures = get_game_features(game, gamesTillNow, itr, alpha)
        gameResult = run_model(model, gameFeatures)
        mtemp.append(gameResult)
    data_train['predicted_score_diff'] = pd.Series(mtemp, index=data_train.index)
    cols = ['year', 'week', 'home_team', 'away_team', 'home_score', 'away_score',
            'winner', 'score_diff', 'predicted_score_diff']
    process = data_train[cols].copy()
    process['error'] = process.score_diff - process.predicted_score_diff
    process['correct_prediction'] = process.apply(
        lambda x: get_correct(x['score_diff'], x['predicted_score_diff']), axis=1)
    # np.unique instead of the bare `unique` that only existed via %pylab.
    years = [int(yr) for yr in np.unique(process.year)]
    ansdf = {}
    for yr in years:
        ansdf[yr] = process[process.year == yr]
    accuracy = check_accuracy(ansdf)
    accuracy = pd.DataFrame(accuracy).T
    accuracy.columns = ['num_games', 'num_correct_prediction', 'pct_correct_prediction']
    accuracy = accuracy[['pct_correct_prediction']]
    accuracy.columns = [model_name]
    train2_results.append(accuracy.T)
train2_resultdf = pd.concat(train2_results).T
train2_resultdf  # notebook echo of the per-year accuracy table
Ridge Regression Normalized | Lasso Regression Normalized | Lasso Regression | Ridge Regression | Linear Regression | Linear Regression Normalized | |
---|---|---|---|---|---|---|
2008 | 47.500 | 56.250 | 49.375 | 49.375 | 49.375 | 49.375 |
2009 | 48.125 | 58.125 | 50.625 | 50.625 | 50.625 | 50.625 |
2010 | 46.250 | 57.500 | 46.875 | 46.875 | 46.875 | 46.875 |
2011 | 50.625 | 54.375 | 51.250 | 51.250 | 51.250 | 51.250 |
2012 | 44.375 | 52.500 | 43.750 | 43.750 | 43.750 | 43.750 |
2013 | 43.750 | 57.500 | 46.875 | 46.875 | 46.875 | 46.875 |
6 rows × 6 columns