The League of Champions
Analyzing the return of investments made by English Premier League Clubs on their player purchases
Team : [ Natarajan Chakrapani , Kuldeep Kapade , Mark Davidoff ]
from IPython.core.display import Image
Image(filename='/Users/natarajan/Dropbox/Soccer - WWOD/Player_positions.png')
Image('http://www.soccer-training-guide.com/images/wikipedia_positions.png') #all player positions on the field
Image('http://www.soccer-training-guide.com/images/4-4-2-new.jpg')
def get_players_data(players, rows):
'''helper function to parse player data on who were transferred in the English premier league '''
for row in rows:
players_data = { }
for (i,item) in enumerate(row.find_all('td')):
#print i, item.get_text()
if i == 3:
players_data['p_name'] = item.get_text()
parts = item.find('a').get('href').split('/')
players_data['handle'] = parts[2]
players_data['id'] = parts[4].split('_')[1].split('.')[0]
if i == 4:
players_data['club'] = item.get_text()
if i == 5:
players_data['age'] = item.get_text()
if i == 7:
players_data['position'] = item.get_text()
if i == 9:
players_data['from'] = item.get_text()
if i == 11:
players_data['to'] = item.get_text()
if i == 12:
fee = item.get_text().split(' ')
players_data['transfer_fee_pounds'] = fee[0]
players_data['transfer_fee_euros'] = fee[1][1:]
players.append(players_data)
def fetch_data(players, year):
''' given a particular year , fetch transferred player data from English premier league'''
url = 'http://www.transfermarkt.co.uk/en/premier-league/transferrekorde/wettbewerb_GB1_%s_default_default_default_alle.html' % (year)
r = requests.get(url)
html_doc = r.text
soup = BeautifulSoup(html_doc)
table= soup.find('table' , {'class':"tabelle_grafik"})
rows = table.find_all('tr' , {'class':"hell"})
get_players_data(players, rows)
rows = table.find_all('tr' , {'class':"dunkel"})
get_players_data(players, rows[1:])
def extend_data(players, new_players, year):
for player in players:
url = 'http://www.transfermarkt.co.uk/en/%s/leistungsdaten/spieler_%s_%s.html' % (player['handle'], player['id'], year)
r = requests.get(url)
html_doc = r.text
soup = BeautifulSoup(html_doc)
table = soup.find('table' , {'class':"standard_tabelle"})
row = table.find('tr' , {'class':"hell"}) #Assuming its in first row
for (i,item) in enumerate(row.find_all('td')):
#print i, item.get_text()
if i == 2:
if item.get_text() != '-':
player['matches'] = item.get_text()
else:
player['matches'] = 0
if i == 3:
if item.get_text() != '-':
player['goals'] = item.get_text()
else:
player['goals'] = 0
if i == 5:
if item.get_text() != '-':
player['assists'] = item.get_text()
else:
player['assists'] = 0
if i == 11:
if item.get_text() != '-':
player['minutes_per_goal'] = item.get_text()
else:
player['minutes_per_goal'] = 0
if i == 12:
if item.get_text() != '-':
player['minutes'] = item.get_text()
else:
player['minutes'] = 0
new_players.append(player)
# dump data from source to pkl files - to be run just once
import pickle,requests
from bs4 import BeautifulSoup
import requests , re
players = []
years = ['2010', '2011' , '2012']
for year in years :
#year = '2012'
fetch_data(players, year)
new_players = []
extend_data(players, new_players, year)
f_name = "players_data_" + year +".pkl"
pickle.dump( new_players, open( f_name, "wb" ) )
#print new_players[0]
#pickle.dump( new_players, open( "players_data_2012.pkl", "wb" ) )
# load data from pkl files to a dataframe
import numpy as np
import pandas as pd
import pickle,matplotlib.pyplot as plt
from pandas import Series, DataFrame
f = open('players_data_2010.pkl')
#f = open(cloud.files.get('players_data_2010.pkl') )
p = pickle.load(f)
player_df_2010 = DataFrame(p)
player_df_2010['season']='10/11'
f = open('players_data_2011.pkl')
p = pickle.load(f)
player_df_2011 = DataFrame(p)
player_df_2011['season']='11/12'
f = open('players_data_2012.pkl')
p = pickle.load(f)
player_df_2012 = DataFrame(p)
player_df_2012['season']='12/13'
player_df = pd.concat([player_df_2010,player_df_2011,player_df_2012])
print player_df.columns
player_df.set_index('id',inplace=True,drop=False)
print player_df[:1]
# data cleanup - converting strings to numerics for the appropriate fields
player_df['transfer_fee_euros'] = player_df.transfer_fee_euros.str.replace('.','')
player_df['transfer_fee_euros'] =player_df.transfer_fee_euros.astype(float64)
player_df['minutes'] = player_df.minutes.astype(float64)
player_df['goals'] = player_df.goals.astype(float64)
player_df['age'] = player_df.age.astype(float64)
player_df['assists'] = player_df.assists.astype(float64)
player_df.set_index(['id'], inplace=True, drop=False)
player_df['money_by_minutes']= player_df['transfer_fee_euros']/player_df['minutes'].astype(float64)
player_df['money_by_goal_minutes'] = player_df['transfer_fee_euros']/player_df['minutes_per_goal'].astype(float64)
player_df['money_by_assists'] = player_df['transfer_fee_euros']/player_df['assists'].astype(float64)
player_df['money_by_goals'] = player_df['transfer_fee_euros']/player_df['goals'].astype(float64)
player_df['money_by_matches'] = player_df['transfer_fee_euros']/player_df['matches'].astype(float64)
# only consider player purchases done by teams from English premier league
epl_teams = [ u'Southampton', u'Aston Villa', u'Man Utd', u'Swansea', u'Liverpool', u'Blackburn', u'Everton', u'Fulham', u'Newcastle', u'West Ham', u'QPR', u'Wolves', u'Chelsea', u'Spurs', u'Man City', u'Stoke City', u'Arsenal', u'Sunderland', u'Wigan', u'Birmingham', u'West Brom']
epl_to=[]
for team in epl_teams:
epl_to.append( player_df[player_df.to == team])
player_df = pd.concat(epl_to)
#print len(player_df)
player_df = player_df.sort_index(by='season')
#print player_df
# consider only players transferred to EPL teams . Disregard players transferred to non EPL teams.
epl_teams = [ u'Southampton', u'Aston Villa', u'Man Utd', u'Swansea', u'Liverpool', u'Blackburn', u'Everton', u'Fulham', u'Newcastle', u'West Ham', u'QPR', u'Wolves', u'Chelsea', u'Spurs', u'Man City', u'Stoke City', u'Arsenal', u'Sunderland', u'Wigan', u'Birmingham', u'West Brom']
defence = ['SW' , 'CB' , 'LB' , 'RB']
midfield = ['DM', 'CM', 'RM','LM', 'AM', 'LW', 'RW']
striker = ['LW', 'RW', 'SS', 'CF']
attack ={}
def plot_acquisitions_by_positions(pos_list,label):
d = DataFrame(index=epl_teams)
for season in player_df.season.unique():
for team in epl_teams:
if label=='Midfield':
attack[team]=len(player_df[(player_df.season==season) & (player_df['position'].isin(pos_list) | ( player_df['id'].isin(['7858','14086','4063','15921','50202','15921','35251'])) ) & (player_df.to ==team)])
elif label== 'Attack':
attack[team]=len(player_df[(player_df.season==season) & ((player_df['position'].isin(pos_list)) | ( player_df['id'].isin(['31552','15185'])))& (player_df.to ==team)])
elif label =='Defence':
#print team,season, len(player_df[(player_df.season==season) & (player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])) & (player_df.to ==team)][['p_name','position']])
attack[team]= len(player_df[(player_df.season==season) & (player_df['position'].isin(pos_list)) & (player_df.to ==team)])
g=Series(attack)
g.name= label +"_"+ season
d=d.join(g)
print d.plot(kind='barh', figsize=(10,10),title='trend of ' + label + ' player acquistions')
plot_acquisitions_by_positions(defence,'Defence')
plot_acquisitions_by_positions(midfield,'Midfield')
plot_acquisitions_by_positions(striker,'Attack')
#.plot(kind='barh',figsize=(5,10))
#Series().plot(kind='barh')
#print len(player_df[(player_df.season=='11/12') & (player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])) & (player_df.to =='Arsenal')][['p_name','position']])
#print player_df[player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])][['pos_type']]
#pieces = DataFrame()
#print player_df.groupby(['to','season'])['transfer_fee_euros'].mean()
for season in player_df.season.unique():
# print season
p1 = player_df[player_df.season==season].groupby(['to'])['transfer_fee_euros'].mean()
p1.name = season
#print p1
#print p1
if season == '10/11':
club_purchase = DataFrame(p1)
else:
club_purchase = club_purchase.join(p1)
club_purchase = club_purchase.fillna(value=0)
#position_compare.sort_index(
#position_compare = position_compare.transpose()
print club_purchase.sort_index(ascending=False).plot(kind='barh')
# get unique players who have been transferred more than once across seasons from 2010 to 2013
transfr_more_than_once = player_df.ix[player_df.id.value_counts() > 1 ].sort_index(by=['id','season'])['id'].unique()
print "Players who were transferred between the EPL teams more than once"
# list of all players who have been transferred more than once from 2010 -2013
print player_df.ix[transfr_more_than_once].sort_index(by=['id','season'])[['p_name','from','to','transfer_fee_euros','position','season','goals']]
# graph player transfer amount rise/fall through the 3 seasons.
def transfr_amt_by_season(p_id):
return player_df[(player_df.id==p_id)][['season','transfer_fee_euros']].set_index(keys='season')
name_df = DataFrame(index=['10/11','11/12','12/13'])
for item in np.unique(transfr_more_than_once):
#print item
#print np.unique(player_df.ix[item]['name'])
name_df[np.unique(player_df.ix[item]['p_name'])] = transfr_amt_by_season(item)
#name_df= name_df.where(pd.notnull(name_df),0)
name_df= name_df.fillna(method='ffill') # assuming the cost to club is same , if club retains a player for successive seasons
print " \n\n Tabulate player acquisition costs for players who were transferred more than once in the last 3 EPL seasons"
print name_df
print "\n\n plot of player acquisition costs for players who were transferred more than once in the last 3 EPL seasons"
print name_df.plot(figsize=(10,7))
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
#player_df.transfer_fee_euros.type
#player_df.position
player_df.sort_index(by='transfer_fee_euros')[::][["p_name", "club",'transfer_fee_euros','from','to']]
grouped= player_df.groupby('club').size()
#grouped.order(ascending=True).plot()
club_counts = player_df['club'].value_counts()
#print club_counts
#club_counts.plot(kind='barh', rot=0)
pos_counts = player_df['position'].value_counts() # players acquired by position - raw count
print " Distribution of players purchased by position"
print pos_counts
print "Total players purchased " , len(player_df)
# plot positions in demand
print player_df.position.value_counts().plot(kind='bar', figsize=(5,5), title = 'positions in demand')
#player_df.groupby('position')['transfer_fee_euros'].sum().plot(title='total price by position')
position_compare = DataFrame()
#print player_df.season.unique()
#pieces = DataFrame()
for season in player_df.season.unique():
p1 = player_df[player_df.season==season].groupby(['position'])['transfer_fee_euros'].mean()
p1.name = season
if season == '10/11':
position_compare = DataFrame(p1)
else:
position_compare = position_compare.join(p1)
position_compare= position_compare.where(pd.notnull(position_compare),0)
print position_compare
print "\n\n\n"
print position_compare.plot(kind='barh', title='seasonal trend for mean price paid by position')
#player_df[player_df.season=='10/11'].groupby(['position'])['transfer_fee_euros'].mean().plot(title='avg price by position' , kind='barh', rot=0) # shows high price paid for Strikers CF
#pos_counts.plot(kind='barh', rot=0)
#player_df['transfer_fee_euros'].hist() # distribution of money spent
#t1[seasons].sort_index(ascending=False).plot(figsize(5,5),kind='barh', title='season '+ seasons+ ':fee by ' + col , rot=0)
# grouped = player_df[player_df.season=='10/11'].groupby(['to'])
position_compare = DataFrame()
#print player_df.season.unique()
#pieces = DataFrame()
stats_for_money = ['goals', 'minutes','assists' ]
def return_per_euro(stat):
for season in player_df.season.unique():
grouped = player_df[player_df.season==season].groupby(['to'])
#print grouped
#p1= grouped['transfer_fee_euros'].sum()/grouped['minutes'].sum()
p1= grouped['transfer_fee_euros'].sum()/grouped[stat].sum()
p1.name = season
if season == '10/11':
euros_per_stat = DataFrame(p1)
else:
euros_per_stat = euros_per_stat.join(p1)
euros_per_stat = euros_per_stat.fillna(value=0)
print euros_per_stat
print euros_per_stat.plot(kind='barh', title='euros spent by ' + stat)
return_per_euro('goals')
#for item in stats_for_money:
# return_per_euro(item)
return_per_euro('assists')
return_per_euro('minutes')
#### An alternate view of comparing the clubs by season on various returns
def compare_stat(col):
''' takes column name from players_df dataframe and computes transfr_amount/col name '''
#col = 'minutes'
#Money spent each season by clubs on player acquisitions
grouped = player_df[player_df.season=='10/11'].groupby(['to'])
# money paid per minute on field time
t1 = (grouped['transfer_fee_euros'].sum()/ grouped[col].sum())
t1.name = '10/11'
t1 = DataFrame(t1)
grouped = player_df[player_df.season=='11/12'].groupby(['to'])
t2 = grouped['transfer_fee_euros'].sum()/ grouped[col].sum()
t2.name='11/12'
grouped = player_df[player_df.season=='12/13'].groupby(['to'])
t3 = grouped['transfer_fee_euros'].sum()/ grouped[col].sum()
t3.name='12/13'
t1= t1.join(t2)
t1= t1.join(t3)
t1= t1.where(pd.notnull(t1),0)
#print t1
fig = plt.figure()
fig.set_size_inches(20,20)
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
i=1
for seasons in t1.columns:
#print seasons
x = fig.add_subplot(2,2,i)
t1[seasons].sort_index(ascending=False).plot(figsize(5,5),kind='barh', title='season '+ seasons+ ':fee by ' + col , rot=0)
i+=1
fig
t1= t1.transpose()
t1.plot(figsize(15,15))
# other parameters that can be passed - minutes , assists
compare_stat('goals')
stat = 'money_by_goals'
x = player_df.groupby('season').sort_index(by=stat)[['p_name','from', 'to', 'position','goals', 'transfer_fee_euros' , stat ]]
for season in player_df.season.unique():
print "\n\ntop 5 player acquisitions per goal scored in " + season
print x.xs(season)[:5]
stat = 'money_by_minutes'
x = player_df.groupby('season').sort_index(by=stat)[['p_name','from', 'minutes','to', 'position','goals', 'transfer_fee_euros' , stat ]]
for season in player_df.season.unique():
print "\n\ntop 5 player acquisitions per minute on field in " + season
print x.xs(season)[:5]
stat = 'money_by_goals'
x = player_df.groupby(['season']).sort_index(by=stat,ascending=True)[['p_name','age', 'to', 'position','goals', 'transfer_fee_euros' , stat ]]
#print x[:5]
for season in player_df.season.unique():
print "\n\ntop 5 player acquisitions per goal scored in " + season
# make sure you consider players who scored at least 1 goal
print x[x.goals > 0 ].xs(season).sort('money_by_goals').sort('age', ascending=True)[:5]
for season in player_df.season.unique():
print "\n\nSeason : " + season + "\n"
print x[x.goals > 0 ].xs(season).sort('money_by_goals').sort('age', ascending=False)[:5]
# some LW and RW positions are ambiguous
#print player_df[player_df.position == 'LW'][['p_name','position']]
##position codes categorized by position type
##defence = ['SW' , 'CB' , 'LB' , 'RB']
##midfield = ['DM', 'CM', 'RM','LM', 'AM', 'LW', 'RW']
##striker = ['LW', 'RW', 'SS', 'CF']
#del player_df['pos_type']
for season in player_df.season.unique():
# defensive positions
print "\n**************************************************************************************"
print '\nSeason: '+ season + ' - The top 5 defensive players by money spent per minute on field\n'
print player_df[player_df['position'].isin(['SW' , 'CB' ,'LB', 'RB'])].groupby(['season']).sort_index(by='money_by_minutes').xs(season)[:5][['p_name' ,'to', 'money_by_minutes']]
print '\nSeason: '+ season + ' - The top 5 midfield players by money spent per minute on field\n'
print player_df[player_df['position'].isin(['DM', 'CM', 'RM','LM', 'AM']) | ( player_df['id'].isin(['7858','14086','4063','15921','50202','15921','35251'])) ].groupby(['season']).sort_index(by='money_by_minutes').xs(season)[:5][['p_name' ,'to', 'money_by_minutes']]
print '\nSeason: '+ season + ' - The top 5 Attacking players by money spent by goal scored\n'
print player_df[player_df['position'].isin([ 'SS', 'CF']) | ( player_df['id'].isin(['31552','15185'])) ].groupby(['season']).sort_index(by='money_by_goals').xs(season)[:5][['p_name' ,'to', 'money_by_goals']]
print "-----------------------------------------------------------------------------------------\n\n\n"
#print player_df['pos_type'].value_counts()
#player_df.pos_type.value_counts()
# dictionary and list for stats
stats_list=['age', 'assists', 'goals', 'matches', 'minutes', 'minutes_per_goal', 'p_name', 'position', 'transfer_fee_euros', 'transfer_fee_pounds', 'money_by_minutes', 'money_by_goal_minutes', 'money_by_assists', 'money_by_goals', 'money_by_matches']
stats_dict={'age':'Age', 'assists':'Assists', 'goals':'Goals', 'matches':'Matches', 'minutes':'Minutes', 'minutes_per_goal':'Minutes per Goal', 'p_name':'Player Name', 'position':'Position',
'transfer_fee_euros':'Transfer Fee(Euros)', 'transfer_fee_pounds':'Transfer Fee(Pounds)', 'money_by_minutes':'Money by Minutes', 'money_by_goal_minutes':'Money by Goal Minutes',
'money_by_assists':'Money by Assists', 'money_by_goals':'Money by Goals', 'money_by_matches':'Money By Matches'}
import IPython.core.display
# start off form
dialog_form= '<div id="dialog" title="Select Data"><p>'
#define teams drop down menu
g = player_df
season_drop="<label>Season:<select id='season_drop'>"
for se in g.season.unique():
season_drop+='<option value="'+se+'">'+se+'</option>'
season_drop+="</select></label></br>"
dialog_form+=season_drop
team_drop="<label>Team:<select id='team_drop'>"
for team in g.to.unique():
team_drop+='<option value="'+team+'">'+team+'</option>'
team_drop+="</select></label></br>"
dialog_form+=team_drop
#start stat dropdown
stat_drop="<label>Stat:<select id='stat_drop'>"
for stat in g.columns:
if stat in stats_list:
stat_drop+='<option value="'+stat+'">'+stats_dict[stat]+'</option>'
stat_drop+="</select></label>"
dialog_form+=stat_drop
dialog_form+="</p></div>"
IPython.core.display.HTML(dialog_form)
team=None
stat=None
season=None
def prompt_results():
if not stat or not team or not season:
return "No Data Selected, Use Prompt"
# grouped = player_df[player_df.season=='11/12'].groupby(['to'])
g = player_df[player_df.season==season]
#g = player_df[player_df.season=='11/12']
gteam= g[g.club==team]
gstat= gteam[stat]
eq=""
if len(gstat.values)<=0:
return "No data for that selection, Try another."
for s in gstat:
eq+=str(s)+", "
eq= eq[:-2]
if type(gstat[0]) is numpy.float64:
eq+=" (Total: "+str(gstat.sum())+")"
print stats_dict[stat]+"(s) by "+team+" [with/of transfered players] = "+eq
#Working Non formatted
team=None
stat=None
season=None
IPython.core.display.Javascript("""
$( "#dialog" ).dialog({ buttons: [ { text: "Ok", click: function() {
var teami,stati,seasoni,py1,py2,py3;
teami= $('#team_drop').val();
stati= $('#stat_drop').val();
seasoni= $('#season_drop').val();
py1 = "team = '"+teami+"'";
py2 = "stat = '"+stati+"'";
py3 = "season = '"+seasoni+"'";
IPython.notebook.kernel.execute(py1);
IPython.notebook.kernel.execute(py2);
IPython.notebook.kernel.execute(py3);
$( this ).dialog( "close" ); } } ] });
""")
#html formated code
team=None
stat=None
season=None
IPython.core.display.Javascript("""
$( '#dialog' ).dialog({ buttons: [ { text: 'Ok', click: function() {
var teami,stati,seasoni,py1,py2,pyr3;
teami= $('#team_drop').val();
stati= $('#stat_drop').val();
seasoni= $('#season_drop').val()
py1 = 'team = "'+teami+'"';
py2 = 'stat = "'+stati+'"';
pyr3 = 'season = "'+seasoni+'"';
IPython.notebook.kernel.execute(py1);
IPython.notebook.kernel.execute(py2);
IPython.notebook.kernel.execute(pyr3);
$( this ).dialog( 'close' ); } } ] });
""")
prompt_results()