#!/usr/bin/env python # coding: utf-8 # # LFC Data Analysis: The Transfers # See Terry's blog [Inspiring Transfers](http://terrydolan.blogspot.co.uk/2015/09/inspiring-transfers.html) for a discussion of the data analysis. # This notebook analyses Liverpool FC's transfers over the last 5 seasons, from 2011-2012 to 2014-15. It also compares Liverpool's average net transfer spend, revenue and wage bill to the other teams in the top 6. # # The analysis uses [IPython Notebook](http://ipython.org/notebook.html), [python](https://www.python.org/), [pandas](http://pandas.pydata.org/) and [matplotlib](http://matplotlib.org/) to explore the data. # ## Set-up # Import the modules needed for the analysis. # In[1]: import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np import sys import collections from datetime import datetime from __future__ import division # enable inline plotting get_ipython().run_line_magic('matplotlib', 'inline') # Print version numbers. # In[2]: print 'python version: {}'.format(sys.version) print 'pandas version: {}'.format(pd.__version__) print 'matplotlib version: {}'.format(mpl.__version__) print 'numpy version: {}'.format(np.__version__) # ## Load the data into a dataframes and munge # #### Create dataframe of LFC transfers # Data source: lfchistory.net # # Note that all transfers are shown for FSG's ownership, from Oct 2010 to September 2015. # In[3]: LFC_TRANSFERS_CSV_FILE = 'data\lfc_transfers_2010-2011_2015-2016.csv' dflfc_transfers = pd.read_csv(LFC_TRANSFERS_CSV_FILE, parse_dates=['Date']) # convert Fee to millions of pounds dflfc_transfers.Fee = dflfc_transfers.Fee/1000000 # show shape dflfc_transfers.shape # In[4]: dflfc_transfers.head() # In[5]: dflfc_transfers.tail() # In[6]: # check Date (of transfer) column is datetime data type dflfc_transfers.dtypes # #### Create dataframe of LFC players giving birthdate of each player # Data source: lfchistory.net # In[7]: LFC_PLAYERS_CSV_FILE = 'data\lfc_players_september2015_upd.csv' dflfc_players = pd.read_csv(LFC_PLAYERS_CSV_FILE, parse_dates=['birthdate']) dflfc_players.rename(columns={'player': 'Player', 'birthdate': 'Birthdate', 'country': 'Country'}, inplace=True) dflfc_players.shape # In[8]: dflfc_players.dtypes # In[9]: dflfc_players.head(10) # #### Create dataframe of premier league transfers over last 5 seasons # Data source: transferleague.co.uk # In[10]: PREM_TRANSFERS_CSV_FILE = 'data\prem_transfers_2011-2012_2015-2016.csv' dfprem_transfers = pd.read_csv(PREM_TRANSFERS_CSV_FILE, skiprows=2, header=True) # convert money to millions of pounds dfprem_transfers.Purchased = np.round(dfprem_transfers.Purchased/1000000, 1) dfprem_transfers.Sold = np.round(dfprem_transfers.Sold/1000000, 1) dfprem_transfers.NetSpend = np.round(dfprem_transfers.NetSpend/1000000, 1) dfprem_transfers.PerSeasonSpend = np.round(dfprem_transfers.PerSeasonSpend/1000000, 1) # show shape dfprem_transfers.shape # In[12]: dfprem_transfers.head() # In[13]: dfprem_transfers.tail() # #### Create dataframe of premier league wage bill over last 5 seasons # Data source: guardian.co.uk etc # In[14]: PREM_WAGES_CSV_FILE = 'data\prem_wages_2012-2013_2014-2015.csv' dfprem_wages = pd.read_csv(PREM_WAGES_CSV_FILE, skiprows=4, header=True) # show shape dfprem_wages.shape # In[15]: dfprem_wages # #### Create dataframe of league table for 2014-2015 # Data source: http://www.sportsmole.co.uk/football/premier-league/2014-15/table.html # In[16]: PREM_TABLE_CSV_FILE = 'data\prem_table_2014-2015.csv' dfprem_table_2014_2015 = pd.read_csv(PREM_TABLE_CSV_FILE, skiprows=2, header=True) # show shape dfprem_table_2014_2015.shape # In[17]: dfprem_table_2014_2015.head(6) # In[18]: dfprem_table_2014_2015.tail() # #### Create dataframe of Deloiite Football Money League # Data source: [Deloitte Football Money League](http://www2.deloitte.com/uk/en/pages/sports-business-group/articles/deloitte-football-money-league.html) # # The revenue (in pounds) is in the detailed report. # In[19]: # note that the revenue figures are for 2014 deloitte_review_2015 = {'Team': ['Manchester United', 'Manchester City', 'Chelsea', 'Arsenal', 'Liverpool', 'Tottenham Hotspur'], 'Revenue': [433.2, 346.5, 324.4, 300.5, 255.8, 180.5]} df_revenue = pd.DataFrame(data=deloitte_review_2015, columns=['Team', 'Revenue']) df_revenue = df_revenue.set_index('Team') # In[20]: df_revenue # ## Analyse the data # # Ask a question and find the answer! # Show ins and outs for 2015-2016 # In[21]: dflfc_transfers[dflfc_transfers.Season == '2015-2016'] # What was biggest fee paid for a Liverpool player over the data period? # In[22]: dflfc_transfers_in = dflfc_transfers[dflfc_transfers.Direction == 'In'] dflfc_transfers_in[dflfc_transfers_in.Fee == dflfc_transfers_in.Fee.max()] # What was biggest fee received for a player? # In[23]: dflfc_transfers_out = dflfc_transfers[dflfc_transfers.Direction == 'Out'] dflfc_transfers_out[dflfc_transfers_out.Fee == dflfc_transfers_out.Fee.max()] # Summarise Fees In and Out # In[24]: dflfc_transfers.groupby(['Season', 'Direction']).sum() # Show as 'unstacked' dataframe # In[25]: df_fsg = dflfc_transfers.groupby(['Season', 'Direction']).sum().unstack() df_fsg.columns = df_fsg.columns.droplevel() del df_fsg.columns.name df_fsg Calculate the net spend for the data period, Oct 2011 to Aug 2015 # In[27]: df_fsg['NetSpend'] = df_fsg.In - df_fsg.Out df_fsg # Calculate total Fees and NetSpend, over the 6 seasons # In[28]: df_fsg.sum() # Calculate average (mean) Fees and NetSpend, over the 6 seasons # In[29]: df_fsg.mean() # What is the the average Net Spend over last 5 seasons? # In[30]: df_fsg['2011-2012':'2015-2016'].NetSpend.mean() # Where do most players come from? # In[31]: dflfc_transfers_in.Club.value_counts().head() # Where do most players go to? # In[32]: dflfc_transfers_out.Club.value_counts().head() # Which players were bought for more than £15M? # In[33]: dflfc_transfers_in[dflfc_transfers_in.Fee >= 15] # Which players were sold for more than £15M? # In[34]: dflfc_transfers_out[dflfc_transfers_out.Fee >= 15] # Plot FSG's LFC Transfers per Season for last 5 seasons (2011-12 to 2015-16) # # Note that the analysis is carried out in September 2015. Therefore the 2015-2016 NetSpend does not include the January 2016 tranfer window. # In[35]: df_fsg.ix['2011-2012':] # In[36]: avg_netspend = df_fsg.ix['2011-2012':].NetSpend.mean() print 'Average Transfer Net Spend per Season is £{}M'.format(round(avg_netspend, 1)) ax = df_fsg.ix['2011-2012':].plot(kind='bar', figsize=(12, 9), color=('r', 'y', 'b'), legend=False) plt.axhline(avg_netspend, color='b', linestyle='--') ax.set_ylabel('Transfers in Pounds (Millions)') ax.set_title("FSG's LFC Transfers per Season for the Last 5 Seasons\n(2011-2012 to 2015-2016)") ax.text(-.4, 97, 'prepared by: @lfcsorted', fontsize=9) # create legend l1 = plt.Line2D([], [], linewidth=7, color='r') l2 = plt.Line2D([], [], linewidth=7, color='y') l3 = plt.Line2D([], [], linewidth=7, color='b') l4 = plt.Line2D([], [], linewidth=1, color='b', linestyle='--') labels = ['Transfers In', 'Transfers Out', 'Transfer Net Spend', 'Average Transfer Net Spend per Season'] ax.legend([l1, l2, l3, l4], labels, fancybox=True, shadow=True, framealpha=0.8, loc='upper left') # plot and save current figure fig = plt.gcf() plt.show() fig.savefig('FSGTransfersPerSeason2011-2015.png', bbox_inches='tight') # Now let's compare Liverpool's transfers to the other top 6 teams. # What is net spend per season for top 5 teams over last 5 seasons? # In[37]: dfprem_transfers # In[39]: top6_2014_15 = [ 'Chelsea', 'Manchester City', 'Manchester United', 'Arsenal', 'Tottenham Hotspur', 'Liverpool'] df_prem_transfers_top6 = dfprem_transfers[['Team', 'PerSeasonSpend']][dfprem_transfers.Team.isin(top6_2014_15)] df_prem_transfers_top6 # Plot Top 6 Premier League Teams Average Transfer Net Spend per Season for Last 5 Seasons # In[40]: ax = df_prem_transfers_top6.plot(x='Team', kind='bar', figsize=(12, 9), color=('b', 'b', 'b', 'r', 'b', 'b'), legend=False) ax.set_ylabel('Net Spend per Season in Pounds (Millions)') ax.set_title("Top 6 Premier League Teams Average Transfer Net Spend per Season for Last 5 Seasons\n(2011-2012 to 2015-2016)") plt.axhline(0, color='grey') # x axis at net spend = 0 ax.text(-0.45, -9, 'prepared by: @lfcsorted', fontsize=9) # plot and save current figure fig = plt.gcf() plt.show() fig.savefig('PremTransferSpendPerSeason2011-2015.png', bbox_inches='tight') # What is the revenue of top 6 premier league clubs? # In[42]: df_revenue.head(6) # Plot Top 6 English Premier League Teams by Revenue # In[43]: ax = df_revenue.plot(kind='bar', figsize=(12, 9), color=('b', 'b', 'b', 'b', 'r', 'b'), legend=False) ax.set_ylabel('Revenue in Pounds (Millions)') ax.set_title('Top 6 English Premier League Teams by Revenue (2014)') ax.text(4.45, 440, 'prepared by: @lfcsorted', fontsize=9) # plot and save current figure fig = plt.gcf() plt.show() fig.savefig('PremRevenue2014.png', bbox_inches='tight') # Let's now examine the age of the LIverpool transfers # First create new transfer dataframe with birthdate of player # In[44]: dflfc_transfers.head() # In[45]: dflfc_transfers.shape # In[46]: dflfc_players.head() # In[47]: dflfc_transfers_with_dob = pd.DataFrame.merge(dflfc_transfers, dflfc_players, how='left') dflfc_transfers_with_dob.shape # In[48]: dflfc_transfers_with_dob.head() # In[49]: dflfc_transfers_with_dob.dtypes # In[50]: # check to see if any Birthdates are missing dflfc_transfers_with_dob.Birthdate.isnull().any() # In[51]: # show missing entries (these have been reported to lfchistory.net) dflfc_transfers_with_dob[dflfc_transfers_with_dob.Birthdate.isnull()] # In[52]: # fill in mising data dflfc_transfers_with_dob.loc[dflfc_transfers_with_dob.Player == 'Chris Mavinga', 'Country'] = 'France' dflfc_transfers_with_dob.loc[dflfc_transfers_with_dob.Player == 'Chris Mavinga', 'Birthdate'] = pd.Timestamp('19910526') dflfc_transfers_with_dob.loc[dflfc_transfers_with_dob.Player == 'Kristoffer Peterson', 'Country'] = 'Sweden' dflfc_transfers_with_dob.loc[dflfc_transfers_with_dob.Player == 'Kristoffer Peterson', 'Birthdate'] = pd.Timestamp('19941128') dflfc_transfers_with_dob[(dflfc_transfers_with_dob.Player == 'Chris Mavinga') | (dflfc_transfers_with_dob.Player == 'Kristoffer Peterson')] # Add age at transfer date to the dataframe # In[53]: def age_at(dob, this_date): """Return age in years at this_date for given date of birth. Note that both dob and this_date are of type datetime.""" return round((this_date - dob).days/365.0, 1) # In[54]: dflfc_transfers_with_dob['AgeAtTransfer'] = dflfc_transfers_with_dob.apply(lambda row: age_at(row.Birthdate, row.Date), axis=1) dflfc_transfers_with_dob.tail() # What is average age of incoming players by season? # Average age of all transfers in, including free # In[55]: dflfc_transfers_with_dob[(dflfc_transfers_with_dob.Direction == 'In') & (dflfc_transfers_with_dob.Fee >= 0)]\ ['AgeAtTransfer'].mean() # Average age of all transfers in, excluding free # In[56]: dflfc_transfers_with_dob[(dflfc_transfers_with_dob.Direction == 'In') & (dflfc_transfers_with_dob.Fee > 0)]\ ['AgeAtTransfer'].mean() # Average age of all transfers in, excluding free, by season # In[57]: dflfc_transfers_with_dob[(dflfc_transfers_with_dob.Direction == 'In') & (dflfc_transfers_with_dob.Fee > 0)]\ [['Season', 'AgeAtTransfer']].groupby('Season').mean() # Average age of all transfers in, including free, by season (rounded) # In[58]: dflfc_transfers_with_dob[dflfc_transfers_with_dob.Direction == 'In']\ [['Season', 'AgeAtTransfer']].groupby('Season').agg(lambda x: round(x.mean(), 1)) # Plot age at transfer in # In[59]: dflfc_transfers_with_dob[dflfc_transfers_with_dob.Direction == 'In']\ [['Season', 'AgeAtTransfer']].groupby('Season').agg(lambda x: round(x.mean(), 1)).plot(kind='bar', legend=False, title='Age at Transfer In') # What is average age of incoming and outgoing players by season # In[60]: dflfc_transfers_with_dob[['Season', 'Direction', 'AgeAtTransfer']].groupby(['Season', 'Direction'])\ .agg(lambda x: round(x.mean(), 1)).unstack()\ .plot(kind='bar', title='Age at Transfer', ylim=(0,40), yticks=range(0,35,5)) # Excluding free # In[61]: dflfc_transfers_with_dob[dflfc_transfers_with_dob.Fee > 0][['Season', 'Direction', 'AgeAtTransfer']].groupby(['Season', 'Direction'])\ .agg(lambda x: round(x.mean(), 1)).unstack()\ .plot(kind='bar', title='Age at Transfer (non-zero)', ylim=(0,40), yticks=range(0,35,5)) # In[62]: dflfc_transfers_with_dob[['Season', 'Direction', 'AgeAtTransfer']].groupby(['Season', 'Direction']).agg(lambda x: round(x.mean(), 1)) # In[63]: dflfc_transfers_with_dob[dflfc_transfers_with_dob.Fee > 0][['Season', 'Direction', 'AgeAtTransfer']].groupby(['Season', 'Direction']).agg(lambda x: round(x.mean(), 1)) # #### Note that Outs do not include players leaving at end of contract e.g. Gerrard or those retiring e.g. Carragher # Analyse age of starting line-ups over last 3 seasons # In[64]: LFC_GAME1_CSV_FILE = 'data\lfc_pl_opening_games_aug2015.csv' dflfc_openers = pd.read_csv(LFC_GAME1_CSV_FILE, parse_dates=['Date']) # show shape dflfc_openers.shape # In[65]: dflfc_openers.tail() # Add age at opening game # In[66]: dflfc_openers = pd.DataFrame.merge(dflfc_openers, dflfc_players[['Player', 'Birthdate']], how='left') dflfc_openers.shape # In[67]: dflfc_openers.head() # In[68]: dflfc_openers['AgeAtOpener'] = dflfc_openers.apply(lambda row: age_at(row.Birthdate, row.Date), axis=1) dflfc_openers.tail() # Calculate average age of team # In[69]: dflfc_openers[['Season', 'AgeAtOpener']].groupby('Season').agg(lambda x: round(x.mean(), 1)) # In[70]: dflfc_openers[['Season', 'AgeAtOpener']].groupby('Season').agg(lambda x: round(x.mean(), 1)).plot(kind='bar', ylim=(24,28)) # In[71]: dflfc_transfers_with_dob[dflfc_transfers_with_dob.Direction == 'In'][['Season', 'Player', 'Fee', 'AgeAtTransfer']] # What is average age of team that *finished* againt Arsenal? # In[71]: LFC_ARSENAL_CSV_FILE = 'data\lfc_pl_vs_Arsenal_aug2015.csv' dflfc_arsenal = pd.read_csv(LFC_ARSENAL_CSV_FILE, parse_dates=['Date'], skiprows=2) # show shape dflfc_arsenal.shape # In[72]: dflfc_arsenal = pd.merge(dflfc_arsenal, dflfc_players[['Player', 'Birthdate']]) dflfc_arsenal # In[73]: dflfc_arsenal['AgeAtGame'] = dflfc_arsenal.apply(lambda row: age_at(row.Birthdate, row.Date), axis=1) dflfc_arsenal # In[74]: dflfc_arsenal.mean() # Compare team wage bills for 2014-15 # In[75]: dfprem_wages[['Team', '2014-2015']].head() # In[76]: dfprem_table_2014_2015[['Team', 'PTS']].head() # In[77]: dfprem_table_2014_2015_wages = pd.merge(dfprem_table_2014_2015[['Team', 'PTS']], dfprem_wages[['Team', '2014-2015']]) # In[78]: dfprem_table_2014_2015_wages dfprem_table_2014_2015_wages.rename(columns={'2014-2015': 'WageBill','PTS': 'Points'}, inplace=True) # In[79]: dfprem_table_2014_2015_wages.head() # In[80]: dfprem_table_2014_2015.Rank[dfprem_table_2014_2015.Team == 'Chelsea'].values[0] # In[81]: dfprem_table_2014_2015_wages.plot(kind='scatter', x='WageBill', y='Points') # In[82]: df = dfprem_table_2014_2015_wages.set_index('Team') # In[83]: (a,b) = df.ix['Liverpool'] print a, b # In[84]: # Ref: http://stackoverflow.com/questions/739241/date-ordinal-output def n_plus_suffix(n): """Return n plus the suffix e.g. 1 becomes 1st, 2 becomes 2nd.""" assert isinstance(n, (int, long)), '{} is not an integer'.format(n) if 10 <= n % 100 < 20: return str(n) + 'th' else: return str(n) + {1 : 'st', 2 : 'nd', 3 : 'rd'}.get(n % 10, "th") # Plot Wage Bill vs Points, with Top 6 highlighted (2014-2015) # In[85]: # calculate points total for 6th place sixth_place_points = df.iloc[5].Points # plot top 6 as blue circles ax = df[df.Points >= sixth_place_points].plot(kind='scatter', x='WageBill', y='Points', figsize=(12,9), color='b') # plot others as black circles df[df.Points < sixth_place_points].plot(ax=ax, kind='scatter', x='WageBill', y='Points', figsize=(12,9), color='k') # calculate Liverpool's points and plot as red circles lfcpoints, lfcwage = df.ix['Liverpool'] ax.plot(lfcwage, lfcpoints, 'ro') # add x and y labels etc ax.set_xlabel('Wage Bill in Pounds (millions)') ax.set_ylabel('Points') ax.set_title('Wage Bill vs Points, with Top 6 highlighted (2014-2015)') ax.text(2, 21, 'prepared by: @lfcsorted', fontsize=9) # add text showing team and position for top 6 for team, (points, wagebill) in df.iterrows(): pos = int(dfprem_table_2014_2015.Rank[dfprem_table_2014_2015.Team == team].values[0]) team_pos = '{} ({})'.format(team, n_plus_suffix(pos)) if points >= sixth_place_points: ax.annotate(s=team_pos, xy=(wagebill,points), xytext=(wagebill-len(team)-3, points+1)) # Prettify the plot - label all teams and highlight the top 4 and top 6 areas. # In[86]: # list all of the teams df.index.values # In[87]: # set position of text, default is centred on top of the circle # note that posotionis decided by trial and error, to give clearest plot TEAM_CL = ['Sunderland', 'West Ham United'] TEAM_CR = ['Stoke City', 'Aston Villa', 'Everton'] TEAM_CT = [] for team in df.index.values: if team not in TEAM_CL + TEAM_CR: TEAM_CT.append(team) # calculate points total for 6th place sixth_place_points = df.iloc[5].Points # plot top 6 as blue circles ax = df[df.Points >= sixth_place_points].plot(kind='scatter', x='WageBill', y='Points', figsize=(12,9), color='b') # plot others as black circles df[df.Points < sixth_place_points].plot(ax=ax, kind='scatter', x='WageBill', y='Points', figsize=(12,9), color='k') # calculate Liverpool's points and plot as red circles lfcpoints, lfcwage = df.ix['Liverpool'] ax.plot(lfcwage, lfcpoints, 'ro') # add x and y labels etc ax.set_xlabel('Wage Bill in Pounds (millions)') ax.set_ylabel('Points') ax.set_title('Wage Bill vs Points, with Top 6 highlighted (2014-2015)') ax.text(2, 21, 'prepared by: @lfcsorted', fontsize=9) # add text showing team and position for team, (points, wagebill) in df.iterrows(): pos = int(dfprem_table_2014_2015.Rank[dfprem_table_2014_2015.Team == team].values[0]) team_pos = '{} ({})'.format(team, n_plus_suffix(pos)) if team in TEAM_CT: ax.annotate(s=team_pos, xy=(wagebill,points), xytext=(wagebill-len(team)-3, points+1)) elif team in TEAM_CR: #print 'team cr: {}'.format(team) ax.annotate(s=team_pos, xy=(wagebill,points), xytext=(wagebill+2, points-0.5)) elif team in TEAM_CL: #print 'team cl: {}'.format(team) ax.annotate(s=team_pos, xy=(wagebill,points), xytext=(wagebill-3*len(team)-6, points-0.5)) else: raise('unexpected error') # add key areas to the graph and label from matplotlib.patches import Rectangle XTEXT_OFFSET = 1 YTEXT_OFFSET = -2 # add top 4 area, with annotation in top left hand corner of rectangle top4rect_bl_x = df[0:4].WageBill.min() # bottom left x co-ord of rectangle top4rect_bl_y = df[0:4].Points.min() # bottom left y co-ord of rectangle top4rect_width = df[0:4].WageBill.max() - top4rect_bl_x # width of rectangle top4rect_height = df[0:4].Points.max() - top4rect_bl_y # height of rectangle top4rect_tl_x = df[0:4].WageBill.min() # top left x co-ord for annotation top4rect_tl_y = df[0:4].Points.max() # top left y co-ord for annotation top4rect_xtext = top4rect_tl_x + XTEXT_OFFSET # text x co-ord for annotation top4rect_ytext = top4rect_tl_y + YTEXT_OFFSET # text y co-ord for annotation ax.add_patch(Rectangle((top4rect_bl_x, top4rect_bl_y), top4rect_width, top4rect_height, facecolor="blue", alpha=0.2)) ax.annotate(s='Top 4 area', xy=(top4rect_tl_x, top4rect_tl_y), xytext=(top4rect_xtext, top4rect_ytext), color='blue') # add top 6 area, with annotation in top left hand corner of rectangle top6rect_bl_x = df[0:6].WageBill.min() # bottom left x co-ord of rectangle top6rect_bl_y = df[0:6].Points.min() # bottom left y co-ord of rectangle top6rect_width = df[0:6].WageBill.max() - top6rect_bl_x # width of rectangle top6rect_height = df[0:6].Points.max() - top6rect_bl_y # height of rectangle top6rect_tl_x = df[0:6].WageBill.min() # top left x co-ord for annotation top6rect_tl_y = df[0:6].Points.max() # top left y co-ord for annotation top6rect_xtext = top6rect_tl_x + XTEXT_OFFSET # text x co-ord for annotation top6rect_ytext = top6rect_tl_y + YTEXT_OFFSET # text y co-ord for annotation ax.add_patch(Rectangle((top6rect_bl_x, top6rect_bl_y), top6rect_width, top6rect_height, facecolor="lightblue", alpha=0.2)) ax.annotate(s='Top 6 area', xy=(top6rect_tl_x, top6rect_tl_y), xytext=(top6rect_xtext, top6rect_ytext), color='blue', alpha=0.7) # plot and save current figure fig = plt.gcf() plt.show() fig.savefig('PremWageBillvsPoints2014-2015.png', bbox_inches='tight') # In[ ]: