#!/usr/bin/env python # coding: utf-8 # # LFC Data Analysis: The Elite Strikers # See Terry's blog [LFC: The Elite Strikers](http://terrydolan.blogspot.co.uk/2015/07/lfc-the-elite-strikers.html) for a discussion of of the data analysis. # This notebook analyses Liverpool FC's elite strikers data from 1962-1963 to 2014-2015. An elite striker is defined as a player who scored 20 or more league goals in a season. I've started the analysis at 1962 as this was when Bill Shankly returned Liverpool to the top flight of English football. The analysis uses [IPython Notebook](http://ipython.org/notebook.html), [python](https://www.python.org/), [pandas](http://pandas.pydata.org/) and [matplotlib](http://matplotlib.org/) to explore the data. # #### Notebook Change Log # In[1]: get_ipython().run_cell_magic('html', '', '\n\n') # | Date | Change Description | # | :------------ | :----------------- | # | 23rd July 2015 | Initial baseline | # ## Set-up # Import the modules needed for the analysis. # In[2]: import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np import sys import collections from datetime import datetime from __future__ import division # enable inline plotting get_ipython().run_line_magic('matplotlib', 'inline') # Print version numbers. # In[3]: print 'python version: {}'.format(sys.version) print 'pandas version: {}'.format(pd.__version__) print 'matplotlib version: {}'.format(mpl.__version__) print 'numpy version: {}'.format(np.__version__) # ## Load the LFC data into a dataframes and munge # #### Create dataframe of scorers in top level seasons # In[4]: LFC_SCORERS_CSV_FILE = 'data\lfc_scorers_1892-1893_2014-2015.csv' dflfc_scorers = pd.read_csv(LFC_SCORERS_CSV_FILE) # sort by season, then league goals dflfc_scorers = dflfc_scorers.sort(['season', 'league'], ascending=([False, False])) dflfc_scorers.shape # In[5]: dflfc_scorers.head() # In[6]: dflfc_scorers.tail() # In[7]: # note that scorers includes own goals dflfc_scorers[dflfc_scorers.player == 'Own goals'].head() Filter out seasons that LFC weren't in top level # In[8]: # note: war years already excluded LANCS_YRS = ['1892-1893'] SECOND_DIV_YRS = ['1893-1894', '1895-1896', '1904-1905', '1961-1962', '1954-1955', '1955-1956', '1956-1957', '1957-1958', '1958-1959', '1959-1960', '1960-1961'] NOT_TOP_LEVEL_YRS = LANCS_YRS + SECOND_DIV_YRS dflfc_scorers_tl = dflfc_scorers[~dflfc_scorers.season.isin(NOT_TOP_LEVEL_YRS)].copy() dflfc_scorers_tl.shape # In[9]: ## check number of seasons aligns with http://www.lfchistory.net/Stats/LeagueOverall.aspx ## expect 100 total print 'num seasons is {}'.format(len(dflfc_scorers_tl.season.unique())) # In[10]: # show most league goals in a season in top level # cross-check with http://en.wikipedia.org/wiki/List_of_Liverpool_F.C._records_and_statistics#Goalscorers # expect 101 in 2013-14 dflfc_scorers_tl[['season', 'league']].groupby(['season']).sum().sort('league', ascending=False).head(1) # In[11]: # remove OG dflfc_scorers_tl = dflfc_scorers_tl[dflfc_scorers_tl.player != 'Own goals'] dflfc_scorers_tl.shape # In[12]: # check 2013-14 dflfc_scorers_tl[dflfc_scorers_tl.season == '2013-2014'].head(10) # #### Create dataframe of squads giving age of each player # In[13]: LFC_SQUADS_CSV_FILE = 'data/lfc_squads_1892-1893_2014-2015.csv' dflfc_squads = pd.read_csv(LFC_SQUADS_CSV_FILE) dflfc_squads.shape # In[14]: dflfc_squads.head() # In[15]: dflfc_squads.tail() # #### Create dataframe of league tables # In[16]: LFC_LEAGUE_CSV_FILE = 'data/lfc_league_1892-1893_2014-2015.csv' dflfc_league = pd.read_csv(LFC_LEAGUE_CSV_FILE) dflfc_league.shape # In[17]: dflfc_league.head() # In[18]: dflfc_league.tail() # #### Create merged dataframe, combining scorers with position # In[19]: dflfc_scorers_tl_pos = pd.DataFrame.merge(dflfc_scorers_tl, dflfc_squads) dflfc_scorers_tl_pos.shape # In[20]: dflfc_scorers_tl_pos.head() # In[21]: dflfc_scorers_tl_pos.tail() # #### Create dataframe of players giving birthdate of each player # In[22]: LFC_PLAYERS_CSV_FILE = 'data\lfc_players_july2015_upd.csv' dflfc_players = pd.read_csv(LFC_PLAYERS_CSV_FILE, parse_dates=['birthdate']) dflfc_players.shape # In[23]: dflfc_players.head() # In[24]: dflfc_players.tail() # ##### Add players age to the dataframe # In[25]: def age_at_season(row): """Return player's age at mid-point of season, assumed to be 1st Jan. row.player -> player's name row.season -> season uses dflfc_players to look-up birthdate, keyed on player - return average age if player is missing from dflfc_players """ AVERAGE_AGE = 26.5 mid_point = '01 January {}'.format(row.season[-4:]) try: dob = dflfc_players[dflfc_players.player == row.player].birthdate.values[0] except: # use average age if player's birthdate not available print 'error: age not found for player {} in season {}, using average age {}'.format(row.player, row.season, AVERAGE_AGE) return AVERAGE_AGE return round((pd.Timestamp(mid_point) - dob).days/365.0, 1) # In[26]: # add age column dflfc_scorers_tl_pos['age'] = dflfc_scorers_tl_pos.apply(lambda row: age_at_season(row), axis=1) # In[27]: dflfc_scorers_tl_pos_age = dflfc_scorers_tl_pos.copy() # In[28]: dflfc_scorers_tl_pos_age.head() # ## Analyse the data # # Ask a question and find the answer. # In[29]: # check number of seasons in top flight between 1960 and 2014 (expect 53) tf = dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1960') & (dflfc_scorers_tl_pos_age.season.str[0:4] <= '2014')].season.unique() tf[-1], tf[0], len(tf) # In[30]: # show modern scorers who scored >20, with age - aka elite strikers dflfc_elite = dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1962') & (dflfc_scorers_tl_pos_age.league >= 20)].sort('league', ascending=False) dflfc_elite.reset_index(drop=True, inplace=True) dflfc_elite # In[31]: # show near misses - modern scorers who scored >=18 and <=19, with age dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1962') & (dflfc_scorers_tl_pos_age.league >= 18) & (dflfc_scorers_tl_pos_age.league <= 19)].sort('league', ascending=False) # In[32]: # show number of games played in seasons when strikers scored >20 dflfc_league[['Season', 'League', 'Pos', 'PLD']][dflfc_league.Season.isin(dflfc_elite.season)] # In[33]: # show strikers with total scored >20 df_striker_tot = dflfc_elite[['player', 'league']].groupby('player').sum() df_striker_tot.rename(columns={'league': 'Tot'}, inplace=True) df_striker_tot.index.name = 'Player' df_striker_tot.sort('Tot', ascending=False).head() # In[34]: # show number of elite strikers who scored >20 len(dflfc_elite) # In[35]: # show number of unique strikers who scored >20 top20_lu = dflfc_elite.sort('league', ascending=False).player.unique() print len(top20_lu) print top20_lu # In[36]: # show number of unique scorers who scored >20 - pretty print player_d = collections.defaultdict(list) top20_l = dflfc_elite.sort('league', ascending=False).values for season, player, goals, position, age, in top20_l: # season, player, goals, age player_d[player].append('{} ({})'.format(season, goals)) for k,v in player_d.iteritems(): print '{}: {}'.format(k, ', '.join(v)) # In[37]: # construct table (dataframe) to summarise the data # col 0 of dataframe is the list of season (goals) converted to a comma separated string df = pd.DataFrame.from_dict({k: ', '.join(v) for k,v in player_d.iteritems()}, orient='index') # col 1 is number of times player has scored 20+ (derived from number of brackets) df[1] = df[0].apply(lambda x: x.count('(')) # change column order and rename df = df[[1, 0]] df.rename(columns={1: 'Times 20+ Achieved', 0: 'Seasons in which player scored 20+ goals (with number of league goals scored)'}, inplace=True) df.index.name = 'Player' # add total (sum of goals in brackets) df = df.merge(df_striker_tot, left_index=True, right_index=True) # set pandas option that avoids the default curtailing of long rows with ... pd.set_option('display.max_colwidth', -1) # show dataframe sorted by number of times achieved df = df.sort(['Times 20+ Achieved', 'Tot'], ascending=(False, False)) df[['Times 20+ Achieved', 'Seasons in which player scored 20+ goals (with number of league goals scored)']] # In[38]: # show youngest elite scorers who scored >20 dflfc_elite.sort('league', ascending=False).sort('age').head(5) # In[39]: # check to see if anyone has matched Robbie's feat... # show youngest elite scorers who scored >20 for ALL seasons dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.league >= 20)].sort('league', ascending=False).sort('age').head(5) # No, only God has done that! # In[40]: # show oldest elite scorers who scored >20 dflfc_elite.sort('league', ascending=False).sort('age').tail(5) # In[41]: # show most appearances of elite strikers who scored >20 dflfc_elite.sort('league', ascending=False).player.value_counts() # In[42]: # show average (mean) age of elite strikers who scored >20 dflfc_elite.sort('league', ascending=False).mean() # In[43]: # show average (median) age of elite strikers who scored >20 dflfc_elite.sort('league', ascending=False).median() # In[44]: # show most appearances of elite strikers who scored >20 for ALL seasons dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.league >= 20)].sort('league', ascending=False).player.value_counts() # In[45]: # show most appearances of elite strikers who scored >25 for ALL seasons dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.league >= 25)].sort('league', ascending=False).player.value_counts() # In[46]: # show frequency of elite scorers who scored >20 top20_freq = dflfc_elite['player'].groupby(dflfc_elite.season).count() top20_freq # In[47]: # produce list of all modern seasons (since 1962) START_MOD_YR = 1962 END_MOD_YR = 2015 all_mod_yrs = list('{}-{}'.format(i, i+1) for i in range(START_MOD_YR, END_MOD_YR)) print len(all_mod_yrs) # expect 53 all_mod_yrs[0:5] # In[48]: # identify missing years and update frequency so it shows all years missing_mod_years = [year for year in all_mod_yrs if year not in top20_freq.index.values] for yr in missing_mod_years: top20_freq[yr] = 0 top20_freq = top20_freq.sort_index() top20_freq[0:5] # In[49]: # create list of seasons when LFC were champions CHAMPS = ['1900-1901', '1905-1906', '1921-1922', '1922-1923', '1946-1947', '1963-1964',\ '1965-1966', '1972-1973', '1975-1976', '1976-1977', '1978-1979', '1979-1980',\ '1981-1982', '1982-1983', '1983-1984', '1985-1986', '1987-1988', '1989-1990'] CHAMPS[0:5] # Plot the number of players scoring 20 or more goals by season # In[50]: ax = top20_freq.plot(kind='bar', figsize=(14, 5), color='y', yticks=[0, 1, 2], ylim=(0,2.5), grid=False) ax.set_xlabel('Season') ax.set_ylabel('Number of players') ax.set_title('Number of players scoring 20 or more goals by Season') ax.text(-0.2, 2.36, 'prepared by: @terry8dolan', bbox=dict(facecolor='none', edgecolor='none', alpha=0.6)) # add vertical line for title winning seasons since 1962 champs_since_62 = [list(top20_freq.index.values).index(yr) for yr in CHAMPS if yr[0:4] >= '1962'] for x in champs_since_62: plt.axvline(x, color='r', linestyle='--') # create fake legend l1 = plt.Line2D([], [], linewidth=1, color='r', linestyle='--') l2 = plt.Line2D([], [], linewidth=5, color='y') labels = ['LFC Title Winning Seasons', 'Players scoring 20+ goals in the league'] ax.legend([l1, l2], labels, fancybox=True, shadow=True, framealpha=0.8) # plot and save current figure fig = plt.gcf() plt.show() fig.savefig('20+PlayervsGoals.png', bbox_inches='tight') # Show percentage of title winning seasons that have had striker with >20 goals # In[51]: # create a dataframe showing position in league in top flight season since 1962 dflfc_league_pos = dflfc_league[['Season', 'Pos']][dflfc_league.Season.str[0:4] >= '1962'] dflfc_league_pos.rename(columns={'Season': 'season', 'Pos': 'pos'}, inplace=True) dflfc_league_pos.head() # In[52]: # create dataframe showing position in league when players have scored >20 top20_seasons = dflfc_elite.season.unique() dflfc_elite_pos = dflfc_league_pos[dflfc_league_pos.season.isin(top20_seasons)] dflfc_elite_pos.head() # In[53]: # show position in league when players have scored >20 val_cnts = dflfc_league_pos[['season', 'pos']][dflfc_league_pos.season.isin(top20_seasons)]['pos'].value_counts() val_cnts # In[54]: # show % of title winning seasons that have had striker with >20 goals tot_champs_since_62 = len(dflfc_league_pos[(dflfc_league_pos.pos == 1)]) tot_elite_champs = len(dflfc_elite_pos[(dflfc_elite_pos.pos == 1)]) print '{} of {} ({}%) of title winning seasons have had striker with >20 goals'\ .format(tot_elite_champs, tot_champs_since_62, round(100*tot_elite_champs/tot_champs_since_62, 1)) # In[55]: # show the 4 title winning seasons since 1962 that didn't have an elite scorer dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1962') & (dflfc_scorers_tl_pos_age.season.isin(CHAMPS))].sort(['season', 'league'], ascending=[True, False])\ .groupby(dflfc_scorers_tl_pos_age.season).head(1).sort('league').head(4) # In[56]: # show top goalscorers in modern seasons when Liverpool won the league without a striker scoring >20 s = ['1976-1977', '1972-1973', '1975-1976', '1981-1982'] dflfc_scorers_tl_pos_age[dflfc_scorers_tl_pos_age.season.isin(s)].groupby('season').head(3) # Show distribution of strikers with more than 20 goals by age # In[57]: # create 'cut' of value counts pd.value_counts(pd.cut(dflfc_elite.age.values, range(19,32))) # In[58]: # create dataframe of value counts of age cut into 1 year bins from 19 to 31 MIN_AGE = int(np.floor(dflfc_elite.age.min())) MAX_AGE = int(np.ceil(dflfc_elite.age.max())) df = pd.DataFrame(pd.value_counts(pd.cut(dflfc_elite.age.values, range(MIN_AGE, MAX_AGE+1), labels=['{} to {}'.format(i, i+1) for i in range(MIN_AGE, MAX_AGE)]))) df.sort_index(inplace=True) df # In[59]: # plot ax = df.plot(kind='bar', figsize=(9, 6), grid=False, ylim=(0, 4.5), yticks=range(1,5), legend=False, color='r') ax.set_xlabel('Age') ax.set_ylabel('Number of players') ax.set_title('Number of players scoring 20 or more goals by Age') ax.text(-0.4, 4.3, 'prepared by: @terry8dolan', bbox=dict(facecolor='none', edgecolor='none', alpha=0.6)) # plot and save current figure fig = plt.gcf() plt.show() fig.savefig('20+PlayervsAge.png', bbox_inches='tight') # In[ ]: