#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('load_ext', 'watermark') get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -v -d") #
#
# # Collecting Premier League Data

# ### Sections
# - [dreamteamfc.com](#dreamteamfc.com)
#   - [Getting General Player Statistics](#Getting-General-Player-Statistics)
#   - [Getting Injuries and Cards Information](#Getting-Injuries-and-Cards-Information)
#   - [Getting Player Form Information](#Getting-Player-Form-Information)
#   - [Saving the Data to CSV](#Saving-the-Data-to-CSV)
# - [espnfc.com](#espnfc.com)
#   - [Getting Team Ranks and Stats](#Getting-Team-Ranks-and-Stats)
#   - [Saving ESPN Data to CSV](#Saving-ESPN-Data-to-CSV)
#   - [Getting Top Scorer](#Getting-Top-Scorer)
#   - [Getting Top Assists](#Getting-Top-Assists)
# - [365stats.com](#365stats.com)
#   - [Getting Injury Data](#Getting-Injury-Data)
#   - [Saving 365stats Data to CSV](#Saving-365stats-Data-to-CSV)
# - [Transfermarkt.com](#Transfermarkt.com)
#   - [Getting Home and Away Teams](#Getting-Home-and-Away-Teams)
#   - [Saving Home and Away Teams to CSV](#Saving-Home-and-Away-Teams-to-CSV)
# - [premierleague.com](#premierleague.com)
# - [telegraph.co.uk](#telegraph.co.uk)
#   - [Getting Current Week Points](#Getting-Current-Week-Points)
#   - [Getting 6-Week Points](#Getting-6---Week-Points)
#   - [Saving telegraph.co.uk to CSV](#Saving-telegraph.co.uk-to-CSV)
# - [m.premierleague.com](#m.premierleague.com)
#   - [Combined Form of Previous 6 Days](#Combined-Form-of-Previous-6-Days)
#   - [Saving m.premierleague.com to CSV](#Saving-m.premierleague.com-to-CSV)
# - [fantasyfootballscout.co.uk](#fantasyfootballscout.co.uk)
#   - [Predicted Line-Ups](#Predicted-Line-Ups)
#   - [Saving fantasyfootballscout.co.uk to CSV](#Saving-fantasyfootballscout.co.uk-to-CSV)
#
#
# # dreamteamfc.com # [[back to top](#Sections)] #
#
# ## Getting General Player Statistics
# [[back to top](#Sections)]

# In[1]:

import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

# In[3]:

# Scrape the full player table from dreamteamfc into a dict keyed by name.
player_dict = {}
url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
for cell in soup.findAll("td", {"class": "tabName"}):
    player = cell.text.split('Statistics')[-1].strip()
    if not player:
        continue
    name_list.append(player)
    # The sibling <td> cells hold the remaining stats columns, in order.
    siblings = [sib.text for sib in cell.next_siblings
                if isinstance(sib, bs4.element.Tag)]
    position, team, vfm, value, points = siblings
    value = value.strip('m')  # e.g. "7.5m" -> "7.5"
    player_dict[player] = [player, position, team, vfm, value, points]

print('Found: %s' % len(name_list))
print(name_list[-1])

# In[4]:

# Reading the data into a pandas DataFrame
df = pd.DataFrame.from_dict(player_dict, orient='index')
df.columns = ['name', 'position', 'team', 'vfm', 'value', 'pts']
df[['vfm', 'value']] = df[['vfm', 'value']].astype(float)
df[['pts']] = df[['pts']].astype(int)
df.tail()

# In[5]:

df.describe()
#
#
# ## Getting Injuries and Cards Information
# [[back to top](#Sections)]

# In[6]:

# Pre-create empty columns that the injury/card scrape below fills in.
df['status'] = pd.Series('', index=df.index)
df['description'] = pd.Series('', index=df.index)
df['returns'] = pd.Series('', index=df.index)

# In[7]:

url = 'https://www.dreamteamfc.com/statistics/injuries-and-cards/ALL/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
for cell in soup.findAll("td", {"class": "tabName2"}):
    player = cell.text.split('stats')[-1].strip()
    if not player:
        continue
    name_list.append(player)
    details = [sib.text for sib in cell.next_siblings
               if isinstance(sib, bs4.element.Tag)]
    position, team, status, description, returns = details
    # Match rows by the player-name index built in the previous section.
    df.loc[df.index == player, ['status', 'description', 'returns']] = (
        status, description, returns)

print('Found: %s' % len(name_list))
print(name_list[-1])

# In[8]:

df.tail()
#
#
# ## Getting Player Form Information
# [[back to top](#Sections)]

# In[9]:

# Numeric columns for the month/week form points, default 0.
df['month_pts'] = pd.Series(0, index=df.index)
df['week_pts'] = pd.Series(0, index=df.index)

# In[11]:

url = 'https://www.dreamteamfc.com/statistics/form-guide/all'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html5lib')

name_list = []
for cell in soup.findAll("td", {"class": "tabName"}):
    player = cell.text.strip()
    if not player:
        continue
    name_list.append(player)
    stats = [sib.text for sib in cell.next_siblings
             if isinstance(sib, bs4.element.Tag)]
    # The last two sibling cells are month and week points; rows whose
    # cells are non-numeric are skipped on purpose.
    try:
        month_pts, week_pts = float(stats[-2]), float(stats[-1])
    except ValueError:
        continue
    df.loc[df.index == player, ['month_pts', 'week_pts']] = month_pts, week_pts

print('Found: %s' % len(name_list))
print(name_list[-1])

# In[13]:

# Reordering the columns
df = df[['name', 'position', 'team', 'vfm', 'value', 'pts',
         'month_pts', 'week_pts', 'status', 'description', 'returns']]
df.tail()
#
#
# ## Saving the Data to CSV
# [[back to top](#Sections)]

# In[13]:

# Getting the current time stamp for the data
from datetime import datetime

url = 'https://www.dreamteamfc.com/statistics/players/ALL/'
r = requests.get(url)
data = r.text
# FIX: the parser was previously unspecified here, so bs4 picked whichever
# parser it found first (and warned about it); use 'html5lib' explicitly,
# consistent with every other BeautifulSoup call in this script.
soup = BeautifulSoup(data, 'html5lib')

# The "points updated" note ends with a dd/mm/yyyy date; normalize it to
# a yyyymmdd stamp for the output filename.
raw_date = soup.find('li', {'class': 'pointsupdateinfo'}).text
raw_date = raw_date.split()[-1].replace('/', '').strip()
d = datetime.strptime(raw_date, '%d%m%Y').date()
date = d.strftime('%Y%m%d')
print(date)

# In[14]:

df.to_csv('../data/dreamteamfc_%s.csv' % date, index=False)
#
#
# # espnfc.com # [[back to top](#Sections)] #
#
# ## Getting Team Ranks and Stats
# [[back to top](#Sections)]

# In[2]:

import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

# In[13]:

# Downloading and parsing the data into a Python dict
team_dict = {}
url = 'http://www.espnfc.com/barclays-premier-league/23/table'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

for td in soup.findAll('td', {'class': 'pos'}):
    rank = int(td.text)
    # Sibling cells: team name followed by the numeric table columns;
    # '\xa0' (non-breaking-space) filler cells are dropped.
    res = [i.text for i in td.next_siblings
           if isinstance(i, bs4.element.Tag) and i.text != '\xa0']
    team_name = res[0].strip()
    values = [int(i) for i in res[1:]]
    team_dict[team_name] = [rank] + values

# Column legend:
#
# - Pos: POSITION
# - P: GAMES PLAYED
# - W: WINS
# - D: DRAWS
# - L: LOSSES
# - F: GOALS FOR
# - A: GOALS AGAINST
# - GD: GOAL DIFFERENCE
# - PTS: POINTS
#
# suffixes:
# - _ov: OVERALL
# - _hm: HOME GAMES
# - _aw: AWAY GAMES

# In[14]:

df = pd.DataFrame.from_dict(team_dict, orient='index')
cols = ['Pos', 'P_ov', 'W_ov', 'D_ov', 'L_ov', 'F_ov', 'A_ov',
        'W_hm', 'D_hm', 'L_hm', 'F_hm', 'A_hm',
        'W_aw', 'D_aw', 'L_aw', 'F_aw', 'A_aw', 'GD', 'PTS']
df.columns = cols
# FIX: DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement.
df = df.sort_values('Pos')
df['team'] = df.index
df = df[['team'] + cols]
df
#
#
# ## Saving ESPN Data to CSV
# [[back to top](#Sections)]

# In[12]:

# Persist the ESPN league-table snapshot next to the other scrapes.
out_path = '../data/2014_epl_day_17/espn_20141222.csv'
df.to_csv(out_path, index=False)
#
#
# ## Getting Top Scorer
# [[back to top](#Sections)]

# In[10]:

# Downloading and parsing the data into a Python dict
player_dict = {}
url = 'http://www.espnfc.com/barclays-premier-league/23/statistics/scorers'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

for td in soup.findAll('td', {'headers': 'player'}):
    name = td.text
    team, goals = [i.text for i in td.next_siblings
                   if isinstance(i, bs4.element.Tag) and i.text != '\xa0']
    player_dict[name] = [team, int(goals)]

df_essc = pd.DataFrame.from_dict(player_dict, orient='index')
df_essc['name'] = df_essc.index
df_essc.columns = ['team', 'goals', 'name']
df_essc = df_essc[['name', 'team', 'goals']]
# FIX: DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement.
df_essc.sort_values('goals', ascending=False, inplace=True)
df_essc.head()
#
#
# ## Getting Top Assists
# [[back to top](#Sections)]

# In[8]:

player_dict = {}
url = 'http://www.espnfc.com/barclays-premier-league/23/statistics/assists'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

for td in soup.findAll('td', {'headers': 'player'}):
    name = td.text
    team, assists = [i.text for i in td.next_siblings
                     if isinstance(i, bs4.element.Tag) and i.text != '\xa0']
    player_dict[name] = [team, int(assists)]

df_esas = pd.DataFrame.from_dict(player_dict, orient='index')
df_esas['name'] = df_esas.index
df_esas.columns = ['team', 'assists', 'name']
df_esas = df_esas[['name', 'team', 'assists']]
# FIX: DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement.
df_esas.sort_values('assists', ascending=False, inplace=True)
df_esas.head()
#
#
# # 365stats.com # [[back to top](#Sections)] #
#
# ## Getting Injury Data
# [[back to top](#Sections)]

# In[1]:

import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

# In[2]:

# Downloading and parsing the data into a Python dict
injury_dict = {}
url = 'http://365stats.com/football/injuries'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

for cell in soup.findAll('td', {'nowrap': 'nowrap'}):
    parts = cell.text.split()
    # Reorder "First Last [...]" into "Last [...], First" to match the
    # name format used by the other data sources.
    record = ['%s, %s' % (' '.join(parts[1:]), parts[0])]
    for sib in cell.next_siblings:
        if isinstance(sib, bs4.Tag):
            record.append(sib.text)
    # record[1:3] = (injury description, expected return)
    injury_dict[record[0]] = record[1:3]

# In[3]:

df = pd.DataFrame.from_dict(injury_dict, orient='index')
df.columns = ['injury', 'returns']
df['name'] = df.index
df = df[['name', 'injury', 'returns']]
df.tail()
#
#
# ## Saving 365stats Data to CSV
# [[back to top](#Sections)]

# In[4]:

# FIX: save without the index — the index merely duplicates the 'name'
# column, and every other CSV written by this script uses index=False.
df.to_csv('../data/2014_epl_day_17/365stats_injury_20141222.csv', index=False)
#
#
# # Transfermarkt.com # [[back to top](#Sections)] #
#
# ### Getting Home and Away Teams
# [[back to top](#Sections)]

# In[1]:

import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

# In[2]:

# Downloading and parsing the data into a Python dict.
# transfermarkt.com blocks the default python-requests User-Agent, so the
# session masquerades as a desktop browser.
# FIX: the url was previously assigned twice with the identical value;
# the duplicate has been removed.
url = 'http://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1'
s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
s.headers['Host'] = 'www.transfermarkt.com'
r = s.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

# Find tab for the upcoming fixtures: if tab 2 already shows result links
# ('ergebnis-link'), those games are finished and the upcoming fixtures
# live in tab 3 instead.
tab = 'spieltagtabs-2'
div = soup.find('div', {'id': tab})
tit = div.findAll('a', {'class': 'ergebnis-link'})
if tit:
    tab = 'spieltagtabs-3'

# Get fixtures
home = []
away = []
div = soup.find('div', {'id': tab})
for t in div.findAll('td', {'class': 'text-right no-border-rechts no-border-links'}):
    team = t.text.strip()
    if team:
        home.append(team)
for t in div.findAll('td', {'class': 'no-border-links no-border-rechts'}):
    team = t.text.strip()
    if team:
        away.append(team)

df = pd.DataFrame(home, columns=['home'])
df['away'] = away
df
#
#
# ### Saving Home and Away Teams to CSV
# [[back to top](#Sections)]

# In[11]:

# Write the fixture list (home/away pairs) to disk.
out_path = '../data/2014_epl_day_19/transfermarkt_20141227.csv'
df.to_csv(out_path, index=False)
#
#
# # premierleague.com
# [[back to top](#Sections)]

# In[2]:

import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

# In[9]:

# Downloading and parsing the data into a Python dict
url = 'http://www.premierleague.com/en-gb/matchday.html'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

home = []
away = []
for t in soup.findAll('td', {'width': '30%'}):
    team = t.text.strip().split(' v ')
    # FIX: home/away were initialized but never filled; collect the pair
    # when the cell really holds a "Home v Away" fixture (other cells are
    # skipped so they don't corrupt the lists).
    if len(team) == 2:
        home.append(team[0].strip())
        away.append(team[1].strip())
    print(team)
#
#
# # telegraph.co.uk # [[back to top](#Sections)] #
#
# ## Getting Current Week Points
# [[back to top](#Sections)]

# In[1]:

import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

# In[15]:

url = 'https://fantasyfootball.telegraph.co.uk/premierleague/players/'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

# One 'first' cell per player; its tag siblings carry the stat columns.
player_dict = {}
for cell in soup.findAll('td', {'class': 'first'}):
    player = cell.text.strip()
    player_dict[player] = [sib.text for sib in cell.next_siblings
                           if isinstance(sib, bs4.Tag)]

# parse the player dictionary
df = pd.DataFrame.from_dict(player_dict, orient='index')

# make name column
df['name'] = df.index

# assign column names and reorder columns
df.columns = ['team', 'salary', 'pts/salary', 'week_pts', 'total_pts', 'name']
df = df[['name', 'team', 'salary', 'pts/salary', 'week_pts', 'total_pts']]

# parse data into the right format: "£7.5 m" -> 7.5, counts -> int
df['salary'] = df['salary'].apply(lambda x: x.strip('£').strip(' m'))
df[['salary', 'pts/salary']] = df[['salary', 'pts/salary']].astype(float)
df[['week_pts', 'total_pts']] = df[['week_pts', 'total_pts']].astype(int)

print(df.shape)
df.tail()
#
#
# ## Getting 6-Week Points
# [[back to top](#Sections)]

# In[16]:

url = 'https://fantasyfootball.telegraph.co.uk/premierleague/formguide/'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

# Numeric column for the 6-week form points, default 0.
df['6week_pts'] = pd.Series(0, index=df.index)

for t in soup.findAll('td', {'class': 'first'}):
    player = t.text.strip()
    if not player:
        continue
    week6 = t.parent.find('td', {'class': 'sixth last'})
    # FIX: previously the raw string week6.text was assigned into the
    # int-initialized column (silently changing its dtype to object), and a
    # missing cell (week6 is None) raised AttributeError. Convert to float
    # and skip rows without a parseable value.
    if week6 is None:
        continue
    try:
        df.loc[df['name'] == player, '6week_pts'] = float(week6.text)
    except ValueError:
        continue

df.tail()
#
#
# ## Saving telegraph.co.uk to CSV
# [[back to top](#Sections)]

# In[39]:

# Persist the telegraph fantasy-football stats snapshot.
out_path = '../data/2014_epl_day_20/telegraph_20141229.csv'
df.to_csv(out_path, index=False)
#
#
# # m.premierleague.com # [[back to top](#Sections)] #
#
# ## Combined Form of Previous 6 Days
# [[back to top](#Sections)]

# In[2]:

import pandas as pd
from bs4 import BeautifulSoup
import bs4
import requests

# In[3]:

url = 'http://m.premierleague.com/en-gb/form-guide.html'
r = requests.get(url)
# Note: html5lib deals better with broken html than lxml
soup = BeautifulSoup(r.text, 'html5lib')

# Map club name -> position in the form-guide table.
team_dict = {}
for pos_cell in soup.findAll('td', {'class': 'col-pos'}):
    # NOTE(review): the cutoff is checked before inserting, so up to 21
    # entries can be collected — presumably meant to cover the 20 EPL
    # clubs; confirm whether `>= 20` was intended.
    if len(team_dict) > 20:
        break
    pos = pos_cell.text
    # The matching club name is the first sibling cell carrying the
    # 'col-club' class.
    for sib in pos_cell.next_siblings:
        if not isinstance(sib, bs4.Tag):
            continue
        if 'class' in sib.attrs and 'col-club' in sib.attrs['class']:
            team_dict[sib.text] = pos
            break

df = pd.DataFrame.from_dict(team_dict, orient='index')
df.columns = ['position-last-6-games']
df['team'] = df.index
df.tail()
#
#
# ## Saving m.premierleague.com to CSV
# [[back to top](#Sections)]

# In[26]:

# Persist the 6-game form table.
out_path = '../data/2014_epl_day_20/mpremierleague_20141230.csv'
df.to_csv(out_path, index=False)
#
#
# # fantasyfootballscout.co.uk # [[back to top](#Sections)] #
#
# ## Predicted Line-Ups # [[back to top](#Sections)] # In[31]: import pandas as pd from bs4 import BeautifulSoup import bs4 import requests # In[85]: url = 'http://www.fantasyfootballscout.co.uk/team-news/' r = requests.get(url) soup = BeautifulSoup(r.text, 'html5lib') # Note: html5lib deals better with broken html than lxml team_dict = {} for li in soup.findAll('li'): for h2 in li.findAll('h2'): team = h2.text team_dict[team] = [] for p in li.findAll('span', { 'class' : 'player-name' }): player = p.text team_dict[team].append(player) df = pd.DataFrame.from_dict(team_dict) df.tail() #
#
# ## Saving fantasyfootballscout.co.uk to CSV
# [[back to top](#Sections)]

# In[86]:

# Persist the predicted line-ups (one column per club).
out_path = '../data/epl_1314_21/fantasyfootballscout.csv'
df.to_csv(out_path, index=False)