http://www.linguasport.com/futbol/internacional/mundial/seekff.asp lists all World Cup match results in a fairly simple paginated HTML table structure.
import hashlib
import os
import re

import pandas as pd
import requests
from lxml.html import parse
# One shared HTTP session with a desktop-browser User-Agent, plus a local
# on-disk cache directory for every page we download.
session = requests.Session()
ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36'
if not os.path.exists('.cache'):
    os.makedirs('.cache')
def get(url):
    """Return the parsed lxml tree for *url*, caching the raw HTML on disk.

    The cache key is the MD5 hex digest of the URL; pages are stored as
    UTF-8 under .cache/ so repeated runs never re-fetch a page.
    """
    # md5 requires bytes, so encode the URL (the original Python 2 code
    # passed a str directly).
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    path = os.path.join('.cache', digest + '.html')
    if not os.path.exists(path):
        print(url)  # progress indicator: only printed on a cache miss
        response = session.get(url, headers={'User-Agent': ua})
        with open(path, 'w', encoding='utf-8') as fd:
            fd.write(response.text)
    # Open in binary and let lxml sniff the encoding; the context manager
    # closes the handle (the original parse(open(path)) leaked it).
    with open(path, 'rb') as fd:
        return parse(fd)
# Accumulates one dict per match row across all pages.
result = []

def process(page):
    """Scrape one paginated results page into the module-level *result* list.

    Returns the number of 7-column match rows found on the page; 0 signals
    that *page* is past the end of the pagination.
    """
    headers = 'edition,year,venue,round,team1,team2,score'.split(',')
    url = 'http://www.linguasport.com/futbol/internacional/mundial/seekff.asp'
    if page > 1:
        # Page 1 is the bare URL; later pages use a ?pn= query parameter.
        url += '?pn=%d' % page
    tree = get(url)
    count = 0
    for row in tree.findall('.//tr')[1:]:  # [1:] skips the header row
        cells = [cell.text_content().strip() for cell in row.findall('.//td')]
        if len(cells) == 7:  # ignore non-data rows (spacers, footers)
            match = dict(zip(headers, cells))
            # The row's anchor links to the match's detail page.
            match['url'] = row.find('.//a').get('href')
            result.append(match)
            count += 1
    return count
# Walk pages until one yields zero rows, then dump everything scraped.
page = 1
while True:
    count = process(page)
    if count == 0:
        break
    page += 1

data = pd.DataFrame(result)
data.to_csv('matches.csv', index=False, encoding='utf-8')
We restrict ourselves to the FS (final-stage) detail pages.
# Only the FS (final-stage) detail pages; drop any #fragment and dedupe.
urls = {url.split('#')[0] for url in data['url'].unique() if '_FS' in url}
# Collapse any run of whitespace (including newlines) to a single space.
re_space = re.compile(r'\s+', re.DOTALL)
# "1-0 (SURNAME 23)" -> running score, scorer, minute ("45+2" style allowed).
re_goals = re.compile(r'(\d)\-(\d) +\((\D+)([\d\+]+).*?\)', re.DOTALL)
head_goals = ['team1score', 'team2score', 'player', 'minute']
# "SURNAME (XXX 23)," -> booked player and minute.
# NOTE(review): the pattern requires a trailing ',' or '/', so a final entry
# without a separator would be dropped -- confirm against the source pages.
re_books = re.compile(r'([^\(]+)\(.*?([\d\+]+).*?\) *[,/]', re.DOTALL)
head_books = ['player', 'minute']
base = 'http://www.linguasport.com/futbol/internacional/mundial/'
# Output accumulators, one dict per goal / game / booking.
goals = []
games = []
books = []
# Parse each final-stage page: each game is one MsoNormalTable.
for url in urls:
    tree = get(base + url)
    for table in tree.findall('.//table[@class="MsoNormalTable"]'):
        game = {
            'url': url,
            # The table's first row holds the stage / round label.
            'stage': table.find('.//tr[1]//td').text_content(),
        }
        games.append(game)
        game['game_id'] = len(games)  # 1-based id, assigned after append
        # Second row: "LABEL: value" paragraphs (DATE, VENUE, GOALS, ...).
        for para in table.findall('.//tr[2]//p'):
            text = para.text_content().strip()
            match = re.match(r'([A-Z]+): +(.*)', text, re.DOTALL)
            if match:
                game[match.group(1)] = re_space.sub(' ', match.group(2)).strip()
        if 'ATTENDANCE' in game:
            # Strip thousands separators, e.g. "93.000" -> "93000".
            game['ATTENDANCE'] = game['ATTENDANCE'].replace('.', '')
        if 'GOALS' in game:
            # Explode the GOALS text into one record per goal.
            for groups in re_goals.findall(game['GOALS']):
                goal = dict(zip(head_goals, [g.strip() for g in groups]))
                goal['game_id'] = game['game_id']
                goals.append(goal)
            del game['GOALS']
        if 'BOOKED' in game:
            # Explode the BOOKED text into one record per booking.
            for groups in re_books.findall(game['BOOKED']):
                book = dict(zip(head_books, [g.strip() for g in groups]))
                book['game_id'] = game['game_id']
                books.append(book)
            del game['BOOKED']
        # Third row holds "TEAM1 - TEAM2". Normalise the en dash to a plain
        # hyphen before splitting; handle both a correctly decoded U+2013 and
        # the UTF-8-as-Latin-1 mojibake form the original code targeted.
        teams = re_space.sub(' ', table.find('.//tr[3]//td[3]').text_content())
        teams = teams.replace(u'\u2013', '-').replace(u'\xe2\x80\x93', '-')
        teams = teams.strip().split(' - ')
        if len(teams) == 2:
            game['team1'], game['team2'] = teams
# Persist the three scraped tables as UTF-8 CSV files.
for name, rows in (('games', games), ('goals', goals), ('books', books)):
    pd.DataFrame(rows).to_csv('%s.csv' % name, index=False, encoding='utf-8')