http://www.linguasport.com/futbol/internacional/mundial/seekff.asp lists all World Cup match results in a fairly simple paginated HTML table structure.
import hashlib
import os
import re

import pandas as pd
import requests
from lxml.html import parse
# One shared HTTP session with a desktop-browser User-Agent, plus a local
# on-disk cache directory for every page we download.
session = requests.Session()
ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36'
if not os.path.exists('.cache'):
    os.makedirs('.cache')
def get(url):
    """Return the parsed lxml tree for *url*, caching the raw HTML on disk.

    The cache key is the MD5 hex digest of the URL; pages are stored as
    UTF-8 under .cache/ so repeated runs never re-fetch a page.
    """
    # md5 requires bytes, so encode the URL (the original Python 2 code
    # passed a str directly).
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    path = os.path.join('.cache', digest + '.html')
    if not os.path.exists(path):
        print(url)  # progress indicator: only printed on a cache miss
        response = session.get(url, headers={'User-Agent': ua})
        with open(path, 'w', encoding='utf-8') as fd:
            fd.write(response.text)
    # Open in binary and let lxml sniff the encoding; the context manager
    # closes the handle (the original parse(open(path)) leaked it).
    with open(path, 'rb') as fd:
        return parse(fd)
# Accumulates one dict per match row across all pages.
result = []

def process(page):
    """Scrape one paginated results page into the module-level *result* list.

    Returns the number of 7-column match rows found on the page; 0 signals
    that *page* is past the end of the pagination.
    """
    headers = 'edition,year,venue,round,team1,team2,score'.split(',')
    url = 'http://www.linguasport.com/futbol/internacional/mundial/seekff.asp'
    if page > 1:
        # Page 1 is the bare URL; later pages use a ?pn= query parameter.
        url += '?pn=%d' % page
    tree = get(url)
    count = 0
    for row in tree.findall('.//tr')[1:]:  # [1:] skips the header row
        cells = [cell.text_content().strip() for cell in row.findall('.//td')]
        if len(cells) == 7:  # ignore non-data rows (spacers, footers)
            match = dict(zip(headers, cells))
            # The row's anchor links to the match's detail page.
            match['url'] = row.find('.//a').get('href')
            result.append(match)
            count += 1
    return count
# Walk pages until one yields zero rows, then dump everything scraped.
page = 1
while True:
    count = process(page)
    if count == 0:
        break
    page += 1

data = pd.DataFrame(result)
data.to_csv('matches.csv', index=False, encoding='utf-8')
We restrict ourselves to the FS (final-stage) detail pages.
# Only the FS (final-stage) detail pages; drop any #fragment and dedupe.
urls = {url.split('#')[0] for url in data['url'].unique() if '_FS' in url}
# Collapse any run of whitespace (including newlines) to a single space.
re_space = re.compile(r'\s+', re.DOTALL)
# "1-0 (SURNAME 23)" -> running score, scorer, minute ("45+2" style allowed).
re_goals = re.compile(r'(\d)\-(\d) +\((\D+)([\d\+]+).*?\)', re.DOTALL)
head_goals = ['team1score', 'team2score', 'player', 'minute']
# "SURNAME (XXX 23)," -> booked player and minute.
# NOTE(review): the pattern requires a trailing ',' or '/', so a final entry
# without a separator would be dropped -- confirm against the source pages.
re_books = re.compile(r'([^\(]+)\(.*?([\d\+]+).*?\) *[,/]', re.DOTALL)
head_books = ['player', 'minute']
base = 'http://www.linguasport.com/futbol/internacional/mundial/'
# Output accumulators, one dict per goal / game / booking.
goals = []
games = []
books = []
# Parse each final-stage page: each game is one MsoNormalTable.
for url in urls:
    tree = get(base + url)
    for table in tree.findall('.//table[@class="MsoNormalTable"]'):
        game = {
            'url': url,
            # The table's first row holds the stage / round label.
            'stage': table.find('.//tr[1]//td').text_content(),
        }
        games.append(game)
        game['game_id'] = len(games)  # 1-based id, assigned after append
        # Second row: "LABEL: value" paragraphs (DATE, VENUE, GOALS, ...).
        for para in table.findall('.//tr[2]//p'):
            text = para.text_content().strip()
            match = re.match(r'([A-Z]+): +(.*)', text, re.DOTALL)
            if match:
                game[match.group(1)] = re_space.sub(' ', match.group(2)).strip()
        if 'ATTENDANCE' in game:
            # Strip thousands separators, e.g. "93.000" -> "93000".
            game['ATTENDANCE'] = game['ATTENDANCE'].replace('.', '')
        if 'GOALS' in game:
            # Explode the GOALS text into one record per goal.
            for groups in re_goals.findall(game['GOALS']):
                goal = dict(zip(head_goals, [g.strip() for g in groups]))
                goal['game_id'] = game['game_id']
                goals.append(goal)
            del game['GOALS']
        if 'BOOKED' in game:
            # Explode the BOOKED text into one record per booking.
            for groups in re_books.findall(game['BOOKED']):
                book = dict(zip(head_books, [g.strip() for g in groups]))
                book['game_id'] = game['game_id']
                books.append(book)
            del game['BOOKED']
        # Third row holds "TEAM1 - TEAM2". Normalise the en dash to a plain
        # hyphen before splitting; handle both a correctly decoded U+2013 and
        # the UTF-8-as-Latin-1 mojibake form the original code targeted.
        teams = re_space.sub(' ', table.find('.//tr[3]//td[3]').text_content())
        teams = teams.replace(u'\u2013', '-').replace(u'\xe2\x80\x93', '-')
        teams = teams.strip().split(' - ')
        if len(teams) == 2:
            game['team1'], game['team2'] = teams
# Persist the three scraped tables as UTF-8 CSV files.
for name, rows in (('games', games), ('goals', goals), ('books', books)):
    pd.DataFrame(rows).to_csv('%s.csv' % name, index=False, encoding='utf-8')