# I'll keep it to standard Python 2.7 libraries, but for lxml.
import os
import re
import urllib
from hashlib import sha256

from lxml.html import parse


# Crude matcher for the state-selection Javascript: captures the state code
# from "st.value == '<code>'" and the state name that follows "HdnFld".
_STATE_CODE_RE = re.compile(r"st.value *=+ *'([^']+).*?HdnFld([^']+)", re.S)

# Prefix of the hidden input ids that hold per-state constituency lists.
_FIELD_PREFIX = 'HdnFld'


def get(url):
    """Retrieve a URL as an lxml tree, cached on disk where possible.

    The page is saved in the current directory as ``.cache.<sha256 of url>``;
    later calls for the same URL parse that file instead of downloading again.
    """
    # Hash bytes explicitly so a non-ASCII unicode URL does not blow up.
    # ASCII URLs produce the same digest as before, so old caches stay valid.
    key = url if isinstance(url, bytes) else url.encode('utf-8')
    filename = '.cache.' + sha256(key).hexdigest()
    if not os.path.exists(filename):
        # Download to a scratch file and rename only on success; otherwise an
        # interrupted download would be mistaken for a valid cache entry on
        # every subsequent run.
        partial = filename + '.part'
        try:
            urllib.urlretrieve(url, partial)
        except Exception:
            if os.path.exists(partial):
                os.remove(partial)
            raise
        os.rename(partial, filename)
    return parse(filename)


def _state_codes(tree):
    """Return a {state name: state code} dict parsed from inline scripts."""
    # States and codes are stored in Javascript, like this:
    #   if (st.value == 'S26') {
    #       strValues = document.getElementById('HdnFldChhattisgarh').value;
    # This is a crude parsing of that code.
    codes = {}
    for script in tree.findall('.//script'):
        # script.text is None for scripts without inline content.
        pairs = _STATE_CODE_RE.findall(script.text or '')
        found = {state: code for code, state in pairs}
        for state, code in found.items():
            # Earlier scripts take precedence over later ones.
            codes.setdefault(state, code)
    return codes


def _split_rows(value):
    """Yield (code, name) pairs from a 'code,name; code,name; ...' string."""
    for row in value.split(';'):
        # Split on the first comma only, so a name containing a comma is kept
        # whole. Blank entries and entries without a comma are skipped.
        code, sep, name = row.strip().partition(',')
        if sep:
            yield code, name


def constituencies(url):
    """Yield one dict per constituency listed on the page at ``url``.

    Each dict has the keys ``state``, ``statecode``, ``constituency`` and
    ``constituencycode``. ``statecode`` is None when no code for the state
    was found in the page's scripts.
    """
    tree = get(url)
    statecode = _state_codes(tree)

    # Constituency codes are in hidden input fields. Format is:
    #   code,constituency; code,constituency; ...
    for el in tree.findall('.//input[@id]'):
        field_id = el.get('id', '').strip()
        if not field_id.startswith(_FIELD_PREFIX):
            continue
        # Drop only the leading prefix; the remainder is the state name.
        state = field_id[len(_FIELD_PREFIX):]
        # An input without a value attribute simply contributes no rows.
        for code, name in _split_rows(el.get('value') or ''):
            yield {
                'state': state,
                'statecode': statecode.get(state),
                'constituency': name,
                'constituencycode': code
            }


def results(url):
    """For a constituency URL, yields dicts with candidate, party, votes."""
    tree = get(url)
    # Results are inside a table in a