# Scrapes constituency-wise election results from eciresults.nic.in and
# saves them as a tab-delimited UTF-8 file.
#
# I'll keep it to the standard Python 2.7 libraries, except for lxml.

import os
import re
import sys
import urllib
from hashlib import sha256

# Page whose script and hidden inputs list every state and constituency.
INDEX_URL = 'http://eciresults.nic.in/ConstituencywiseS2653.htm'
# Filled with: state code, constituency code, constituency code.
RESULT_URL = 'http://eciresults.nic.in/Constituencywise{:s}{:s}.htm?ac={:s}'

OUTPUT_FILE = '2013-result.txt'
OUTPUT_FIELDS = ['state', 'constituency', 'votes', 'candidate', 'party']

# States and codes are stored in Javascript, like this:
#   if (st.value == 'S26') {
#       strValues = document.getElementById('HdnFldChhattisgarh').value;
# This is a crude parsing of that code: group 1 is the state code, group 2
# is the state name that follows the "HdnFld" prefix.
STATE_CODE_RE = re.compile(
    'st.value *=+ *\'([^\']+).*?HdnFld([^\']+)', re.S)


def get(url):
    """Retrieves a URL as an lxml tree, cached where possible.

    The page is downloaded once into a file named '.cache.<sha256 of url>'
    (a relative path, so it lands in the current directory). Later calls
    for the same URL parse that file instead of downloading again.
    """
    # Imported here rather than at module level so that the pure parsing
    # helpers below can be imported without lxml being installed.
    from lxml.html import parse

    filename = '.cache.' + sha256(url).hexdigest()
    if not os.path.exists(filename):
        urllib.urlretrieve(url, filename)
    return parse(filename)


def parse_state_codes(tree):
    """Returns a dict mapping state name to state code.

    Only the first <script> element of the tree is examined. An empty dict
    is returned when the tree has no script or the script has no text.
    """
    scripts = tree.findall('.//script')
    if not scripts or not scripts[0].text:
        return {}
    return {state: code
            for code, state in STATE_CODE_RE.findall(scripts[0].text)}


def parse_constituencies(tree):
    """Yields one dict per constituency found in an already-parsed tree.

    Each dict has the keys state, statecode, constituency and
    constituencycode. statecode is None when the page's script does not
    mention the state.
    """
    statecode = parse_state_codes(tree)

    # Constituency codes are in hidden input fields whose id is
    # "HdnFld" + state name. The value format is:
    #   code,constituency; code,constituency; ...
    for el in tree.findall('.//input[@id]'):
        field_id = el.get('id', '').strip()
        if not field_id.startswith('HdnFld'):
            continue
        state = field_id.replace('HdnFld', '')
        # An input without a value attribute contributes no rows.
        for row in (el.get('value') or '').split(';'):
            row = row.strip()
            if not row:
                continue
            cells = row.split(',')
            if len(cells) < 2:
                # Not a "code,constituency" pair; nothing usable in it.
                continue
            yield {
                'state': state,
                'statecode': statecode.get(state),
                'constituency': cells[1],
                'constituencycode': cells[0],
            }


def constituencies(url):
    """Yields dicts with state, statecode, constituency, constituencycode.

    The page at `url` is fetched (or read from cache) when iteration
    starts, not when this function is called.
    """
    for place in parse_constituencies(get(url)):
        yield place


def parse_results(tree):
    """Yields dicts with candidate, party, votes from a parsed tree.

    Results are read from table rows inside the element whose id is
    "div1". Rows with fewer than three <td> cells (header rows, for
    instance) are ignored. A cell with no leading text yields ''.
    """
    for row in tree.findall('.//*[@id="div1"]//tr'):
        cells = row.findall('td')
        if len(cells) < 3:
            continue
        # .text is None for a cell that is empty or starts with a child
        # element, so fall back to '' instead of failing on .strip().
        candidate, party, votes = [
            (cell.text or '').strip() for cell in cells[:3]]
        yield {
            'candidate': candidate,
            'party': party,
            'votes': votes,
        }


def results(url):
    """For a constituency URL, yields dicts with candidate, party, votes."""
    for result in parse_results(get(url)):
        yield result


def scrape(index_url):
    """Returns a list of result dicts for every constituency on the index.

    Each dict combines the keys yielded by results() with those yielded by
    constituencies(). Constituencies whose state code is unknown are
    skipped, because no results URL can be built for them.
    """
    dataset = []
    for place in constituencies(index_url):
        if not place['statecode']:
            # %r keeps the message ASCII-safe even for non-ASCII names.
            sys.stderr.write('Skipping %r / %r: no state code found\n' % (
                place['state'], place['constituency']))
            continue
        url = RESULT_URL.format(
            place['statecode'],
            place['constituencycode'],
            place['constituencycode'])
        for result in results(url):
            result.update(place)
            dataset.append(result)
    return dataset


def save(dataset, filename):
    """Writes dataset to filename as a tab-delimited UTF-8 file.

    The first line holds the column names from OUTPUT_FIELDS; every later
    line holds those fields of one result dict, in the same order.
    (Sadly, the Python 2 csv module doesn't do UTF-8.)
    """
    with open(filename, 'wb') as out:
        out.write('\t'.join(OUTPUT_FIELDS) + '\n')
        for row in dataset:
            line = '\t'.join(row[f] for f in OUTPUT_FIELDS)
            out.write(line.encode('utf-8') + '\n')


if __name__ == '__main__':
    dataset = scrape(INDEX_URL)
    save(dataset, OUTPUT_FILE)