The main portal that holds the results is http://eciresults.nic.in/. Of these pages, the constituency-wise results at http://eciresults.nic.in/ConstituencywiseS2653.htm appear to have the most detail, with fields for the candidate, party and votes in each constituency.
So let's scrape that.
# I'll keep it to standard Python 2.7 libraries, except for lxml.
import os
import re
import urllib
from hashlib import sha256
from lxml.html import parse
def get(url):
    """Retrieve a URL as a parsed lxml tree, caching the download on disk.

    The cache file name is derived from the SHA-256 of the URL, so repeated
    runs re-parse the local copy instead of re-downloading.
    """
    filename = '.cache.' + sha256(url).hexdigest()
    if not os.path.exists(filename):
        # urlretrieve writes the response body straight into the cache file;
        # its (filename, headers) return value is not needed here.
        urllib.urlretrieve(url, filename)
    return parse(filename)
def constituencies(url):
    """Yield dicts with state, statecode, constituency and constituencycode.

    `url` is the ECI results page whose embedded Javascript and hidden
    input fields list every state and constituency.
    """
    tree = get(url)
    # State names and codes are stored in inline Javascript, like this:
    #     if (st.value == 'S26') {
    #         strValues = document.getElementById('HdnFldChhattisgarh').value;
    # Crudely extract (code, state) pairs from that script block.
    # Raw string and escaped dot so `st.value` matches literally.
    pairs = re.findall(r'st\.value *=+ *\'([^\']+).*?HdnFld([^\']+)',
                       tree.findall('.//script')[0].text, re.S)
    statecode = {state: code for code, state in pairs}
    # Constituency codes are in hidden <input> fields. Value format is:
    #     code,constituency; code,constituency; ...
    for el in tree.findall('.//input[@id]'):
        field_id = el.get('id', '').strip()   # avoid shadowing builtin `id`
        if not field_id.startswith('HdnFld'):
            continue
        state = field_id.replace('HdnFld', '')
        for row in el.get('value').split(';'):
            row = row.strip()
            if not row:
                continue
            cells = row.split(',')
            if len(cells) < 2:
                # Malformed entry with no comma: skip rather than crash.
                continue
            yield {
                'state': state,
                'statecode': statecode.get(state),
                'constituency': cells[1],
                'constituencycode': cells[0],
            }
def results(url):
    """For a constituency URL, yield dicts with candidate, party and votes."""
    tree = get(url)
    # Results are inside a table in a <div id="div1">; data rows have at
    # least three cells: candidate, party, votes.
    for row in tree.findall('.//*[@id="div1"]//tr'):
        cells = row.findall('td')
        if len(cells) < 3:
            continue
        yield {
            # .text is None for empty cells or cells wrapping child markup,
            # so guard before stripping.
            'candidate': (cells[0].text or '').strip(),
            'party': (cells[1].text or '').strip(),
            'votes': (cells[2].text or '').strip(),
        }
# Collect one flat record per candidate across every constituency.
dataset = []
index_url = 'http://eciresults.nic.in/ConstituencywiseS2653.htm'
for place in constituencies(index_url):
    page = 'http://eciresults.nic.in/Constituencywise{:s}{:s}.htm?ac={:s}'.format(
        place['statecode'], place['constituencycode'], place['constituencycode'])
    for record in results(page):
        record.update(place)
        dataset.append(record)

# Save as a tab-delimited UTF-8 file. (Sadly, Python 2's csv module
# doesn't handle UTF-8.)
fields = ['state', 'constituency', 'votes', 'candidate', 'party']
with open('2013-result.txt', 'wb') as out:
    out.write('\t'.join(fields) + '\n')
    for record in dataset:
        out.write('\t'.join(record[f] for f in fields).encode('utf-8') + '\n')