The main portal that holds the results is http://eciresults.nic.in/. Of these pages, the constituency-wise results at http://eciresults.nic.in/ConstituencywiseS2653.htm appear to have the most detail, with fields for the candidate, party and votes in each constituency.
So let's scrape that.
# I'll keep it to standard Python 2.7 libraries, except for lxml.
import os
import re
import urllib
from hashlib import sha256
from lxml.html import parse
def get(url):
    """Retrieve a URL as a parsed lxml tree, caching the download on disk.

    The cache file name is derived from the SHA-256 of the URL, so repeated
    runs re-parse the local copy instead of re-downloading.
    """
    filename = '.cache.' + sha256(url).hexdigest()
    if not os.path.exists(filename):
        # urlretrieve writes the response body straight into the cache file;
        # its (filename, headers) return value is not needed here.
        urllib.urlretrieve(url, filename)
    return parse(filename)
def constituencies(url):
    """Yield dicts with state, statecode, constituency and constituencycode.

    `url` is the ECI results page whose embedded Javascript and hidden
    input fields list every state and constituency.
    """
    tree = get(url)
    # State names and codes are stored in inline Javascript, like this:
    #     if (st.value == 'S26') {
    #         strValues = document.getElementById('HdnFldChhattisgarh').value;
    # Crudely extract (code, state) pairs from that script block.
    # Raw string and escaped dot so `st.value` matches literally.
    pairs = re.findall(r'st\.value *=+ *\'([^\']+).*?HdnFld([^\']+)',
                       tree.findall('.//script')[0].text, re.S)
    statecode = {state: code for code, state in pairs}
    # Constituency codes are in hidden <input> fields. Value format is:
    #     code,constituency; code,constituency; ...
    for el in tree.findall('.//input[@id]'):
        field_id = el.get('id', '').strip()   # avoid shadowing builtin `id`
        if not field_id.startswith('HdnFld'):
            continue
        state = field_id.replace('HdnFld', '')
        for row in el.get('value').split(';'):
            row = row.strip()
            if not row:
                continue
            cells = row.split(',')
            if len(cells) < 2:
                # Malformed entry with no comma: skip rather than crash.
                continue
            yield {
                'state': state,
                'statecode': statecode.get(state),
                'constituency': cells[1],
                'constituencycode': cells[0],
            }
def results(url):
    """For a constituency URL, yield dicts with candidate, party and votes."""
    tree = get(url)
    # Results are inside a table in a <div id="div1">; data rows have at
    # least three cells: candidate, party, votes.
    for row in tree.findall('.//*[@id="div1"]//tr'):
        cells = row.findall('td')
        if len(cells) < 3:
            continue
        yield {
            # .text is None for empty cells or cells wrapping child markup,
            # so guard before stripping.
            'candidate': (cells[0].text or '').strip(),
            'party': (cells[1].text or '').strip(),
            'votes': (cells[2].text or '').strip(),
        }
# Collect one flat record per candidate across every constituency.
dataset = []
index_url = 'http://eciresults.nic.in/ConstituencywiseS2653.htm'
for place in constituencies(index_url):
    page = 'http://eciresults.nic.in/Constituencywise{:s}{:s}.htm?ac={:s}'.format(
        place['statecode'], place['constituencycode'], place['constituencycode'])
    for record in results(page):
        record.update(place)
        dataset.append(record)

# Save as a tab-delimited UTF-8 file. (Sadly, Python 2's csv module
# doesn't handle UTF-8.)
fields = ['state', 'constituency', 'votes', 'candidate', 'party']
with open('2013-result.txt', 'wb') as out:
    out.write('\t'.join(fields) + '\n')
    for record in dataset:
        out.write('\t'.join(record[f] for f in fields).encode('utf-8') + '\n')