http://eci.nic.in/eci_main1/ElectionStatistics.aspx has PDFs of past election results. Let's first download them all and convert to text.
import os
from urllib import urlopen, urlretrieve
from urlparse import urljoin
from lxml.html import parse
from os.path import exists
from subprocess import call
# Use [xpdf](http://www.foolabs.com/xpdf/) to convert PDF to text.
# The location of the pdftotext binary is machine-specific, so it can be
# overridden through the PDF_TO_TEXT environment variable. The original
# hard-coded path remains the default.
PDF_TO_TEXT = os.environ.get('PDF_TO_TEXT', 'D:/Apps/xpdf/pdftotext.exe')
# Page that links to the PDFs of past election results
base = 'http://eci.nic.in/eci_main1/ElectionStatistics.aspx'
# Parsed HTML of that page (fetched over the network when this line runs)
tree = parse(urlopen(base))
# Text file names registered by download()
files = set()
def download(year, link):
    '''Download a year's election results from link and convert to text.

    The PDF is saved as raw/<year>.pdf and converted by PDF_TO_TEXT into
    raw/<year>.txt. Each step is skipped when its output file already
    exists, so re-running does not download or convert again.

    year: string used as the base name of the output files.
    link: URL of the PDF, absolute or relative to ``base``.

    Adds '<year>.txt' to the module-level ``files`` set.
    '''
    pdf_file = os.path.join('raw', year + '.pdf')
    if not exists(pdf_file):
        # urlretrieve does not create missing directories
        folder = os.path.dirname(pdf_file)
        if not exists(folder):
            os.makedirs(folder)
        urlretrieve(urljoin(base, link), pdf_file)
    # Swap only the extension; str.replace would also rewrite a '.pdf'
    # occurring earlier in the path
    text_file = os.path.splitext(pdf_file)[0] + '.txt'
    if not exists(text_file):
        # -layout is passed to the converter along with input and output paths
        call([PDF_TO_TEXT, '-layout', pdf_file, text_file])
    files.add(year + '.txt')
# Get all rows from the first table in <div id="c">
# We pick only the first link, that has the
# constituency-wise detailed results
for td in tree.findall('//*[@id="c"]/table[1]//td'):
    if td.text is None:
        continue
    # The first space-separated word of the cell text is used as the year
    year = td.text.strip().split(' ')[0]
    anchor = td.find('.//a')
    # Skip cells that hold only whitespace or that carry no usable link;
    # there is nothing to download for them
    if not year or anchor is None or anchor.get('href') is None:
        continue
    download(year, anchor.get('href'))
# 2009 results are elsewhere. Hard code the link
download('2009', 'http://eci.nic.in/eci_main/archiveofge2009/Stats/VOLI/25_ConstituencyWiseDetailedResult.pdf')
Now, we'll convert these into a CSV file with the relevant data.
At this point, there's some manual munging of the text files. Ideally I would have avoided this, but manually processing some of this content is much faster than writing a program to do it.
I'll document what I did at some point. But a few notes in the meantime:
import logging
import re

import pandas as pd
# Column names of the candidate rows in each year's text file, in the order
# the columns appear on a line. Keys are the text file names.
fieldlist = {}
# The two earliest files have no SEX column
fieldlist.update(
    ('%d.txt' % yr, ['NAME', 'PARTY', 'VOTES', '%'])
    for yr in (1951, 1957)
)
# 1962 through 1999 share one layout
fieldlist.update(
    ('%d.txt' % yr, ['NAME', 'SEX', 'PARTY', 'VOTES', '%'])
    for yr in (1962, 1967, 1971, 1977, 1980, 1984, 1985,
               1989, 1991, 1992, 1996, 1998, 1999)
)
# The two latest files each have their own, wider layout
fieldlist['2004.txt'] = [
    'NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
    'GENERAL VOTES', 'POSTAL VOTES', 'VOTES',
]
fieldlist['2009.txt'] = [
    '#', 'NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
    'GENERAL VOTES', 'POSTAL VOTES', 'VOTES', '% ELECTORS', '% VOTES',
]
def old_text_parse(filename):
    '''Parse one year's text file into a DataFrame of candidate rows.

    filename: a key of ``fieldlist`` (e.g. '1951.txt'); it is also the path
        of the file that is opened. Raises KeyError for an unknown name.

    Returns a DataFrame with one row per candidate line. It holds the
    columns listed in fieldlist[filename] (without '%'), plus STATE, PC,
    YEAR (the file name up to the first '.') and ELECTORS. Parsed values
    are left as strings. Lines that fit no known pattern are logged as
    warnings and skipped.
    '''
    # Files whose name starts with '1' use different patterns for the state
    # heading and the elector count than the remaining files
    if filename.startswith('1'):
        re_state = re.compile(r'^ {25,}[A-Za-z].*')
        re_electors = re.compile(r'ELECTORS *: *(\d+)')
    else:
        re_state = re.compile(r'^[A-Z][A-Za-z& ]+$')
        re_electors = re.compile(r'Total Electors *(\d+)(.*)')
    re_constituency = re.compile(r'Constituency *:? *(\d+) *\.? *(.*)', re.IGNORECASE)
    # Leading serial number ("12. ") in front of a candidate name
    re_name = re.compile(r'^\d+ *\. *')
    # Reservation marker after a constituency name
    re_scst = re.compile(r' *\((SC|ST)\)')

    fields = fieldlist[filename]
    rows, electors = [], {}
    state, constituency = None, None
    # Most recent candidate row; None until the first one has been parsed
    row = None

    # The context manager closes the file even if parsing raises
    with open(filename) as handle:
        for ln, line in enumerate(handle, 1):
            text = line.strip()
            # Blank lines carry no data. Without this guard they would be
            # treated as a one-part name continuation, padding the previous
            # NAME with spaces or failing when no row exists yet.
            if not text:
                continue

            match = re_constituency.match(line)
            if match:
                constituency = match.group(2).split(' ')[0].upper()
                constituency = re_scst.sub('', constituency)
                continue

            match = re_state.match(line)
            if match:
                state = text.upper()
                continue

            match = re_electors.match(line)
            if match:
                electors[state, constituency] = match.group(1)
                continue

            # NOTE(review): columns are split on single spaces here, while
            # names may themselves contain spaces - confirm this separator
            # matches the munged text files.
            parts = re.split(r' +', text)
            if len(parts) == len(fields):
                row = dict(zip(fields, parts))
            elif len(parts) == 1 and row is not None:
                # A lone token continues the previous candidate's name. The
                # dict is the one already stored in rows, so it is updated
                # in place.
                row['NAME'] = row['NAME'] + ' ' + text
                continue
            else:
                logging.warning('%s:%d: %d parts, not %d: %s',
                                filename, ln, len(parts), len(fields), line)
                continue

            row['STATE'] = state
            row['PC'] = constituency
            row['NAME'] = re_name.sub('', row['NAME'])
            rows.append(row)

    results = pd.DataFrame(rows).set_index(['STATE', 'PC'])
    results['YEAR'] = filename.split('.')[0]
    # electors is keyed by (state, constituency), so the Series aligns with
    # the (STATE, PC) index
    results['ELECTORS'] = pd.Series(electors)
    if '%' in results:
        del results['%']
    return results.reset_index()
# Parse the text files
logging.basicConfig(level=logging.INFO)
results = []
for filename in sorted(fieldlist):
    results.append(old_text_parse(filename))
# Stack all years and keep only the columns that go into the output
results = pd.concat(results, ignore_index=True)['YEAR STATE PC NAME SEX PARTY AGE CATEGORY VOTES ELECTORS'.split(' ')]

# Cleanse the results
# rename.csv has Field, Source and Target columns: in the results column named
# by Field, values equal to Source are replaced by Target
rename = pd.read_csv('rename.csv').set_index(['Field', 'Source'])['Target']
for col in rename.index.get_level_values(0).unique():
    # Assign the replaced column back. An in-place replace on results[col]
    # may act on a temporary copy and be lost. .loc replaces the removed .ix.
    results[col] = results[col].replace(rename.loc[col].to_dict())

# Calculations
results['VOTES'] = results['VOTES'].astype(float)
# Rank of each candidate within a constituency for a year, most votes first
results['#'] = results.groupby(['YEAR', 'STATE', 'PC'])['VOTES'].rank(method='min', ascending=False)
# DataFrame.sort was removed from pandas; sort_values is its replacement
results = results.sort_values(['YEAR', 'STATE', 'PC', 'VOTES'], ascending=[True, True, True, False])
results.to_csv('parliament.csv', index=False, float_format='%.0f')