import os
from os.path import exists
from subprocess import call
from urllib import urlopen, urlretrieve
from urlparse import urljoin

import pandas as pd
from lxml.html import parse
# Use [xpdf](http://www.foolabs.com/xpdf/) to convert PDF to text.
# This is the path of the pdftotext executable that download() runs.
PDF_TO_TEXT = 'D:/Apps/xpdf/pdftotext.exe'
# Page that is scraped for the links to each year's results;
# relative links found on it are resolved against this URL.
base = 'http://eci.nic.in/eci_main1/ElectionStatistics.aspx'
# Parsed HTML of that page. NOTE: this is fetched over the network as soon as
# the module is run.
tree = parse(urlopen(base))
# Names of the text files registered by download(), e.g. '2009.txt'
files = set()
def download(year, link):
    '''Download a year's election results from link and convert to text.

    The PDF is saved as raw/<year>.pdf and converted to raw/<year>.txt by
    running PDF_TO_TEXT with the -layout option. Each step is skipped when
    its output file already exists, so re-running the script does not repeat
    finished work. '<year>.txt' is always added to the module-level
    ``files`` set.

    year: string used as the file name stem, e.g. '2009'
    link: URL of the PDF, absolute or relative to ``base``
    '''
    out_dir = 'raw'
    # urlretrieve does not create missing folders, so make sure the output
    # folder exists before the first download
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    pdf_file = os.path.join(out_dir, year + '.pdf')
    if not exists(pdf_file):
        urlretrieve(urljoin(base, link), pdf_file)

    text_file = pdf_file.replace('.pdf', '.txt')
    if not exists(text_file):
        call([PDF_TO_TEXT, '-layout', pdf_file, text_file])

    files.add(year + '.txt')
# Go through every cell of the first table inside the element with id "c".
# The first word of a cell's text is used as the year. We pick only the
# first link in the cell, that has the constituency-wise detailed results.
for td in tree.findall('//*[@id="c"]/table[1]//td'):
    if td.text is None:
        continue
    anchor = td.find('.//a')
    if anchor is None:
        # A cell with text but no link has nothing to download
        continue
    year = td.text.strip().split(' ')[0]
    download(year, anchor.get('href'))

# 2009 results are elsewhere. Hard code the link
download('2009', 'http://eci.nic.in/eci_main/archiveofge2009/Stats/VOLI/25_ConstituencyWiseDetailedResult.pdf')
import re
import logging
# Column names of a candidate row in each year's text file, keyed by file
# name. Years that share a layout are listed together.
_LAYOUTS = (
    (('1951', '1957'),
     ['NAME', 'PARTY', 'VOTES', '%']),
    (('1962', '1967', '1971', '1977', '1980', '1984', '1985', '1989',
      '1991', '1992', '1996', '1998', '1999'),
     ['NAME', 'SEX', 'PARTY', 'VOTES', '%']),
    (('2004',),
     ['NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
      'GENERAL VOTES', 'POSTAL VOTES', 'VOTES']),
    (('2009',),
     ['#', 'NAME', 'SEX', 'AGE', 'CATEGORY', 'PARTY',
      'GENERAL VOTES', 'POSTAL VOTES', 'VOTES', '% ELECTORS', '% VOTES']),
)
fieldlist = {}
for _years, _columns in _LAYOUTS:
    for _year in _years:
        # Copy, so that every file owns an independent list
        fieldlist[_year + '.txt'] = list(_columns)
def old_text_parse(filename):
    '''Parse one year's text file into a DataFrame of candidate rows.

    filename must be a key of ``fieldlist`` (e.g. '1984.txt'). It is opened
    exactly as given, i.e. relative to the current directory.
    NOTE(review): download() writes its text files under raw/ -- confirm
    which copy of the files this is meant to read.

    Every line is tested in this order:
    - constituency header: remember the constituency name
    - state header: remember the state name
    - electors line: store the count for the current (state, constituency)
    - as many parts as fieldlist[filename]: a new candidate row
    - exactly one part: continuation of the previous candidate's NAME
    - anything else: logged as a warning and skipped

    Returns a DataFrame with the columns STATE, PC, the fields of the file
    (without '%'), YEAR (the file name up to the first '.') and ELECTORS
    (looked up by STATE and PC).
    '''
    if filename.startswith('1'):
        # Layout of the files whose name starts with '1' (1951 - 1999)
        re_state = re.compile(r'^ {25,}[A-Za-z].*')
        re_electors = re.compile(r'ELECTORS *: *(\d+)')
    else:
        # Layout of the remaining files (2004, 2009)
        re_state = re.compile(r'^[A-Z][A-Za-z& ]+$')
        re_electors = re.compile(r'Total Electors *(\d+)(.*)')
    re_constituency = re.compile(r'Constituency *:? *(\d+) *\.? *(.*)', re.IGNORECASE)
    # Serial number in front of a candidate name, e.g. "12 . "
    re_name = re.compile(r'^\d+ *\. *')
    # "(SC)" / "(ST)" marker after a constituency name
    re_scst = re.compile(r' *\((SC|ST)\)')

    fields = fieldlist[filename]
    results, electors = [], {}
    state, constituency = None, None
    # Last candidate row seen; continuation lines are appended to its NAME
    row = None

    # Use a context manager so that the file is closed even if parsing fails
    with open(filename) as handle:
        for ln, line in enumerate(handle, 1):
            match = re_constituency.match(line)
            if match:
                constituency = match.group(2).split(' ')[0].upper()
                constituency = re_scst.sub('', constituency)
                continue

            match = re_state.match(line)
            if match:
                state = line.strip().upper()
                continue

            match = re_electors.match(line)
            if match:
                electors[state, constituency] = match.group(1)
                continue

            # NOTE(review): this splits on every run of spaces, so a NAME of
            # several words yields extra parts and the line is rejected
            # below, although re_name allows spaces inside NAME. Confirm
            # whether a separator of two or more spaces was intended here
            # (and in the constituency split above).
            parts = re.split(r' +', line.strip())
            if len(parts) == len(fields):
                row = dict(zip(fields, parts))
            elif len(parts) == 1:
                if row is None:
                    # Nothing to continue yet. Blank lines also have exactly
                    # one (empty) part; skip those without a warning.
                    if parts[0]:
                        logging.warning('%s:%d: continuation before any row: %s',
                                        filename, ln, line)
                    continue
                # row is the dict already stored in results, so this
                # extends the NAME of the last appended candidate.
                # NOTE(review): a blank line also ends up here and appends
                # a single space to NAME.
                row['NAME'] = row['NAME'] + ' ' + line.strip()
                continue
            else:
                logging.warning('%s:%d: %d parts, not %d: %s',
                                filename, ln, len(parts), len(fields), line)
                continue

            row['STATE'] = state
            row['PC'] = constituency
            row['NAME'] = re_name.sub('', row['NAME'])
            results.append(row)

    results = pd.DataFrame(results).set_index(['STATE', 'PC'])
    results['YEAR'] = filename.split('.')[0]
    # electors is keyed by (state, constituency), matching the index above
    results['ELECTORS'] = pd.Series(electors)
    if '%' in results:
        del results['%']
    return results.reset_index()
# Parse the text files
logging.basicConfig(level=logging.INFO)
results = []
for filename in sorted(fieldlist):
    results.append(old_text_parse(filename))
# Stack all years into one table and keep only these columns, in this order
columns = 'YEAR STATE PC NAME SEX PARTY AGE CATEGORY VOTES ELECTORS'.split(' ')
results = pd.concat(results, ignore_index=True)[columns]

# Cleanse the results
# rename.csv has the columns Field, Source and Target: in the results column
# named Field, every value equal to Source is replaced by Target
rename = pd.read_csv('rename.csv').set_index(['Field', 'Source'])['Target']
for col in rename.index.get_level_values(0).unique():
    # Assign the result back: an in-place replace on results[col] may act on
    # a temporary copy and leave results unchanged
    results[col] = results[col].replace(rename.loc[col].to_dict())

# Calculations
results['VOTES'] = results['VOTES'].astype(float)
# '#' is the candidate's position within the constituency; the highest
# VOTES gets 1 and ties share the lowest position
results['#'] = results.groupby(['YEAR', 'STATE', 'PC'])['VOTES'].rank(method='min', ascending=False)
results = results.sort_values(['YEAR', 'STATE', 'PC', 'VOTES'],
                              ascending=[True, True, True, False])
results.to_csv('parliament.csv', index=False, float_format='%.0f')