"""Harvest Milton Keynes Council spending CSVs, detect their encodings,
and aggregate them into one dataset with per-financial-year cumulative totals.

Originally a Jupyter notebook; the IPython shell magics (``!mkdir``,
``!head``) have been replaced with portable Python equivalents so the
script also runs outside IPython.
"""
import os
import re
import urllib.request

import numpy as np          # needed by pd.pivot_table(aggfunc=np.sum) below
import pandas as pd
import requests
from bs4 import BeautifulSoup
from chardet.universaldetector import UniversalDetector
from dateutil.parser import parse
from ggplot import *


def linkgrabber_MK(response):
    """Return [(csv_url, filename)] for 'Council Spend Data' links on the
    MK council spend-archive page.

    Parameters:
        response: a ``requests`` response for the archive page.
    """
    # Explicit parser avoids bs4's "no parser specified" warning and
    # machine-dependent parser selection.
    soup = BeautifulSoup(response.content, 'html.parser')
    results = []
    # The archive page lists its downloads inside this content block.
    resultsblock = soup.find('div', {'class': 'content right'})
    for result in resultsblock.findAll('a', text=re.compile(r'^Council Spend Data.*')):
        csvurl = result.attrs['href']
        fn = csvurl.split('/')[-1]      # last path segment is the filename
        results.append((csvurl, fn))
    return results


def harvester(links, path='.'):
    """Download each ``(url, filename)`` pair in *links* into directory *path*."""
    for csvurl, fn in links:
        print('Grabbing {0} as {1}'.format(csvurl, fn))
        urllib.request.urlretrieve(csvurl.strip('/'), '/'.join([path, fn]))


def linkgrabber_MK2(response):
    """Return [(csv_url, filename)] for ``.csv`` links on the current MK
    council spend page; relative hrefs are made absolute.

    Parameters:
        response: a ``requests`` response for the spend page.
    """
    soup = BeautifulSoup(response.content, 'html.parser')
    results = []
    resultsblock = soup.find('ul', {'class': 'documents'})
    # Raw string so ``\.`` is a literal dot, not an invalid string escape.
    for result in resultsblock.findAll('a', href=re.compile(r'.*\.csv$')):
        csvurl = 'http://www.milton-keynes.gov.uk/' + result.attrs['href']
        fn = result.attrs['href'].split('/')[-1]
        results.append((csvurl, fn))
    return results


# derived from /usr/local/bin/chardetect.py
def sniffer(fname):
    """Guess the character encoding of the file *fname*.

    Returns:
        (encoding, confidence). Falls back to ``('utf-8', 0.0)`` when
        chardet cannot make a guess — the original implicitly returned
        ``None``, which crashed the caller's tuple unpacking.
    """
    u = UniversalDetector()
    with open(fname, 'rb') as f:    # context manager: no leaked handle
        for line in f:
            u.feed(line)
            if u.done:              # stop as soon as chardet is confident
                break
    u.close()
    result = u.result
    if result['encoding']:
        return (result['encoding'], result['confidence'])
    return ('utf-8', 0.0)


# Financial year model — http://stackoverflow.com/a/26342880/454773
def getFiscalYear(dt):
    """Return the UK financial year (April–March) that date *dt* falls in."""
    year = dt.year
    if dt.month < 4:    # Jan–Mar belong to the previous financial year
        year -= 1
    return year


# --- Harvest the archive page and the current spend page -------------------
os.makedirs('mkdata', exist_ok=True)    # was: !mkdir mkdata

url = "http://www.milton-keynes.gov.uk/your-council-and-elections/council-information-and-accounts/data-performance-and-spending/Milton-Keynes-Council-spend-archive"
html2 = requests.get(url)
cc = linkgrabber_MK(html2)
harvester(cc, 'mkdata')

url = 'http://www.milton-keynes.gov.uk/your-council-and-elections/council-information-and-accounts/data-performance-and-spending/milton-keynes-council-spend'
html2 = requests.get(url)
cc = linkgrabber_MK2(html2)
harvester(cc, 'mkdata')

# Preview one downloaded file (was: !head mkdata/04-2014.csv).
# Read as bytes: the encoding is unknown until sniffed below.
with open('mkdata/04-2014.csv', 'rb') as f:
    for _ in range(10):
        line = f.readline()
        if not line:
            break
        print(line)

# Report each CSV's detected encoding.
for fname in os.listdir('mkdata'):
    if fname.endswith('csv'):
        print(fname, sniffer('mkdata/' + fname))

# Load every CSV with its sniffed encoding and concatenate into one frame.
# The .csv filter matches the sniff loop above — the original iterated every
# directory entry and would have choked on any non-CSV file.
dd = pd.DataFrame()
for fname in os.listdir('mkdata/'):
    if fname.endswith('csv'):
        enc, cert = sniffer('mkdata/' + fname)
        dd = pd.concat([dd, pd.read_csv('mkdata/' + fname, encoding=enc)])

dd.to_csv('mkSpendingData.csv', index=False)
dd.dtypes

dd['Date'] = dd['Date'].apply(parse)
dd[:5]

# DataFrame.sort() was removed in pandas 0.20 — sort_values() is the
# modern equivalent.
dd.sort_values('Date', inplace=True)
dd['Cumul Amount'] = dd['Amount'].cumsum()

dd['Financial Year'] = dd['Date'].apply(getFiscalYear)
dd['Cumul Amount Tax Year'] = dd.groupby('Financial Year')['Amount'].cumsum()

# I don't know (yet?!) how to cast an April-March tax year x-axis that would
# allow us to compare accumulated totals using differently coloured lines in
# the same chart. Also note that the incomplete 2014 tax year has an x-axis
# of fewer days than the other years...
ggplot(dd, aes(x='Date', y='Cumul Amount Tax Year')) \
    + geom_line() \
    + facet_wrap('Financial Year', scales="free_x")

pd.options.display.float_format = '{:20,.2f}'.format
# np.sum previously raised NameError: numpy was never imported.
servSpend = pd.pivot_table(dd, values='Amount',
                           index=['Service area categorisation'],
                           columns=['Financial Year'],
                           aggfunc=np.sum)
servSpend
dd.groupby(['Service area categorisation'])['Financial Year'].unique()