# Good practice 1:
# - never do `from module import *`  =>  always name things
# requests is a very nice http lib
import io
import json

import requests
from requests import get

import pandas as pd
from bs4 import BeautifulSoup
from fiona import collection
from matplotlib.ticker import Formatter, FixedLocator  # used by later plotting cells

# --- Get subventions data ("reserve parlementaire" 2011) --------------------
r = get('http://www.regardscitoyens.org/wp-content/uploads/reserve-parlementaire-2011-parletdep.csv')
# Write the payload as UTF-8 *text*. The original encoded to bytes and wrote
# them into a text-mode file, which only works on Python 2; io.open with an
# explicit encoding is correct on both Python 2 and 3.
with io.open('../data/reserve-parlementaire-2011-parletdep.csv', 'w',
             encoding='utf-8') as f:
    f.write(r.text)

# --- Scrape deputies' activity from nosdeputes.fr ---------------------------
r = get('http://www.nosdeputes.fr/synthese')
# Name the parser explicitly so bs4 does not pick a platform-dependent one.
soup = BeautifulSoup(r.text, 'html.parser')

# Scrape: select with css attributes
table = soup.find('div', class_='tableau_synthese').table
activity_data = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')[:2]
    if len(tds) < 2:
        # Header rows use <th>, not <td>: skip them instead of crashing
        # with an IndexError on tds[0].
        continue
    activity_data.append({'name': tds[0].text, 'weeks': int(tds[1].text)})
activity_data[1]  # notebook echo: sanity-check one scraped record

# --- Import GEOFLA data -----------------------------------------------------
geofla_filepath = '../data/COMMUNES/COMMUNE_4326.SHP'


def extract_data(com):
    """Extract only the useful fields from one GEOFLA commune record.

    `com` is a fiona feature: {'id': ..., 'properties': {...}}.
    NOTE(review): the * 1000 suggests GEOFLA stores POPULATION in thousands
    of inhabitants -- confirm against the GEOFLA documentation.
    """
    properties = com['properties']
    return {
        'id': com['id'],
        'insee_code': properties['INSEE_COM'],
        'name': properties['NOM_COMM'],
        'population': properties['POPULATION'] * 1000,
        'POPULATION': properties['POPULATION'],
        'department': properties['NOM_DEPT'],
        'code_dep': properties['CODE_DEPT'],
    }


with collection(geofla_filepath) as communes:
    # DataFrames are like excel tables but far better
    com_data = pd.DataFrame([extract_data(com) for com in communes])
com_data.head()  # notebook echo

# --- Export to geojson ------------------------------------------------------
with collection(geofla_filepath) as communes:
    data = []
    for feat in communes:
        feat['type'] = 'Feature'
        data.append(feat)

my_layer = {
    "type": "FeatureCollection",
    # keep only communes with a recorded population
    "features": [com for com in data if com['properties']['POPULATION'] > 1],
}
with open("my_layer.json", "w") as f:
    f.write(json.dumps(my_layer))

# XXX: Test folium package to make a choropleth map with d3.js
# XXX: Works only with PYTHON 2.7
import folium

map_2 = folium.Map(location=[46.907, 1.662], zoom_start=6)
map_2.geo_json(geo_path="my_layer.json", data_out='data2.json', data=com_data[com_data['POPULATION']>1], columns=['id', 'POPULATION'], key_on='feature.id', fill_color='BuPu', fill_opacity=0.9, line_opacity=0.4, legend_name='POPULATION') map_2.create_map(path='map_population.html') import codecs reserve_file = codecs.open("../data/reserve-parlementaire-2011-parletdep.csv", encoding='utf-8') res_data = pd.io.parsers.read_csv(reserve_file, sep=';') # print columns res_data.keys() res_data[['Département', 'Subvention allouée', 'Parlementaire transmetteur', 'Bénéficiaire']].head() res_data['Groupe politique du parlementaire'].unique() fig, ax = subplots(figsize=(8,6)) sorted_sub = res_data['Subvention allouée'].copy() sorted_sub.sort() id_max = int(sorted_sub.size * 0.9) + 1 ax.plot(sorted_sub[:id_max], pd.np.arange(id_max) * 1. / id_max) ax.grid(True) ax.set_title(u"Répartition des subventions") print """Le nombre de commune ayant reçu des subvention est de %s sur %s communes au total."""%(res_data['Bénéficiaire'].unique().size, com_data.shape[0]) gp = res_data.groupby(['Bénéficiaire', 'Département']) agg = gp.agg({'Subvention allouée' : np.sum}) agg = agg.sort(column='Subvention allouée', ascending=False) agg.head(n=10) fig, ax = subplots(figsize=(8,6)) agg_cumsum = agg.cumsum() total_sub = agg.sum() quantiles = pd.np.arange(0.01, 1, 0.01) ax.plot(quantiles, [agg_cumsum.quantile(q)/total_sub for q in quantiles]) ax.grid(True) ax.set_title(u"Répartition des subventions par bénéficaire") ax.set_xlabel(u'% des bénéficiaires ayant reçu une subvention') ax.set_ylabel(u'% de la réserve parlementaire allouée') second_tour = pd.ExcelFile('../data/municipales_2008_grosses_communes.xls').parse('Tour 2') second_tour.columns[:15] # Get columns of votes pct and parties votes_pct = second_tour.filter(regex='\% Voix\/Ins(|\.\d{1,2})') parties = second_tour.filter(regex='Code Nuance(|\.\d{1,2})') # Get the winner id parties['winner_id'] = votes_pct.apply(lambda row: 
row.dropna().argmax(), axis=1) # And the winner party! winner_parties = parties.apply(lambda row: row.ix[row['winner_id']], axis=1) second_tour['winner_party'] = winner_parties print winner_parties.unique() # join only data of interest data = pd.DataFrame({'code_dep': res_data['Département'], 'name': res_data['Bénéficiaire'], 'subvention': res_data['Subvention allouée'], 'politician': res_data['Parlementaire transmetteur'], 'party': res_data['Groupe politique du parlementaire']}) com_data.head() # Join with municipal data # XXX: this does not work for DOM cities def insee_code(row): try: com_code = "%03d"%int(row[u'Code de la commune']) except: com_code = ("00%s"%row[u'Code de la commune'])[-3:] try: dep_code = "%02d"%int(row[u'Code du département']) except: dep_code = ("0%s"%row[u'Code du département'])[-2:] return "%s%s"%(dep_code, com_code) second_tour['insee_code'] = second_tour.apply(insee_code, axis=1) join_data_2 = join_data.merge(second_tour[['insee_code', 'winner_party']], on=['insee_code']) # Join with geofla data join_data = data.merge(com_data, on=['code_dep', 'name']) second_tour[[u'Code du département', u'Code de la commune', 'insee_code']].head(n=10) join_data_2[['subvention', 'party', 'code_dep', 'population']].head() # Merge policitical party second_tour['winner_party'].unique() print res_data['Groupe politique du parlementaire'].unique() # To simplify, we guess that have 4 parties: EG, G, C, D, ED municipal_mapping = {'LUG': 'G', 'LDVD': 'D', 'LMAJ': 'D', 'LVEC': 'G', 'LMC': 'D', 'LSOC': 'G', 'LGC': '?', 'LCOM': 'EG', 'LAUT': '?', 'LREG': '?'} party_mapping = {'UMP': 'D', 'UDI': 'C', 'NI': '?', 'SRC': '?', 'SOC': 'G', 'SOCV': 'G', 'RDSE': 'G', 'GDR': 'D', 'UC': 'C', 'CRC-SPG': '?', 'NC': '?', 'CRC': '?', 'ECO': '?'}