import geopandas as gp from geopandas.tools import sjoin import pandas as pd import matplotlib.pyplot as plt from lxml import etree import numpy as np from shapely.geometry import Point, MultiPoint from fiona.crs import from_epsg %matplotlib inline # get the shapefile here http://sensitivecities.com/extra/london.zip df_london = gp.GeoDataFrame.from_file('data/london') df_london.crs = from_epsg(4326) df_london.head() # parse XML into dict # available from openplaques: https://dl.dropbox.com/u/21695507/openplaques/plaques_20140619.xml tree = etree.parse("data/plaques_20140619.xml") root = tree.getroot() output = dict() output['raw'] = [] output['crs'] = [] output['lon'] = [] output['lat'] = [] for each in root.xpath('/openplaques/plaque/geo'): # check what we got back output['crs'].append(each.get('reference_system', None)) output['lon'].append(each.get('longitude', None)) output['lat'].append(each.get('latitude', None)) # now go back up to plaque r = each.getparent().xpath('inscription/raw')[0] if isinstance(r.text, str): output['raw'].append(r.text.lstrip().rstrip()) else: output['raw'].append(None) df = pd.DataFrame(output) df = df.replace({'raw': 0}, None) df = df.dropna() df[['lon', 'lat']] = df[['lon', 'lat']].astype(float) df.head() df_plaques = gp.GeoDataFrame({ 'geometry': [Point(x, y) for x, y in zip(df['lon'], df['lat'])], 'raw': df['raw']}) # set crs, then convert to OSGB36 if necessary df_plaques.crs = from_epsg(4326) # df_plaques['geometry'] = df_plaques['geometry'].to_crs(epsg=2770) # Perform an inner join between the geometry columns (points, wards) of the plaques and wards dataframes join_inner_df = sjoin(df_plaques, df_london, how="inner") len(join_inner_df) join_inner_df.head() # Join the plaque counts *per ward* to the wards # This works because the result of the dataframe join shows all points and their associated ward code df_london = df_london.merge( join_inner_df['CODE'].value_counts().to_frame(), how='left', left_on='CODE', right_index=True) # rename count column to something meaningful df_london.rename(columns={0: 'Plaque_Count'}, inplace=True) df_london[['NAME', 'HECTARES', 'Plaque_Count']].dropna().sort('Plaque_Count', ascending=False).head(10) # plaque density per square km df_london['Plaque_Density'] = (df_london['Plaque_Count'] / df_london.area) / 100000 df_london.replace(to_replace={'Plaque_Density': {0: np.nan}}, inplace=True) df_london[['NAME', 'HECTARES', 'Plaque_Count', 'Plaque_Density']].dropna().sort('Plaque_Density', ascending=False).head(10) # fill empty densities with 0. Not ideal. df_london['Plaque_Density'].fillna(0, inplace=True) plt.clf() fig = plt.figure(figsize=(20, 15), dpi=100) ax = fig.add_subplot(111, axisbg='w', frame_on=True) df_london.plot(column='Plaque_Density', scheme='natural_breaks', k=7, colormap='Blues', axes=ax) plt.tight_layout() plt.show()