#!/usr/bin/env python # coding: utf-8 # # What NYC boroughs should we avoid? # ## (Or, what borough has the fastest taxi drivers?) # In[1]: from __future__ import division # In[2]: import sqlparse def psql(query): print( sqlparse.format( str(query.compile( compile_kwargs=dict(literal_binds=True) )), reindent=True ) ) # In[46]: get_ipython().run_line_magic('reload_ext', 'autotime') get_ipython().run_line_magic('matplotlib', 'nbagg') from blaze import by, compute, transform, Data from odo import odo, drop, resource, Temp, CSV import matplotlib.pyplot as plt from matplotlib.patches import Rectangle from IPython.display import Image from sqlalchemy import func as f from bokeh.charts import Histogram from bokeh.charts import defaults, vplot, hplot, show, output_notebook output_notebook() # ## Load up NYC Taxi from Postgres # In[4]: d = Data('postgresql://localhost/nyc::bigsample') # In[5]: d.head(5) # ## What is the bounding box of NYC? # In[6]: from IPython.display import HTML, IFrame # In[7]: IFrame( 'https://www.maptechnica.com/us-city-boundary-map/city/New%20York/state/NY/cityid/3651000', width=800, height=600 ) # ## Postgres has some funky operators for doing cool things # In[8]: get_ipython().system("psql nyc -c 'select point(40.477399, -74.259090) <@> point(40.917577, -73.700272) as box_dist'") # In[9]: max_distance = d.data.bind.execute( sa.select([ f.point(40.477399, -74.259090).op('<@>')(f.point(40.917577, -73.700272)) ]) ).scalar() max_distance # ## Reduce the data to the area defined as NYC # In[10]: # http://hafen.github.io/taxi/#reading-in-to-r min_lat, max_lat = 40.477399, 40.917577 min_lon, max_lon = -74.259090, -73.700272 valid = d[ (d.trip_distance > 0) & (d.trip_distance <= max_distance) & (d.trip_time_in_secs > 0) & (d.passenger_count > 0) & (d.passenger_count <= 5) & (d.pickup_latitude >= min_lat) & (d.pickup_latitude <= max_lat) & (d.dropoff_latitude >= min_lat) & (d.dropoff_latitude <= max_lat) & (d.pickup_longitude >= min_lon) & 
(d.pickup_longitude <= max_lon) & (d.dropoff_longitude >= min_lon) & (d.dropoff_longitude <= max_lon) ] # In[11]: psql(compute(valid)) # ## Add a column containing the average speed in MPH # In[12]: trip_time_in_hours = valid.trip_time_in_secs.coerce('float64') / 3600.0 calcd = transform(valid, avg_speed_in_mph=valid.trip_distance / trip_time_in_hours) # In[13]: calcd.avg_speed_in_mph.head(5) # In[14]: s = odo(calcd.avg_speed_in_mph, pd.Series) # In[55]: show(Histogram( s.loc[s > 120], title='Average Speed in MPH', bins=100, xlabel='Average Speed in MPH', ylabel='Count' )) # In[ ]: p = Scatter(df, x='mpg', y='hp', color='cyl', title="HP vs MPG (shaded by CYL)", xlabel="Miles Per Gallon", ylabel="Horsepower") # ### Assume that > 120 MPH is invalid # In[16]: data = calcd[(calcd.avg_speed_in_mph > 5) & (calcd.avg_speed_in_mph <= 120)][ [ 'avg_speed_in_mph', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude' ] ] # In[17]: data.head(5) # In[18]: psql(compute(data)) # ## A small aside, with `odo` # ### Let's look at the distribution of rides by borough # # #### Q: Where do we get borough geolocation data? # #### A: From NYC's own API! 
# In[19]:

import requests
import re            # FIX: used by find_borough_name below but was never imported
import pandas as pd  # FIX: used below (pd.read_csv) but was never imported
from bokeh.charts import Scatter  # FIX: Scatter is used below but was not imported

# In[20]:


def get_kml(wifi):
    """Fetch the NYC borough-boundary KML.

    When ``wifi`` is true, download the KML from the NYC open-data API and
    cache it to ``cityofnewyork.kml``; otherwise read the cached copy.
    """
    if wifi:
        result = requests.get(
            'https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm',
            params=dict(method='export', format='KML')
        ).text
        # FIX(style): name the handle `kml_file`, not `f`, so it cannot be
        # confused with the module-level `f` (sqlalchemy.func).
        with open('cityofnewyork.kml', 'wt') as kml_file:
            kml_file.write(result)
        return result
    else:
        with open('cityofnewyork.kml', 'rt') as kml_file:
            return kml_file.read()


# In[21]:

raw = get_kml(wifi=False)

# In[22]:

import lxml
import lxml.etree

# #### Parse the KML with `lxml`

# In[23]:

kml = lxml.etree.fromstring(raw.encode('utf8'))

# In[24]:

print(raw[:1000])

# #### `Placemark` elements delineate boroughs

# In[25]:

placemarks = kml.xpath("//*[local-name()='Placemark']")

# #### Create some functions to pull out the borough names and points

# In[26]:

from toolz.curried import map
from toolz import pipe, concat
import io


def find_borough_name(text):
    """Return the first borough name mentioned in *text*.

    Raises AttributeError if *text* names no borough (re.search returns None).
    """
    return re.search(
        r'.*(Brooklyn|Manhattan|Bronx|Staten Island|Queens).*', text
    ).group(1)


def parse_coords(p, name):
    """Turn a Placemark's coordinate text into 'lon,lat,name' CSV lines."""
    coords = p.xpath('.//*[local-name()="coordinates"]/text()')
    return pipe(
        coords,
        map(str.split),
        map(lambda x: '\n'.join(map(lambda y: '%s,%s' % (y, name), x))),
        '\n'.join
    )


def parse_kml(placemarks):
    """Convert all Placemarks into one CSV string of lon,lat,borough rows."""
    result = []
    for p in placemarks:
        desc, = p.xpath(".//*[local-name()='description']")
        name = find_borough_name(desc.text)
        result.append(parse_coords(p, name))
    return '\n'.join(result)


# #### Parse the location data

# In[27]:

geodata = parse_kml(placemarks)

# In[28]:

geodata[:geodata.find('\n')]

# #### Inspect our data with pandas

# In[29]:

df = pd.read_csv(
    io.StringIO(geodata),
    names=['lon', 'lat', 'name']
).drop_duplicates().reset_index(drop=True)

# In[30]:

df.describe()

# In[31]:

import seaborn as sns

# In[ ]:

# FIX: the original call was a SyntaxError (missing comma after `title=...`),
# and its ylabel said "Horsepower" — a copy/paste slip from the mpg/hp
# example; this axis is latitude.
p = Scatter(
    df,
    x='lon',
    y='lat',
    color='name',
    title="NYC Borough Polygons",
    xlabel="Longitude",
    ylabel="Latitude"
)
output_notebook()
show(p)

# #### After we're satisfied, let's throw everything into postgres for later analysis

# In[33]:

boros = odo(
    odo(df, Temp(CSV)),
    'postgresql://localhost/nyc::boros'
)

# In[34]:

Data(boros).head(5)
# #### Make a table of `name`, `polygon` for each borough

# In[35]:

import sqlalchemy as sa  # FIX: `sa` is used below but was never imported

# Per-borough bounding box: min/max lon/lat folded into a Postgres `box`
# value labeled `boro_box`, one row per borough name.
bounds = sa.select([
    boros.c.name,
    f.box(
        f.point(f.min(boros.c.lon), f.min(boros.c.lat)),
        f.point(f.max(boros.c.lon), f.max(boros.c.lat))
    ).label('boro_box')
]).group_by(boros.c.name).alias()

# #### Get out the SQLAlchemy table

# In[36]:

data.head(5)

# In[37]:

nyc = compute(data).alias()

# #### Join on both the pickup and dropoff latitude and longitude being in the same borough

# In[38]:

# A trip is assigned to a borough when the line segment (`lseg`) from pickup
# to dropoff is contained in that borough's box — `<@` is Postgres's
# geometric "is contained in" operator.
joined = nyc.join(
    bounds,
    onclause=f.lseg(
        f.point(nyc.c.pickup_longitude, nyc.c.pickup_latitude),
        f.point(nyc.c.dropoff_longitude, nyc.c.dropoff_latitude)
    ).op('<@')(bounds.c.boro_box)
)

# #### Group By `name` and compute the average speed and the number of samples that went into the average

# In[39]:

sel = sa.select([
    bounds.c.name,
    nyc.c.avg_speed_in_mph,
    nyc.c.pickup_latitude,
    nyc.c.pickup_longitude,
    nyc.c.dropoff_latitude,
    nyc.c.dropoff_longitude
]).select_from(joined).alias()

# In[40]:

# Average speed plus the sample count behind each average, fastest first.
per_boro_speeds = sa.select([
    sel.c.name,
    f.avg(sel.c.avg_speed_in_mph).label('avg_speed'),
    f.count(sel.c.avg_speed_in_mph).label('nsamples')
]).group_by(sel.c.name).order_by(sa.desc('avg_speed'))

# In[41]:

psql(per_boro_speeds)

# In[42]:

speeds = odo(per_boro_speeds, pd.DataFrame)
speeds

# In[43]:

Image('i.jpg')