What borough has the fastest NYC taxi drivers?¶

In [ ]:

from __future__ import division

In [ ]:

%reload_ext autotime

In [ ]:

import re

import pandas as pd
import sqlalchemy as sa
from blaze import by, compute, transform, Data, sin, cos, atan2, sqrt, radians, summary, greatest, symbol
from odo import odo, drop, resource, Temp, CSV, S3

In [ ]:

# Extra keyword arguments forwarded to the database connection by the cells
# below (passed wherever `connect_args=connect_args` appears).
connect_args = {'sslmode': 'verify-ca'}

In [ ]:

# Blaze `Data` object for the `trip` table of the `dev` database, addressed
# through a `redshift://` URI on localhost:15439.
# NOTE(review): the local port is presumably a tunnel to the cluster -- confirm.
d = Data('redshift://cio@localhost:15439/dev::trip', connect_args=connect_args)

In [ ]:

d.count()

In [ ]:

def haversine_distance(start, stop, R=3959):
    """Compute the great-circle (haversine) distance between two points.

    Parameters
    ----------
    start, stop : sequence of two items
        ``(latitude, longitude)`` pairs expressed in degrees.  The items may
        be plain numbers or expressions accepted by the imported ``sin``,
        ``cos``, ``radians``, ``sqrt``, ``atan2`` and ``greatest`` functions.
    R : number, optional
        Radius of the sphere.  The result is in the same unit as `R`; the
        default of 3959 is approximately the Earth's radius in miles.

    Returns
    -------
    The distance between `start` and `stop` along the sphere's surface.
    """
    # http://andrew.hedges.name/experiments/haversine/
    lat1, lon1 = start
    lat2, lon2 = stop

    # Every angle must be converted to radians before it reaches a trig
    # function -- including the latitudes used in the cos() terms.
    phi1 = radians(lat1)
    phi2 = radians(lat2)
    half_dlat = radians(lat2 - lat1) / 2.0
    half_dlon = radians(lon2 - lon1) / 2.0

    a = sin(half_dlat) ** 2 + cos(phi1) * cos(phi2) * sin(half_dlon) ** 2

    # Clamp both radicands at zero so rounding error can never hand sqrt a
    # slightly negative number.
    return R * 2 * atan2(sqrt(greatest(a, 0.0)), sqrt(greatest(1.0 - a, 0.0)))

In [ ]:

# Distance between two opposite corners, given as (latitude, longitude), of
# the rectangle used below to delimit NYC.
corner_a = [40.477399, -74.259090]
corner_b = [40.917577, -73.700272]
max_distance = haversine_distance(start=corner_a, stop=corner_b)
max_distance

Reduce the data to the area defined as NYC¶

In [ ]:

# http://hafen.github.io/taxi/#reading-in-to-r

min_lat, max_lat = 40.477399, 40.917577
min_lon, max_lon = -74.259090, -73.700272

# One entry per sanity check a trip has to pass.
conditions = [
    d.trip_distance > 0,
    d.trip_distance <= max_distance,
    d.trip_time_in_secs > 0,
    d.passenger_count > 0,
    d.passenger_count <= 5,
    d.pickup_latitude >= min_lat,
    d.pickup_latitude <= max_lat,
    d.dropoff_latitude >= min_lat,
    d.dropoff_latitude <= max_lat,
    d.pickup_longitude >= min_lon,
    d.pickup_longitude <= max_lon,
    d.dropoff_longitude >= min_lon,
    d.dropoff_longitude <= max_lon,
]

# AND the checks together from left to right.
keep = conditions[0]
for condition in conditions[1:]:
    keep = keep & condition

valid = d[keep]

In [ ]:

valid

In [ ]:

# Straight-line distance between each trip's pickup and dropoff points.
pickup_point = [valid.pickup_latitude, valid.pickup_longitude]
dropoff_point = [valid.dropoff_latitude, valid.dropoff_longitude]
computed_distance = haversine_distance(start=pickup_point, stop=dropoff_point)

# Seconds -> hours, coerced to float64 first.
trip_time_in_hours = valid.trip_time_in_secs.coerce('float64') / 3600.0

# Add the derived columns: average speed comes from the recorded
# `trip_distance`, not from `computed_distance`.
derived_columns = dict(
    avg_speed_in_mph=valid.trip_distance / trip_time_in_hours,
    trip_time_in_hours=trip_time_in_hours,
)
calcd = transform(valid, **derived_columns)

In [ ]:

calcd

Assume that average speeds of 5 MPH or less, or above 120 MPH, are invalid¶

In [ ]:

# Keep trips whose average speed is above 5 and at most 120 mph, then narrow
# the result to the speed and the pickup/dropoff coordinates.
plausible_speed = (calcd.avg_speed_in_mph > 5) & (calcd.avg_speed_in_mph <= 120)
wanted_columns = [
    'avg_speed_in_mph',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
]
data = calcd[plausible_speed][wanted_columns]

In [ ]:

data

In [ ]:

print(compute(data))

A small aside, with `odo`¶

Let's look at the distribution of rides by borough¶

Q: Where do we get borough geolocation data?¶

A: From NYC's own API!¶

In [ ]:

import requests as r

In [ ]:

# Download the borough data as KML.  The timeout keeps the request from
# blocking indefinitely if the server stops responding.
resp = r.get(
    'https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm',
    params=dict(method='export', format='KML'),
    timeout=60,
)
# Fail here on an HTTP error status instead of handing an error page to the
# KML parser further down.
resp.raise_for_status()

In [ ]:

import lxml
import lxml.etree

Parse the KML with `lxml`¶

In [ ]:

# Parse the raw response bytes so lxml decodes the document itself, rather
# than depending on the charset requests guessed when building `resp.text`.
kml = lxml.etree.fromstring(resp.content)

In [ ]:

print(resp.text)

`Placemark` elements delineate boroughs¶

In [ ]:

# Collect every element whose local name is `Placemark`, anywhere in the
# document; matching on local-name() ignores the element's namespace.
placemarks = kml.xpath("//*[local-name()='Placemark']")

Create some functions to pull out the borough names and points¶

In [ ]:

from toolz.curried import map
from toolz import pipe, concat

import io


def find_borough_name(text):
    """Extract an NYC borough name from `text`.

    Searches `text` for one of Brooklyn, Manhattan, Bronx, Staten Island or
    Queens.  Because the pattern starts with a greedy ``.*``, when several
    names share a line the last one on that line is returned.

    Parameters
    ----------
    text : str
        Text to search.

    Returns
    -------
    str
        The matched borough name.

    Raises
    ------
    ValueError
        If `text` contains none of the five borough names.
    """
    match = re.search(r'.*(Brooklyn|Manhattan|Bronx|Staten Island|Queens).*', text)
    if match is None:
        # Without this check the failure would surface as an opaque
        # AttributeError on None.
        raise ValueError('no borough name found in %r' % (text,))
    return match.group(1)


def parse_coords(p, name):
    """Render the coordinates found under `p` as ``<coordinate>,<name>`` lines.

    Each text node of a ``coordinates`` element below `p` is split on
    whitespace; every resulting token gets ``,<name>`` appended.  The tokens
    of one text node are joined with newlines, and the per-node results are
    joined with newlines again.

    Parameters
    ----------
    p : element providing ``xpath``
        The node to search.
    name : str
        Label appended to every coordinate token.

    Returns
    -------
    str
        The newline-separated records.
    """
    coordinate_texts = p.xpath('.//*[local-name()="coordinates"]/text()')
    blocks = []
    for text in coordinate_texts:
        rows = ['%s,%s' % (token, name) for token in str.split(text)]
        blocks.append('\n'.join(rows))
    return '\n'.join(blocks)


def parse_kml(placemarks):
    """Turn placemark elements into newline-separated coordinate records.

    For each placemark, the borough name is read from the text of its
    ``description`` element (via ``find_borough_name``) and passed to
    ``parse_coords`` together with the placemark.  Each placemark must have
    exactly one ``description`` descendant; the single-element unpacking
    raises ``ValueError`` otherwise.

    Parameters
    ----------
    placemarks : iterable of elements providing ``xpath``

    Returns
    -------
    str
        The ``parse_coords`` output of every placemark, joined by newlines.
    """
    def records_for(placemark):
        [description] = placemark.xpath(".//*[local-name()='description']")
        borough = find_borough_name(description.text)
        return parse_coords(placemark, borough)

    return '\n'.join([records_for(placemark) for placemark in placemarks])

Parse the location data¶

In [ ]:

geodata = parse_kml(placemarks)
# Show only the first record.  partition() yields the whole string when there
# is no newline, whereas slicing up to find()'s -1 would drop the last
# character.
geodata.partition('\n')[0]

Inspect our data with pandas¶

In [ ]:

# Read the generated records as headerless CSV with columns lon, lat, name,
# then drop duplicate rows and renumber the index from zero.
raw_points = pd.read_csv(io.StringIO(geodata), names=['lon', 'lat', 'name'])
df = raw_points.drop_duplicates().reset_index(drop=True)

In [ ]:

df.head()

In [ ]:

df.describe()

After we're satisfied, let's throw everything into redshift for later analysis¶

In [ ]:

# Two-step load: the inner odo() call converts `df` into a Temp(S3(CSV))
# target, and the outer call loads that result into the `boros` table of the
# `dev` database, using the same connection arguments as the trip data.
boros = odo(
    odo(df, Temp(S3(CSV))),
    'redshift://cio@localhost:15439/dev::boros',
    connect_args=connect_args
)

In [ ]:

len(df)

Compute the bounding box for each borough¶

In [ ]:

# Per-borough bounding box: the minimum and maximum longitude/latitude over
# all of a borough's points, grouped by borough name.
bounding_box_columns = [
    boros.c.name,
    sa.func.min(boros.c.lon).label('min_lon'),
    sa.func.min(boros.c.lat).label('min_lat'),
    sa.func.max(boros.c.lon).label('max_lon'),
    sa.func.max(boros.c.lat).label('max_lat'),
]
bounds = sa.select(bounding_box_columns).group_by(boros.c.name).alias()

In [ ]:

print(bounds)

Get out the SQLAlchemy table when blaze isn't enough¶

In [ ]:

data

In [ ]:

# Compute the blaze expression and alias the result so its columns can be
# referenced through `nyc.c` in the join below.
# NOTE(review): assumes compute() returns a SQLAlchemy selectable here -- confirm.
nyc = compute(data).alias()

In [ ]:

# Join trips to borough bounding boxes.  A trip matches a borough only when
# BOTH its pickup point and its dropoff point lie inside that borough's
# min/max longitude and latitude.
# NOTE(review): the boxes are plain rectangles, so boxes of adjacent boroughs
# may overlap and a single trip could then join to more than one borough --
# confirm this is acceptable for the averages computed later.
joined = nyc.join(
    bounds,
    onclause=(
        (nyc.c.pickup_longitude >= bounds.c.min_lon) &
        (nyc.c.pickup_longitude <= bounds.c.max_lon) &
        (nyc.c.pickup_latitude >= bounds.c.min_lat) &
        (nyc.c.pickup_latitude <= bounds.c.max_lat) &
        (nyc.c.dropoff_longitude >= bounds.c.min_lon) &
        (nyc.c.dropoff_longitude <= bounds.c.max_lon) &
        (nyc.c.dropoff_latitude >= bounds.c.min_lat) &
        (nyc.c.dropoff_latitude <= bounds.c.max_lat)
    )
)

In [ ]:

# Borough name plus the trip's speed and coordinates, selected from the join
# and aliased so it can serve as a subquery.
selected_columns = [
    bounds.c.name,
    nyc.c.avg_speed_in_mph,
    nyc.c.pickup_latitude,
    nyc.c.pickup_longitude,
    nyc.c.dropoff_latitude,
    nyc.c.dropoff_longitude,
]
sel = sa.select(selected_columns).select_from(joined).alias()

In [ ]:

# For every borough name: the mean of the per-trip average speeds and the
# number of trips that contributed to it.
trip_speed = sel.c.avg_speed_in_mph
per_boro_speeds = sa.select([
    sel.c.name,
    sa.func.avg(trip_speed).label('avg_speed'),
    sa.func.count(trip_speed).label('nsamples'),
]).group_by(sel.c.name)

In [ ]:

import sqlparse

In [ ]:

print(sqlparse.format(str(per_boro_speeds), reindent=True))

In [ ]:

# Sort boroughs from fastest to slowest average speed and pull the result
# into a pandas DataFrame.
fastest_first = per_boro_speeds.order_by(sa.desc(per_boro_speeds.c.avg_speed))
speeds = odo(fastest_first, pd.DataFrame, connect_args=connect_args)

In [ ]:

speeds

What borough has the fastest NYC taxi drivers?¶

Reduce the data to the area defined as NYC¶

Assume that > 120 MPH is invalid¶

A small aside, with odo¶

Let's look at the distribution of rides by borough¶

Q: Where do we get borough geolocation data?¶

A: From NYC's own API!¶

Parse the KML with lxml¶

Placemark elements delineate boroughs¶

Create some functions to pull out the borough names and points¶

Parse the location data¶

Inspect our data with pandas¶

After we're satisfied, let's throw everything into redshift for later analysis¶

Compute the bounding box for each borough¶

Get out the SQLAlchemy table when blaze isn't enough¶

A small aside, with `odo`¶

Parse the KML with `lxml`¶

`Placemark` elements delineate boroughs¶