#!/usr/bin/env python # coding: utf-8 # # What NYC boroughs should we avoid? # ## (Or, what borough has the fastest taxi drivers?) # In[1]: from __future__ import division # In[2]: import sqlparse def psql(query): print( sqlparse.format( str(query.compile( compile_kwargs=dict(literal_binds=True) )), reindent=True ) ) # In[46]: get_ipython().run_line_magic('reload_ext', 'autotime') get_ipython().run_line_magic('matplotlib', 'nbagg') from blaze import by, compute, transform, Data from odo import odo, drop, resource, Temp, CSV import matplotlib.pyplot as plt from matplotlib.patches import Rectangle from IPython.display import Image from sqlalchemy import func as f from bokeh.charts import Histogram from bokeh.charts import defaults, vplot, hplot, show, output_notebook output_notebook() # ## Load up NYC Taxi from Postgres # In[4]: d = Data('postgresql://localhost/nyc::bigsample') # In[5]: d.head(5) # ## What is the bounding box of NYC? # In[6]: from IPython.display import HTML, IFrame # In[7]: IFrame( 'https://www.maptechnica.com/us-city-boundary-map/city/New%20York/state/NY/cityid/3651000', width=800, height=600 ) # ## Postgres has some funky operators for doing cool things # In[8]: get_ipython().system("psql nyc -c 'select point(40.477399, -74.259090) <@> point(40.917577, -73.700272) as box_dist'") # In[9]: max_distance = d.data.bind.execute( sa.select([ f.point(40.477399, -74.259090).op('<@>')(f.point(40.917577, -73.700272)) ]) ).scalar() max_distance # ## Reduce the data to the area defined as NYC # In[10]: # http://hafen.github.io/taxi/#reading-in-to-r min_lat, max_lat = 40.477399, 40.917577 min_lon, max_lon = -74.259090, -73.700272 valid = d[ (d.trip_distance > 0) & (d.trip_distance <= max_distance) & (d.trip_time_in_secs > 0) & (d.passenger_count > 0) & (d.passenger_count <= 5) & (d.pickup_latitude >= min_lat) & (d.pickup_latitude <= max_lat) & (d.dropoff_latitude >= min_lat) & (d.dropoff_latitude <= max_lat) & (d.pickup_longitude >= min_lon) & 
(d.pickup_longitude <= max_lon) & (d.dropoff_longitude >= min_lon) & (d.dropoff_longitude <= max_lon) ] # In[11]: psql(compute(valid)) # ## Add a column containing the average speed in MPH # In[12]: trip_time_in_hours = valid.trip_time_in_secs.coerce('float64') / 3600.0 calcd = transform(valid, avg_speed_in_mph=valid.trip_distance / trip_time_in_hours) # In[13]: calcd.avg_speed_in_mph.head(5) # In[14]: s = odo(calcd.avg_speed_in_mph, pd.Series) # In[55]: show(Histogram( s.loc[s > 120], title='Average Speed in MPH', bins=100, xlabel='Average Speed in MPH', ylabel='Count' )) # In[ ]: p = Scatter(df, x='mpg', y='hp', color='cyl', title="HP vs MPG (shaded by CYL)", xlabel="Miles Per Gallon", ylabel="Horsepower") # ### Assume that > 120 MPH is invalid # In[16]: data = calcd[(calcd.avg_speed_in_mph > 5) & (calcd.avg_speed_in_mph <= 120)][ [ 'avg_speed_in_mph', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude' ] ] # In[17]: data.head(5) # In[18]: psql(compute(data)) # ## A small aside, with `odo` # ### Let's look at the distribution of rides by borough # # #### Q: Where do we get borough geolocation data? # #### A: From NYC's own API! 
# In[19]:

import requests
import re            # FIX: used by find_borough_name below but was never imported
import pandas as pd  # FIX: used below (pd.read_csv) but was never imported
from bokeh.charts import Scatter  # FIX: Scatter is used below but was not imported

# In[20]:


def get_kml(wifi):
    """Fetch the NYC borough-boundary KML.

    When ``wifi`` is true, download the KML from the NYC open-data API and
    cache it to ``cityofnewyork.kml``; otherwise read the cached copy.
    """
    if wifi:
        result = requests.get(
            'https://data.cityofnewyork.us/api/geospatial/tqmj-j8zm',
            params=dict(method='export', format='KML')
        ).text
        # FIX(style): name the handle `kml_file`, not `f`, so it cannot be
        # confused with the module-level `f` (sqlalchemy.func).
        with open('cityofnewyork.kml', 'wt') as kml_file:
            kml_file.write(result)
        return result
    else:
        with open('cityofnewyork.kml', 'rt') as kml_file:
            return kml_file.read()


# In[21]:

raw = get_kml(wifi=False)

# In[22]:

import lxml
import lxml.etree

# #### Parse the KML with `lxml`

# In[23]:

kml = lxml.etree.fromstring(raw.encode('utf8'))

# In[24]:

print(raw[:1000])

# #### `Placemark` elements delineate boroughs

# In[25]:

placemarks = kml.xpath("//*[local-name()='Placemark']")

# #### Create some functions to pull out the borough names and points

# In[26]:

from toolz.curried import map
from toolz import pipe, concat
import io


def find_borough_name(text):
    """Return the first borough name mentioned in *text*.

    Raises AttributeError if *text* names no borough (re.search returns None).
    """
    return re.search(
        r'.*(Brooklyn|Manhattan|Bronx|Staten Island|Queens).*', text
    ).group(1)


def parse_coords(p, name):
    """Turn a Placemark's coordinate text into 'lon,lat,name' CSV lines."""
    coords = p.xpath('.//*[local-name()="coordinates"]/text()')
    return pipe(
        coords,
        map(str.split),
        map(lambda x: '\n'.join(map(lambda y: '%s,%s' % (y, name), x))),
        '\n'.join
    )


def parse_kml(placemarks):
    """Convert all Placemarks into one CSV string of lon,lat,borough rows."""
    result = []
    for p in placemarks:
        desc, = p.xpath(".//*[local-name()='description']")
        name = find_borough_name(desc.text)
        result.append(parse_coords(p, name))
    return '\n'.join(result)


# #### Parse the location data

# In[27]:

geodata = parse_kml(placemarks)

# In[28]:

geodata[:geodata.find('\n')]

# #### Inspect our data with pandas

# In[29]:

df = pd.read_csv(
    io.StringIO(geodata),
    names=['lon', 'lat', 'name']
).drop_duplicates().reset_index(drop=True)

# In[30]:

df.describe()

# In[31]:

import seaborn as sns

# In[ ]:

# FIX: the original call was a SyntaxError (missing comma after `title=...`),
# and its ylabel said "Horsepower" — a copy/paste slip from the mpg/hp
# example; this axis is latitude.
p = Scatter(
    df,
    x='lon',
    y='lat',
    color='name',
    title="NYC Borough Polygons",
    xlabel="Longitude",
    ylabel="Latitude"
)
output_notebook()
show(p)

# #### After we're satisfied, let's throw everything into postgres for later analysis

# In[33]:

boros = odo(
    odo(df, Temp(CSV)),
    'postgresql://localhost/nyc::boros'
)

# In[34]:

Data(boros).head(5)
# #### Make a table of `name`, `polygon` for each borough

# In[35]:

import sqlalchemy as sa  # FIX: `sa` is used below but was never imported

# Per-borough bounding box: min/max lon/lat folded into a Postgres `box`
# value labeled `boro_box`, one row per borough name.
bounds = sa.select([
    boros.c.name,
    f.box(
        f.point(f.min(boros.c.lon), f.min(boros.c.lat)),
        f.point(f.max(boros.c.lon), f.max(boros.c.lat))
    ).label('boro_box')
]).group_by(boros.c.name).alias()

# #### Get out the SQLAlchemy table

# In[36]:

data.head(5)

# In[37]:

nyc = compute(data).alias()

# #### Join on both the pickup and dropoff latitude and longitude being in the same borough

# In[38]:

# A trip is assigned to a borough when the line segment (`lseg`) from pickup
# to dropoff is contained in that borough's box — `<@` is Postgres's
# geometric "is contained in" operator.
joined = nyc.join(
    bounds,
    onclause=f.lseg(
        f.point(nyc.c.pickup_longitude, nyc.c.pickup_latitude),
        f.point(nyc.c.dropoff_longitude, nyc.c.dropoff_latitude)
    ).op('<@')(bounds.c.boro_box)
)

# #### Group By `name` and compute the average speed and the number of samples that went into the average

# In[39]:

sel = sa.select([
    bounds.c.name,
    nyc.c.avg_speed_in_mph,
    nyc.c.pickup_latitude,
    nyc.c.pickup_longitude,
    nyc.c.dropoff_latitude,
    nyc.c.dropoff_longitude
]).select_from(joined).alias()

# In[40]:

# Average speed plus the sample count behind each average, fastest first.
per_boro_speeds = sa.select([
    sel.c.name,
    f.avg(sel.c.avg_speed_in_mph).label('avg_speed'),
    f.count(sel.c.avg_speed_in_mph).label('nsamples')
]).group_by(sel.c.name).order_by(sa.desc('avg_speed'))

# In[41]:

psql(per_boro_speeds)

# In[42]:

speeds = odo(per_boro_speeds, pd.DataFrame)
speeds

# In[43]:

Image('i.jpg')