import pandas
from pandasql import sqldf

mysqldf = lambda q: sqldf(q, globals())

import os
import urllib2
import zipfile

url = "http://data.octo.dc.gov/feeds/bbl/bbl_current_csv.zip"
data_dir = "../data/"

current_dir = os.getcwd()
os.chdir(data_dir)

local_zipped = "bbl_current_csv.zip"
local_data = "bbl_current_csv.csv"

if not (os.path.isfile(local_data) and os.path.isfile(local_zipped)):
  filename = os.path.join(os.getcwd(), local_zipped)
  urllib2.urlretrieve(url, local_zipped)

if not os.path.isfile(local_data): 
  with ZipFile(local_zipped, 'r') as zipdata:
    zipdata.extractall()

bbl = pandas.DataFrame.from_csv(local_data)

os.chdir(data_dir)

bbl.head()

mysqldf("select * from bbl where BUSINESSIMPROVEMENTDISTRICT = 'DOWNTOWN BID'")

mysqldf("select * from bbl where WARD = 6")

ward6 = mysqldf("select LICENSECATEGORY, count(*) as LICENSECOUNT from bbl where WARD = 6 group by LICENSECATEGORY")
ward6

%matplotlib inline
import matplotlib.pyplot as plt

plt.figure()

ward6.plot(kind='barh')

query = """
select BBL_LICENSE_FACT_ID as bbl_id, 
LATITUDE as lat, LONGITUDE as lon, 
WARD as ward, LICENSESTATUS as status, 
BUSINESSIMPROVEMENTDISTRICT as bid
from bbl
where LICENSESTATUS != 'CANCELLED'
and WARD != 'NA' 
and LATITUDE <> 0 
AND LONGITUDE <> 0"""

results = mysqldf(query)

results.head()

from ggplot import *

ggplot(aes(x='lat', y='lon', color='bid'), data=results) + \
        geom_point() + ggtitle("Business Improvement District Locations") + \
        xlab("Latitude") + ylab("Longitude") + facet_grid("ward", "status", scales="free_xy")