import pandas from pandasql import sqldf mysqldf = lambda q: sqldf(q, globals()) import os import urllib2 import zipfile url = "http://data.octo.dc.gov/feeds/bbl/bbl_current_csv.zip" data_dir = "../data/" current_dir = os.getcwd() os.chdir(data_dir) local_zipped = "bbl_current_csv.zip" local_data = "bbl_current_csv.csv" if not (os.path.isfile(local_data) and os.path.isfile(local_zipped)): filename = os.path.join(os.getcwd(), local_zipped) urllib2.urlretrieve(url, local_zipped) if not os.path.isfile(local_data): with ZipFile(local_zipped, 'r') as zipdata: zipdata.extractall() bbl = pandas.DataFrame.from_csv(local_data) os.chdir(data_dir) bbl.head() mysqldf("select * from bbl where BUSINESSIMPROVEMENTDISTRICT = 'DOWNTOWN BID'") mysqldf("select * from bbl where WARD = 6") ward6 = mysqldf("select LICENSECATEGORY, count(*) as LICENSECOUNT from bbl where WARD = 6 group by LICENSECATEGORY") ward6 %matplotlib inline import matplotlib.pyplot as plt plt.figure() ward6.plot(kind='barh') query = """ select BBL_LICENSE_FACT_ID as bbl_id, LATITUDE as lat, LONGITUDE as lon, WARD as ward, LICENSESTATUS as status, BUSINESSIMPROVEMENTDISTRICT as bid from bbl where LICENSESTATUS != 'CANCELLED' and WARD != 'NA' and LATITUDE <> 0 AND LONGITUDE <> 0""" results = mysqldf(query) results.head() from ggplot import * ggplot(aes(x='lat', y='lon', color='bid'), data=results) + \ geom_point() + ggtitle("Business Improvement District Locations") + \ xlab("Latitude") + ylab("Longitude") + facet_grid("ward", "status", scales="free_xy")