This notebook explores a 3.9Gb CSV file containing NYC's 311 complaints since 2003. It's the most popular data set in NYC's open data portal.
This notebook is a primer on out-of-memory data analysis with SQLite and Pandas.
The dataset is too large to load into a Pandas dataframe. So, instead we'll perform out-of-memory aggregations with SQLite and load the result directly into a dataframe with Pandas' iotools. It's pretty easy to stream a CSV into SQLite, and SQLite requires no setup. The SQL query language is pretty intuitive coming from a Pandas mindset.
import plotly.tools as tls
# Embed a previously published Plotly figure (a preview of the graph this notebook builds)
tls.embed('https://plot.ly/~chris/7365')
import pandas as pd
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display
import plotly.plotly as py # interactive graphing
from plotly.graph_objs import Bar, Scatter, Marker, Layout
# Peek at the CSV schema.  Read the file once and reuse the frame:
# the original called pd.read_csv twice (once for head(), once for tail()),
# scanning the start of the 3.9 GB file a second time for no benefit.
preview = pd.read_csv('311_100M.csv', nrows=2)
display(preview.head())
display(preview.tail())
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | ... | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 29300358 | 11/16/2014 11:46:00 PM | 11/16/2014 11:46:00 PM | DSNY | BCC - Queens East | Derelict Vehicles | 14 Derelict Vehicles | Street | 11432 | 80-25 PARSONS BOULEVARD | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.719411 | -73.808882 | (40.719410639341916, -73.80888158860446) |
1 | 29299837 | 11/16/2014 02:24:35 AM | 11/16/2014 02:24:35 AM | DOB | Department of Buildings | Building/Use | Illegal Conversion Of Residential Building/Space | NaN | 10465 | 938 HUNTINGTON AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.827862 | -73.830641 | (40.827862046105416, -73.83064067165407) |
2 rows × 52 columns
Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | ... | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 29300358 | 11/16/2014 11:46:00 PM | 11/16/2014 11:46:00 PM | DSNY | BCC - Queens East | Derelict Vehicles | 14 Derelict Vehicles | Street | 11432 | 80-25 PARSONS BOULEVARD | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.719411 | -73.808882 | (40.719410639341916, -73.80888158860446) |
1 | 29299837 | 11/16/2014 02:24:35 AM | 11/16/2014 02:24:35 AM | DOB | Department of Buildings | Building/Use | Illegal Conversion Of Residential Building/Space | NaN | 10465 | 938 HUNTINGTON AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.827862 | -73.830641 | (40.827862046105416, -73.83064067165407) |
2 rows × 52 columns
!wc -l < 311_100M.csv # Number of lines in dataset
8281035
disk_engine = create_engine('sqlite:///311_8M.db') # Initializes database with filename 311_8M.db in current directory
start = dt.datetime.now()
chunksize = 20000
j = 0
index_start = 1
for df in pd.read_csv('311_100M.csv', chunksize=chunksize, iterator=True, encoding='utf-8'):
df = df.rename(columns={c: c.replace(' ', '') for c in df.columns}) # Remove spaces from columns
df['CreatedDate'] = pd.to_datetime(df['CreatedDate']) # Convert to datetimes
df['ClosedDate'] = pd.to_datetime(df['ClosedDate'])
df.index += index_start
# Remove the un-interesting columns
columns = ['Agency', 'CreatedDate', 'ClosedDate', 'ComplaintType', 'Descriptor',
'CreatedDate', 'ClosedDate', 'TimeToCompletion',
'City']
for c in df.columns:
if c not in columns:
df = df.drop(c, axis=1)
j+=1
print '{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j*chunksize)
df.to_sql('data', disk_engine, if_exists='append')
index_start = df.index[-1] + 1
//anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1164: DtypeWarning: Columns (17) have mixed types. Specify dtype option on import or set low_memory=False. //anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1164: DtypeWarning: Columns (8,46) have mixed types. Specify dtype option on import or set low_memory=False.
6 seconds: completed 20000 rows 12 seconds: completed 40000 rows 18 seconds: completed 60000 rows 24 seconds: completed 80000 rows 30 seconds: completed 100000 rows 37 seconds: completed 120000 rows 43 seconds: completed 140000 rows 49 seconds: completed 160000 rows 55 seconds: completed 180000 rows 62 seconds: completed 200000 rows 68 seconds: completed 220000 rows 74 seconds: completed 240000 rows 81 seconds: completed 260000 rows 87 seconds: completed 280000 rows 99 seconds: completed 300000 rows 108 seconds: completed 320000 rows 116 seconds: completed 340000 rows 123 seconds: completed 360000 rows 131 seconds: completed 380000 rows 138 seconds: completed 400000 rows 149 seconds: completed 420000 rows 158 seconds: completed 440000 rows 164 seconds: completed 460000 rows 171 seconds: completed 480000 rows 177 seconds: completed 500000 rows 184 seconds: completed 520000 rows 190 seconds: completed 540000 rows 198 seconds: completed 560000 rows 204 seconds: completed 580000 rows 210 seconds: completed 600000 rows 217 seconds: completed 620000 rows 223 seconds: completed 640000 rows 229 seconds: completed 660000 rows 235 seconds: completed 680000 rows 242 seconds: completed 700000 rows 248 seconds: completed 720000 rows 255 seconds: completed 740000 rows 261 seconds: completed 760000 rows 267 seconds: completed 780000 rows 274 seconds: completed 800000 rows 280 seconds: completed 820000 rows 287 seconds: completed 840000 rows 293 seconds: completed 860000 rows 300 seconds: completed 880000 rows 306 seconds: completed 900000 rows 312 seconds: completed 920000 rows 318 seconds: completed 940000 rows 325 seconds: completed 960000 rows 331 seconds: completed 980000 rows 337 seconds: completed 1000000 rows 344 seconds: completed 1020000 rows 350 seconds: completed 1040000 rows 356 seconds: completed 1060000 rows 362 seconds: completed 1080000 rows 369 seconds: completed 1100000 rows 376 seconds: completed 1120000 rows 383 seconds: completed 1140000 rows 390 seconds: 
completed 1160000 rows 398 seconds: completed 1180000 rows 405 seconds: completed 1200000 rows 412 seconds: completed 1220000 rows 419 seconds: completed 1240000 rows 426 seconds: completed 1260000 rows 434 seconds: completed 1280000 rows 441 seconds: completed 1300000 rows 448 seconds: completed 1320000 rows 456 seconds: completed 1340000 rows 463 seconds: completed 1360000 rows 470 seconds: completed 1380000 rows 477 seconds: completed 1400000 rows 485 seconds: completed 1420000 rows 492 seconds: completed 1440000 rows 499 seconds: completed 1460000 rows 506 seconds: completed 1480000 rows 514 seconds: completed 1500000 rows 521 seconds: completed 1520000 rows 528 seconds: completed 1540000 rows 536 seconds: completed 1560000 rows 543 seconds: completed 1580000 rows 551 seconds: completed 1600000 rows 558 seconds: completed 1620000 rows 565 seconds: completed 1640000 rows 573 seconds: completed 1660000 rows 580 seconds: completed 1680000 rows 588 seconds: completed 1700000 rows 596 seconds: completed 1720000 rows 603 seconds: completed 1740000 rows 610 seconds: completed 1760000 rows 618 seconds: completed 1780000 rows 625 seconds: completed 1800000 rows 633 seconds: completed 1820000 rows 640 seconds: completed 1840000 rows 648 seconds: completed 1860000 rows 655 seconds: completed 1880000 rows 663 seconds: completed 1900000 rows 670 seconds: completed 1920000 rows 678 seconds: completed 1940000 rows 685 seconds: completed 1960000 rows 693 seconds: completed 1980000 rows 700 seconds: completed 2000000 rows 708 seconds: completed 2020000 rows 716 seconds: completed 2040000 rows 723 seconds: completed 2060000 rows 731 seconds: completed 2080000 rows 738 seconds: completed 2100000 rows 746 seconds: completed 2120000 rows 753 seconds: completed 2140000 rows 760 seconds: completed 2160000 rows 768 seconds: completed 2180000 rows 775 seconds: completed 2200000 rows 782 seconds: completed 2220000 rows 790 seconds: completed 2240000 rows 797 seconds: completed 2260000 
rows 805 seconds: completed 2280000 rows 812 seconds: completed 2300000 rows 820 seconds: completed 2320000 rows 827 seconds: completed 2340000 rows 835 seconds: completed 2360000 rows 843 seconds: completed 2380000 rows 852 seconds: completed 2400000 rows 860 seconds: completed 2420000 rows 870 seconds: completed 2440000 rows 878 seconds: completed 2460000 rows 885 seconds: completed 2480000 rows 893 seconds: completed 2500000 rows 900 seconds: completed 2520000 rows 908 seconds: completed 2540000 rows 915 seconds: completed 2560000 rows 922 seconds: completed 2580000 rows 930 seconds: completed 2600000 rows 937 seconds: completed 2620000 rows 944 seconds: completed 2640000 rows 952 seconds: completed 2660000 rows 959 seconds: completed 2680000 rows 967 seconds: completed 2700000 rows 974 seconds: completed 2720000 rows 982 seconds: completed 2740000 rows 989 seconds: completed 2760000 rows 997 seconds: completed 2780000 rows 1004 seconds: completed 2800000 rows 1011 seconds: completed 2820000 rows 1019 seconds: completed 2840000 rows 1026 seconds: completed 2860000 rows 1034 seconds: completed 2880000 rows 1041 seconds: completed 2900000 rows 1049 seconds: completed 2920000 rows 1056 seconds: completed 2940000 rows 1064 seconds: completed 2960000 rows 1071 seconds: completed 2980000 rows 1079 seconds: completed 3000000 rows 1086 seconds: completed 3020000 rows 1093 seconds: completed 3040000 rows 1101 seconds: completed 3060000 rows 1108 seconds: completed 3080000 rows 1116 seconds: completed 3100000 rows 1123 seconds: completed 3120000 rows 1131 seconds: completed 3140000 rows 1138 seconds: completed 3160000 rows 1146 seconds: completed 3180000 rows 1153 seconds: completed 3200000 rows 1161 seconds: completed 3220000 rows 1168 seconds: completed 3240000 rows 1176 seconds: completed 3260000 rows 1183 seconds: completed 3280000 rows 1191 seconds: completed 3300000 rows 1199 seconds: completed 3320000 rows 1206 seconds: completed 3340000 rows 1214 seconds: 
completed 3360000 rows 1221 seconds: completed 3380000 rows 1229 seconds: completed 3400000 rows 1236 seconds: completed 3420000 rows 1244 seconds: completed 3440000 rows 1251 seconds: completed 3460000 rows 1259 seconds: completed 3480000 rows 1266 seconds: completed 3500000 rows 1274 seconds: completed 3520000 rows 1282 seconds: completed 3540000 rows 1289 seconds: completed 3560000 rows 1297 seconds: completed 3580000 rows 1304 seconds: completed 3600000 rows 1312 seconds: completed 3620000 rows 1319 seconds: completed 3640000 rows 1327 seconds: completed 3660000 rows 1334 seconds: completed 3680000 rows 1342 seconds: completed 3700000 rows 1350 seconds: completed 3720000 rows 1357 seconds: completed 3740000 rows 1364 seconds: completed 3760000 rows 1372 seconds: completed 3780000 rows 1379 seconds: completed 3800000 rows 1387 seconds: completed 3820000 rows 1394 seconds: completed 3840000 rows 1402 seconds: completed 3860000 rows 1409 seconds: completed 3880000 rows 1416 seconds: completed 3900000 rows 1424 seconds: completed 3920000 rows 1431 seconds: completed 3940000 rows 1439 seconds: completed 3960000 rows 1446 seconds: completed 3980000 rows 1454 seconds: completed 4000000 rows 1461 seconds: completed 4020000 rows 1468 seconds: completed 4040000 rows 1476 seconds: completed 4060000 rows 1484 seconds: completed 4080000 rows 1491 seconds: completed 4100000 rows 1498 seconds: completed 4120000 rows 1506 seconds: completed 4140000 rows 1513 seconds: completed 4160000 rows 1521 seconds: completed 4180000 rows 1528 seconds: completed 4200000 rows 1536 seconds: completed 4220000 rows 1543 seconds: completed 4240000 rows 1551 seconds: completed 4260000 rows 1558 seconds: completed 4280000 rows 1566 seconds: completed 4300000 rows 1573 seconds: completed 4320000 rows 1581 seconds: completed 4340000 rows 1588 seconds: completed 4360000 rows 1596 seconds: completed 4380000 rows 1603 seconds: completed 4400000 rows 1611 seconds: completed 4420000 rows 1618 seconds: 
completed 4440000 rows 1626 seconds: completed 4460000 rows 1634 seconds: completed 4480000 rows 1641 seconds: completed 4500000 rows 1649 seconds: completed 4520000 rows 1656 seconds: completed 4540000 rows 1664 seconds: completed 4560000 rows 1671 seconds: completed 4580000 rows 1679 seconds: completed 4600000 rows 1686 seconds: completed 4620000 rows 1694 seconds: completed 4640000 rows 1701 seconds: completed 4660000 rows 1709 seconds: completed 4680000 rows 1717 seconds: completed 4700000 rows 1724 seconds: completed 4720000 rows 1732 seconds: completed 4740000 rows 1739 seconds: completed 4760000 rows 1747 seconds: completed 4780000 rows 1754 seconds: completed 4800000 rows 1762 seconds: completed 4820000 rows 1769 seconds: completed 4840000 rows 1777 seconds: completed 4860000 rows 1785 seconds: completed 4880000 rows 1792 seconds: completed 4900000 rows 1800 seconds: completed 4920000 rows 1807 seconds: completed 4940000 rows 1815 seconds: completed 4960000 rows 1822 seconds: completed 4980000 rows 1830 seconds: completed 5000000 rows 1837 seconds: completed 5020000 rows 1845 seconds: completed 5040000 rows 1853 seconds: completed 5060000 rows 1860 seconds: completed 5080000 rows 1867 seconds: completed 5100000 rows 1875 seconds: completed 5120000 rows 1883 seconds: completed 5140000 rows 1890 seconds: completed 5160000 rows 1898 seconds: completed 5180000 rows 1905 seconds: completed 5200000 rows 1913 seconds: completed 5220000 rows 1920 seconds: completed 5240000 rows 1928 seconds: completed 5260000 rows 1935 seconds: completed 5280000 rows 1943 seconds: completed 5300000 rows 1950 seconds: completed 5320000 rows 1958 seconds: completed 5340000 rows 1965 seconds: completed 5360000 rows 1973 seconds: completed 5380000 rows 1980 seconds: completed 5400000 rows 1987 seconds: completed 5420000 rows 1995 seconds: completed 5440000 rows 2002 seconds: completed 5460000 rows 2010 seconds: completed 5480000 rows 2017 seconds: completed 5500000 rows 2025 seconds: 
completed 5520000 rows 2032 seconds: completed 5540000 rows 2040 seconds: completed 5560000 rows 2047 seconds: completed 5580000 rows 2055 seconds: completed 5600000 rows 2062 seconds: completed 5620000 rows 2070 seconds: completed 5640000 rows 2078 seconds: completed 5660000 rows 2085 seconds: completed 5680000 rows 2092 seconds: completed 5700000 rows 2099 seconds: completed 5720000 rows 2106 seconds: completed 5740000 rows 2113 seconds: completed 5760000 rows 2120 seconds: completed 5780000 rows 2127 seconds: completed 5800000 rows 2134 seconds: completed 5820000 rows 2141 seconds: completed 5840000 rows 2148 seconds: completed 5860000 rows 2155 seconds: completed 5880000 rows 2162 seconds: completed 5900000 rows 2169 seconds: completed 5920000 rows 2176 seconds: completed 5940000 rows 2183 seconds: completed 5960000 rows 2190 seconds: completed 5980000 rows 2197 seconds: completed 6000000 rows 2204 seconds: completed 6020000 rows 2211 seconds: completed 6040000 rows 2218 seconds: completed 6060000 rows 2225 seconds: completed 6080000 rows 2232 seconds: completed 6100000 rows 2239 seconds: completed 6120000 rows 2246 seconds: completed 6140000 rows 2252 seconds: completed 6160000 rows 2259 seconds: completed 6180000 rows 2266 seconds: completed 6200000 rows 2274 seconds: completed 6220000 rows 2281 seconds: completed 6240000 rows 2288 seconds: completed 6260000 rows 2296 seconds: completed 6280000 rows 2303 seconds: completed 6300000 rows 2311 seconds: completed 6320000 rows 2318 seconds: completed 6340000 rows 2326 seconds: completed 6360000 rows 2333 seconds: completed 6380000 rows 2341 seconds: completed 6400000 rows 2348 seconds: completed 6420000 rows 2356 seconds: completed 6440000 rows 2363 seconds: completed 6460000 rows 2371 seconds: completed 6480000 rows 2378 seconds: completed 6500000 rows 2386 seconds: completed 6520000 rows 2393 seconds: completed 6540000 rows 2401 seconds: completed 6560000 rows 2409 seconds: completed 6580000 rows 2417 seconds: 
completed 6600000 rows 2424 seconds: completed 6620000 rows 2432 seconds: completed 6640000 rows 2440 seconds: completed 6660000 rows 2448 seconds: completed 6680000 rows 2456 seconds: completed 6700000 rows 2463 seconds: completed 6720000 rows 2471 seconds: completed 6740000 rows 2478 seconds: completed 6760000 rows 2486 seconds: completed 6780000 rows 2493 seconds: completed 6800000 rows 2501 seconds: completed 6820000 rows 2508 seconds: completed 6840000 rows 2516 seconds: completed 6860000 rows 2523 seconds: completed 6880000 rows 2531 seconds: completed 6900000 rows 2538 seconds: completed 6920000 rows 2546 seconds: completed 6940000 rows 2554 seconds: completed 6960000 rows 2561 seconds: completed 6980000 rows 2568 seconds: completed 7000000 rows 2576 seconds: completed 7020000 rows 2583 seconds: completed 7040000 rows 2591 seconds: completed 7060000 rows 2599 seconds: completed 7080000 rows 2606 seconds: completed 7100000 rows 2614 seconds: completed 7120000 rows 2621 seconds: completed 7140000 rows 2629 seconds: completed 7160000 rows 2636 seconds: completed 7180000 rows 2643 seconds: completed 7200000 rows 2651 seconds: completed 7220000 rows 2658 seconds: completed 7240000 rows 2666 seconds: completed 7260000 rows 2673 seconds: completed 7280000 rows 2681 seconds: completed 7300000 rows 2688 seconds: completed 7320000 rows 2696 seconds: completed 7340000 rows 2703 seconds: completed 7360000 rows 2711 seconds: completed 7380000 rows 2718 seconds: completed 7400000 rows 2726 seconds: completed 7420000 rows 2733 seconds: completed 7440000 rows 2740 seconds: completed 7460000 rows 2748 seconds: completed 7480000 rows 2756 seconds: completed 7500000 rows 2763 seconds: completed 7520000 rows 2770 seconds: completed 7540000 rows 2778 seconds: completed 7560000 rows 2785 seconds: completed 7580000 rows 2792 seconds: completed 7600000 rows 2800 seconds: completed 7620000 rows 2807 seconds: completed 7640000 rows 2815 seconds: completed 7660000 rows 2822 seconds: 
completed 7680000 rows 2830 seconds: completed 7700000 rows 2837 seconds: completed 7720000 rows 2845 seconds: completed 7740000 rows 2852 seconds: completed 7760000 rows 2860 seconds: completed 7780000 rows 2867 seconds: completed 7800000 rows 2875 seconds: completed 7820000 rows 2882 seconds: completed 7840000 rows 2889 seconds: completed 7860000 rows 2897 seconds: completed 7880000 rows 2904 seconds: completed 7900000 rows 2912 seconds: completed 7920000 rows 2919 seconds: completed 7940000 rows 2927 seconds: completed 7960000 rows 2934 seconds: completed 7980000 rows 2942 seconds: completed 8000000 rows 2949 seconds: completed 8020000 rows 2957 seconds: completed 8040000 rows 2964 seconds: completed 8060000 rows 2972 seconds: completed 8080000 rows 2979 seconds: completed 8100000 rows 2987 seconds: completed 8120000 rows 2994 seconds: completed 8140000 rows 3002 seconds: completed 8160000 rows 3009 seconds: completed 8180000 rows 3017 seconds: completed 8200000 rows 3024 seconds: completed 8220000 rows 3031 seconds: completed 8240000 rows 3038 seconds: completed 8260000 rows 3045 seconds: completed 8280000 rows 3047 seconds: completed 8300000 rows
# Sanity-check the load: pull the first 3 rows back out of the SQLite table
df = pd.read_sql_query('SELECT * FROM data LIMIT 3', disk_engine)
df.head()
index | CreatedDate | ClosedDate | Agency | ComplaintType | Descriptor | City | |
---|---|---|---|---|---|---|---|
0 | 1 | 2014-11-16 23:46:00.000000 | 2014-11-16 23:46:00.000000 | DSNY | Derelict Vehicles | 14 Derelict Vehicles | Jamaica |
1 | 2 | 2014-11-16 02:24:35.000000 | 2014-11-16 02:24:35.000000 | DOB | Building/Use | Illegal Conversion Of Residential Building/Space | BRONX |
2 | 3 | 2014-11-16 02:17:12.000000 | 2014-11-16 02:50:48.000000 | NYPD | Illegal Parking | Blocked Sidewalk | BROOKLYN |
# Select just a subset of columns directly in SQL
df = pd.read_sql_query('SELECT Agency, Descriptor FROM data LIMIT 3', disk_engine)
df.head()
Agency | Descriptor | |
---|---|---|
0 | DSNY | 14 Derelict Vehicles |
1 | DOB | Illegal Conversion Of Residential Building/Space |
2 | NYPD | Blocked Sidewalk |
LIMIT
the number of rows that are retrieved¶df = pd.read_sql_query('SELECT ComplaintType, Descriptor, Agency '
'FROM data '
'LIMIT 10', disk_engine)
df
ComplaintType | Descriptor | Agency | |
---|---|---|---|
0 | Derelict Vehicles | 14 Derelict Vehicles | DSNY |
1 | Building/Use | Illegal Conversion Of Residential Building/Space | DOB |
2 | Illegal Parking | Blocked Sidewalk | NYPD |
3 | Noise - Street/Sidewalk | Loud Music/Party | NYPD |
4 | Illegal Parking | Commercial Overnight Parking | NYPD |
5 | Noise - Street/Sidewalk | Loud Talking | NYPD |
6 | Traffic | Congestion/Gridlock | NYPD |
7 | Noise - Commercial | Loud Music/Party | NYPD |
8 | Noise - Commercial | Loud Music/Party | NYPD |
9 | Noise - Commercial | Loud Music/Party | NYPD |
WHERE
¶df = pd.read_sql_query('SELECT ComplaintType, Descriptor, Agency '
'FROM data '
'WHERE Agency = "NYPD" '
'LIMIT 10', disk_engine)
df.head()
ComplaintType | Descriptor | Agency | |
---|---|---|---|
0 | Illegal Parking | Blocked Sidewalk | NYPD |
1 | Noise - Street/Sidewalk | Loud Music/Party | NYPD |
2 | Illegal Parking | Commercial Overnight Parking | NYPD |
3 | Noise - Street/Sidewalk | Loud Talking | NYPD |
4 | Traffic | Congestion/Gridlock | NYPD |
WHERE
and IN
¶df = pd.read_sql_query('SELECT ComplaintType, Descriptor, Agency '
'FROM data '
'WHERE Agency IN ("NYPD", "DOB")'
'LIMIT 10', disk_engine)
df.head()
ComplaintType | Descriptor | Agency | |
---|---|---|---|
0 | Building/Use | Illegal Conversion Of Residential Building/Space | DOB |
1 | Illegal Parking | Blocked Sidewalk | NYPD |
2 | Noise - Street/Sidewalk | Loud Music/Party | NYPD |
3 | Illegal Parking | Commercial Overnight Parking | NYPD |
4 | Noise - Street/Sidewalk | Loud Talking | NYPD |
DISTINCT
¶df = pd.read_sql_query('SELECT DISTINCT City FROM data', disk_engine)
df.head()
City | |
---|---|
0 | Jamaica |
1 | BRONX |
2 | BROOKLYN |
3 | NEW YORK |
4 | STATEN ISLAND |
COUNT(*)
and GROUP BY
¶df = pd.read_sql_query('SELECT Agency, COUNT(*) as `num_complaints`'
'FROM data '
'GROUP BY Agency ', disk_engine)
df.head()
Agency | num_complaints | |
---|---|---|
0 | 3-1-1 | 22029 |
1 | ACS | 2 |
2 | AJC | 2 |
3 | ART | 3 |
4 | CAU | 7 |
ORDER
and -
¶Housing and Development Dept receives the most complaints
# Count complaints per agency, most-complained-about first.
# 'ORDER BY -num_complaints' sorts by the negated count, i.e. descending -
# a SQLite-friendly shorthand for 'ORDER BY num_complaints DESC'.
df = pd.read_sql_query('SELECT Agency, COUNT(*) as `num_complaints`'
'FROM data '
'GROUP BY Agency '
'ORDER BY -num_complaints', disk_engine)
py.iplot([Bar(x=df.Agency, y=df.num_complaints)], filename='311/most common complaints by agency')
# Same descending count, now grouped by complaint type; keep the ranking around
# for reuse further down the notebook.
df = pd.read_sql_query('SELECT ComplaintType, COUNT(*) as `num_complaints`, Agency '
'FROM data '
'GROUP BY `ComplaintType` '
'ORDER BY -num_complaints', disk_engine)
most_common_complaints = df # used later
py.iplot({
'data': [Bar(x=df['ComplaintType'], y=df.num_complaints)],
'layout': {
'margin': {'b': 150}, # Make the bottom margin a bit bigger to handle the long text
'xaxis': {'tickangle': 40}} # Angle the labels a bit
}, filename='311/most common complaints by complaint type')
This graph is interactive. Click-and-drag horizontally to zoom, shift-click to pan, double click to autoscale
First, let's see how many cities are recorded in the dataset
len(pd.read_sql_query('SELECT DISTINCT City FROM data', disk_engine))
1758
Yikes - let's just plot the 10 most complained about cities
# Top 10 cities by complaint count.  Note the grouping here is still
# case-sensitive, so 'JAMAICA' and 'Jamaica' are counted separately.
df = pd.read_sql_query('SELECT City, COUNT(*) as `num_complaints` '
'FROM data '
'GROUP BY `City` '
'ORDER BY -num_complaints '
'LIMIT 10 ', disk_engine)
df
City | num_complaints | |
---|---|---|
0 | BROOKLYN | 2441941 |
1 | NEW YORK | 1544421 |
2 | BRONX | 1470746 |
3 | None | 654158 |
4 | STATEN ISLAND | 408095 |
5 | JAMAICA | 141940 |
6 | FLUSHING | 112519 |
7 | ASTORIA | 86051 |
8 | RIDGEWOOD | 63400 |
9 | Jamaica | 54876 |
Flushing and FLUSHING, Jamaica and JAMAICA... the city names are stored case-sensitively, so the same city gets counted under several spellings.
GROUP BY
with COLLATE NOCASE
# COLLATE NOCASE makes the GROUP BY case-insensitive, merging e.g. 'JAMAICA'
# and 'Jamaica' into a single bucket.  LIMIT 11 because the NULL city still
# occupies one slot.
¶df = pd.read_sql_query('SELECT City, COUNT(*) as `num_complaints` '
'FROM data '
'GROUP BY `City` '
'COLLATE NOCASE '
'ORDER BY -num_complaints '
'LIMIT 11 ', disk_engine)
df
City | num_complaints | |
---|---|---|
0 | BROOKLYN | 2441941 |
1 | NEW YORK | 1544423 |
2 | BRONX | 1470746 |
3 | None | 654158 |
4 | STATEN ISLAND | 408095 |
5 | JAMAICA | 196816 |
6 | FLUSHING | 149625 |
7 | ASTORIA | 116103 |
8 | RIDGEWOOD | 86237 |
9 | WOODSIDE | 60148 |
10 | FAR ROCKAWAY | 59552 |
cities = list(df.City)
# Rows with no recorded city come back as None; drop that entry if present.
# (Unconditional list.remove raises ValueError when the value is absent.)
if None in cities:
    cities.remove(None)
traces = []  # the series in the graph - one trace for each city
for city in cities:
    # Bind the city value as a SQL parameter instead of interpolating it into
    # the query string with .format - avoids quoting/injection problems.
    df = pd.read_sql_query('SELECT ComplaintType, COUNT(*) as `num_complaints` '
                           'FROM data '
                           'WHERE City = :city COLLATE NOCASE '
                           'GROUP BY `ComplaintType` '
                           'ORDER BY -num_complaints', disk_engine,
                           params={'city': city})
    traces.append(Bar(x=df['ComplaintType'], y=df.num_complaints, name=city.capitalize()))
py.iplot({'data': traces, 'layout': Layout(barmode='stack', xaxis={'tickangle': 40}, margin={'b': 150})}, filename='311/complaints by city stacked')
You can also click
on the legend entries to hide/show the traces. Click-and-drag to zoom in and shift-drag to pan.
Now let's normalize these counts. This is super easy now that this data has been reduced into a dataframe.
# Convert each city's raw counts into percentages of that city's total,
# so cities of very different sizes can be compared on one axis.
for trace in traces:
trace['y'] = 100.*trace['y']/sum(trace['y'])
py.iplot({'data': traces,
'layout': Layout(
barmode='group',
xaxis={'tickangle': 40, 'autorange': False, 'range': [-0.5, 16]},
yaxis={'title': 'Percent of Complaints by City'},
margin={'b': 150},
title='Relative Number of 311 Complaints by City')
}, filename='311/relative complaints by city', validate=False)
Click and drag to pan across the graph and see more of the complaints.
YYYY-MM-DD hh:mm:ss
# Dates were stored as ISO-8601 text ('YYYY-MM-DD hh:mm:ss.ffffff'), so plain
# string comparison in the WHERE clause filters chronologically.
¶df = pd.read_sql_query('SELECT ComplaintType, CreatedDate, City '
'FROM data '
'WHERE CreatedDate < "2014-11-16 23:47:00" '
'AND CreatedDate > "2014-11-16 23:45:00"', disk_engine)
df
ComplaintType | CreatedDate | City | |
---|---|---|---|
0 | Derelict Vehicles | 2014-11-16 23:46:00.000000 | Jamaica |
strftime
¶df = pd.read_sql_query('SELECT CreatedDate, '
'strftime(\'%H\', CreatedDate) as hour, '
'ComplaintType '
'FROM data '
'LIMIT 5 ', disk_engine)
df.head()
CreatedDate | hour | ComplaintType | |
---|---|---|---|
0 | 2014-11-16 23:46:00.000000 | 23 | Derelict Vehicles |
1 | 2014-11-16 02:24:35.000000 | 02 | Building/Use |
2 | 2014-11-16 02:17:12.000000 | 02 | Illegal Parking |
3 | 2014-11-16 02:15:13.000000 | 02 | Noise - Street/Sidewalk |
4 | 2014-11-16 02:14:01.000000 | 02 | Illegal Parking |
strftime
, GROUP BY
, and count(*)
# Extract the hour with strftime('%H', ...) and count complaints per hour.
# NOTE(review): the CreatedDate column in the result is just an arbitrary
# representative row SQLite picks from each hour group - only `hour` and the
# count are meaningful here.
¶df = pd.read_sql_query('SELECT CreatedDate, '
'strftime(\'%H\', CreatedDate) as hour, '
'count(*) as `Complaints per Hour`'
'FROM data '
'GROUP BY hour', disk_engine)
df.head()
CreatedDate | hour | Complaints per Hour | |
---|---|---|---|
0 | 2003-02-26 00:47:27.000000 | 00 | 3178595 |
1 | 2003-02-26 01:36:31.000000 | 01 | 71993 |
2 | 2003-03-04 02:00:46.000000 | 02 | 56362 |
3 | 2003-02-25 03:07:01.000000 | 03 | 33396 |
4 | 2003-03-04 04:32:11.000000 | 04 | 30434 |
py.iplot({
'data': [Bar(x=df['hour'], y=df['Complaints per Hour'])],
'layout': Layout(xaxis={'title': 'Hour in Day'},
yaxis={'title': 'Number of Complaints'})}, filename='311/complaints per hour')
df = pd.read_sql_query('SELECT CreatedDate, '
'strftime(\'%H\', CreatedDate) as `hour`, '
'count(*) as `Complaints per Hour`'
'FROM data '
'WHERE ComplaintType IN ("Noise", '
'"Noise - Street/Sidewalk", '
'"Noise - Commercial", '
'"Noise - Vehicle", '
'"Noise - Park", '
'"Noise - House of Worship", '
'"Noise - Helicopter", '
'"Collection Truck Noise") '
'GROUP BY hour', disk_engine)
display(df.head(n=2))
py.iplot({
'data': [Bar(x=df['hour'], y=df['Complaints per Hour'])],
'layout': Layout(xaxis={'title': 'Hour in Day'},
yaxis={'title': 'Number of Complaints'},
title='Number of Noise Complaints in NYC by Hour in Day'
)}, filename='311/noise complaints per hour')
CreatedDate | hour | Complaints per Hour | |
---|---|---|---|
0 | 2004-08-19 00:54:43.000000 | 00 | 41373 |
1 | 2008-08-29 01:07:39.000000 | 01 | 34588 |
complaint_traces = {} # Each series in the graph will represent a complaint
complaint_traces['Other'] = {} # catch-all for everything outside the top 7

# Hour strings in the DB are zero-padded '00'..'23'.  Loop over all 24 hours:
# the original iterated range(1, 24) and silently dropped the midnight ('00')
# bucket, which the hourly counts above show is actually the largest one.
for hour in range(24):
    hour_str = '%02d' % hour
    df = pd.read_sql_query('SELECT CreatedDate, '
                           'ComplaintType ,'
                           'strftime(\'%H\', CreatedDate) as `hour`, '
                           'COUNT(*) as num_complaints '
                           'FROM data '
                           'WHERE hour = "{}" '
                           'GROUP BY ComplaintType '
                           'ORDER BY -num_complaints'.format(hour_str), disk_engine)
    complaint_traces['Other'][hour] = sum(df.num_complaints)
    # Grab the 7 most common complaints for that hour; subtract them from
    # 'Other' so the stacked total still equals the full hourly count
    for i in range(7):
        # .loc replaces the long-deprecated DataFrame.get_value
        complaint = df.loc[i, 'ComplaintType']
        count = df.loc[i, 'num_complaints']
        complaint_traces['Other'][hour] -= count
        if complaint in complaint_traces:
            complaint_traces[complaint][hour] = count
        else:
            complaint_traces[complaint] = {hour: count}

traces = []
for complaint in complaint_traces:
    traces.append({
        # 24 hours, 0 through 23 (the original plotted a phantom 25th hour)
        'x': range(24),
        'y': [complaint_traces[complaint].get(i, None) for i in range(24)],
        'name': complaint,
        'type': 'bar'
    })

py.iplot({
    'data': traces,
    'layout': {
        'barmode': 'stack',
        'xaxis': {'title': 'Hour in Day'},
        'yaxis': {'title': 'Number of Complaints'},
        'title': 'The 7 Most Common 311 Complaints by Hour in a Day'
    }}, filename='311/most common complaints by hour')
First, create a new column with timestamps rounded to the previous 15 minute interval
minutes = 15
seconds = 15*60
# Round each timestamp down to its 15-minute bucket: integer-dividing the
# unix epoch seconds by 900 and multiplying back snaps to the interval start.
df = pd.read_sql_query('SELECT CreatedDate, '
'datetime(('
'strftime(\'%s\', CreatedDate) / {seconds}) * {seconds}, \'unixepoch\') interval '
'FROM data '
'LIMIT 10 '.format(seconds=seconds), disk_engine)
display(df.head())
CreatedDate | interval | |
---|---|---|
0 | 2014-11-16 23:46:00.000000 | 2014-11-16 23:45:00 |
1 | 2014-11-16 02:24:35.000000 | 2014-11-16 02:15:00 |
2 | 2014-11-16 02:17:12.000000 | 2014-11-16 02:15:00 |
3 | 2014-11-16 02:15:13.000000 | 2014-11-16 02:15:00 |
4 | 2014-11-16 02:14:01.000000 | 2014-11-16 02:00:00 |
Then, GROUP BY
that interval and COUNT(*)
minutes = 15
seconds = minutes*60
# Bucket every complaint into its 15-minute interval and count rows per
# bucket; ORDER BY interval keeps the buckets chronological for plotting.
df = pd.read_sql_query('SELECT datetime(('
'strftime(\'%s\', CreatedDate) / {seconds}) * {seconds}, \'unixepoch\') interval ,'
'COUNT(*) as "Complaints / interval"'
'FROM data '
'GROUP BY interval '
'ORDER BY interval '
'LIMIT 500'.format(seconds=seconds), disk_engine)
display(df.head())
display(df.tail())
interval | Complaints / interval | |
---|---|---|
0 | 2003-02-24 09:15:00 | 1 |
1 | 2003-02-24 09:30:00 | 2 |
2 | 2003-02-24 09:45:00 | 2 |
3 | 2003-02-24 10:00:00 | 2 |
4 | 2003-02-24 10:15:00 | 1 |
interval | Complaints / interval | |
---|---|---|
495 | 2003-03-13 07:30:00 | 2 |
496 | 2003-03-13 08:45:00 | 1 |
497 | 2003-03-13 09:00:00 | 1 |
498 | 2003-03-13 09:15:00 | 1 |
499 | 2003-03-13 09:30:00 | 2 |
py.iplot(
{
'data': [{
'x': df.interval,
'y': df['Complaints / interval'],
'type': 'bar'
}],
'layout': {
'title': 'Number of 311 Complaints per 15 Minutes'
}
}, filename='311/complaints per 15 minutes')
# Same bucketing trick with a 24-hour window: complaints per day since 2003
hours = 24
minutes = hours*60
seconds = minutes*60
df = pd.read_sql_query('SELECT datetime(('
'strftime(\'%s\', CreatedDate) / {seconds}) * {seconds}, \'unixepoch\') interval ,'
'COUNT(*) as "Complaints / interval"'
'FROM data '
'GROUP BY interval '
'ORDER BY interval'.format(seconds=seconds), disk_engine)
py.iplot(
{
'data': [{
'x': df.interval,
'y': df['Complaints / interval'],
'type': 'bar'
}],
'layout': {
'title': 'Number of 311 Complaints per Day'
}
}, filename='311/complaints per day')
from IPython.core.display import HTML
import urllib2
# Fetch and apply the notebook's custom stylesheet.
# (Python 2 only: urllib2 became urllib.request in Python 3.)
HTML(urllib2.urlopen('https://raw.githubusercontent.com/plotly/python-user-guide/css-updates/custom.css').read())