import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('/Users/danielforsyth/Desktop/bikes.csv')
df.head()
DC_NUM | DC_KEY | LOCATION_B | THEFT_DATE | THEFT_YEAR | DC_DIST | STOLEN_VAL | THEFT_HOUR | UCR | LAT | LNG | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14848 | 2.010010e+11 | 2100 BLOCK S BROAD ST | 4/16/2010 | 2010 | 1 | 150 | 0 | 625 | 39.924273 | -75.169775 |
1 | 16330 | 2.010010e+11 | S 2100 MCKEAN ST | 4/24/2010 | 2010 | 1 | 215 | 17 | 615 | 39.927074 | -75.180905 |
2 | 23568 | 2.010010e+11 | 2700 BLOCK SNYDER AVE | 6/6/2010 | 2010 | 1 | 120 | 11 | 625 | 39.927120 | -75.191034 |
3 | 28556 | 2.010010e+11 | 2100 BLOCK S GARNET ST | 7/8/2010 | 2010 | 1 | 200 | 15 | 615 | 39.925411 | -75.178257 |
4 | 29047 | 2.010010e+11 | 2100 BLOCK S 15TH ST | 7/11/2010 | 2010 | 1 | 75 | 11 | 625 | 39.924141 | -75.171457 |
5 rows × 11 columns
df = df[['LOCATION_B','THEFT_DATE','THEFT_YEAR','THEFT_HOUR','STOLEN_VAL','UCR']]
df.columns = ['Location','Date','Year','Hour','Value','Value_Code']
df.head()
Location | Date | Year | Hour | Value | Value_Code | |
---|---|---|---|---|---|---|
0 | 2100 BLOCK S BROAD ST | 4/16/2010 | 2010 | 0 | 150 | 625 |
1 | S 2100 MCKEAN ST | 4/24/2010 | 2010 | 17 | 215 | 615 |
2 | 2700 BLOCK SNYDER AVE | 6/6/2010 | 2010 | 11 | 120 | 625 |
3 | 2100 BLOCK S GARNET ST | 7/8/2010 | 2010 | 15 | 200 | 615 |
4 | 2100 BLOCK S 15TH ST | 7/11/2010 | 2010 | 11 | 75 | 625 |
5 rows × 6 columns
pd.options.display.mpl_style = 'default'
from matplotlib import rcParams
rcParams['figure.figsize'] = (20, 8)
rcParams['figure.dpi'] = 300
df['Date'] = pd.to_datetime(pd.Series(df['Date']))
df.set_index('Date', drop=False, inplace=True)
df['Thefts'] = df['Date'].value_counts()
plt.figure()
plt.ylim(0,20)
df['Thefts'].plot(x_compat=True)
<matplotlib.axes.AxesSubplot at 0x1168429d0>
temp = pd.read_csv('/Users/danielforsyth/Desktop/climate.csv',
parse_dates={'Date': ['MO','DAY','YEAR']},
usecols=['MO', 'DAY', 'YEAR', 'HIGH'])
temp.head()
Date | HIGH | |
---|---|---|
0 | 2010-01-01 | 42 |
1 | 2010-01-02 | 32 |
2 | 2010-01-03 | 27 |
3 | 2010-01-04 | 32 |
4 | 2010-01-05 | 33 |
5 rows × 2 columns
temp.plot()
<matplotlib.axes.AxesSubplot at 0x11741f910>
#merge df and temp
merged = pd.merge(df, temp, on='Date', how='outer')
merged.columns = ['Location','Date','Year','Hour','Value','Value_Code','Thefts','Temp']
merged.head()
Location | Date | Year | Hour | Value | Value_Code | Thefts | Temp | |
---|---|---|---|---|---|---|---|---|
0 | 2100 BLOCK S BROAD ST | 2010-04-16 | 2010 | 0 | 150 | 625 | 4 | 75 |
1 | 7300 BLOCK CLARIDGE ST | 2010-04-16 | 2010 | 19 | 700 | 615 | 4 | 75 |
2 | 1000 BLOCK MARKET ST | 2010-04-16 | 2010 | 6 | 100 | 625 | 4 | 75 |
3 | 800 BLOCK S 48TH ST | 2010-04-16 | 2010 | 19 | 150 | 625 | 4 | 75 |
4 | S 2100 MCKEAN ST | 2010-04-24 | 2010 | 17 | 215 | 615 | 7 | 68 |
5 rows × 8 columns
new = merged.sort('Thefts', ascending=False)
new = new.dropna()
new.head()
Location | Date | Year | Hour | Value | Value_Code | Thefts | Temp | |
---|---|---|---|---|---|---|---|---|
6738 | 400 BLOCK N BROAD ST | 2013-08-15 | 2013 | 14 | 400 | 615 | 20 | 78 |
6749 | 1000 BLOCK S 45TH ST | 2013-08-15 | 2013 | 10 | 70 | 625 | 20 | 78 |
6748 | 200 BLOCK E ROCKLAND ST | 2013-08-15 | 2013 | 17 | 100 | 625 | 20 | 78 |
6747 | 1700 BLOCK N BROAD ST | 2013-08-15 | 2013 | 15 | 400 | 615 | 20 | 78 |
6742 | 2000 BLOCK CHANCELLOR ST | 2013-08-15 | 2013 | 21 | 200 | 615 | 20 | 78 |
5 rows × 8 columns
thefts = merged['Date'].value_counts()
thefts.head(10)
2013-08-15 20 2013-07-24 19 2011-06-15 19 2013-07-10 19 2013-08-14 19 2012-08-09 18 2013-07-13 18 2013-07-23 17 2010-08-26 17 2012-09-21 17 dtype: int64
#thefts per year 2013 , missing 3 months (october-december)
yearly = df.Year.value_counts()
yearly
2012 1975 2011 1848 2010 1831 2013 1569 dtype: int64
#Thefts per Month
df['Date'] = df['Date'].map(lambda x: str(x)[:2])
df['Date'] = df['Date'].map(lambda x: x.rstrip('/'))
monthly = df.Date.value_counts()
monthly
8 1182 7 1132 9 957 6 947 5 698 10 608 4 495 11 359 3 305 12 182 2 181 1 177 dtype: int64
df.Hour.value_counts()
17 618 18 598 16 593 19 574 20 495 15 478 14 456 12 448 13 447 21 369 11 324 22 319 10 293 9 250 23 201 8 186 0 144 1 113 7 98 2 71 3 48 4 38 5 31 6 31 dtype: int64
df.Value.value_counts()
200 821 100 636 300 595 500 482 150 477 400 413 250 292 600 254 0 192 350 175 1000 168 50 146 800 145 700 137 450 100 ... 405 1 397 1 374 1 442 1 269 1 446 1 337 1 333 1 335 1 515 1 313 1 305 1 289 1 327 1 243 1 Length: 396, dtype: int64
df.Value_Code.value_counts()
615 4887 625 2172 635 164 dtype: int64
loc = df.Location.value_counts()
loc.head(25)
1500 BLOCK MARKET ST 28 200 BLOCK S BROAD ST 27 200 BLOCK S 33RD ST 24 1700 BLOCK N 10TH ST 22 200 BLOCK S JUNIPER ST 21 3100 BLOCK MARKET ST 20 1900 BLOCK VINE ST 20 200 BLOCK N 34TH ST 20 900 BLOCK SOUTH ST 19 1100 BLOCK FILBERT ST 18 3700 BLOCK WALNUT ST 18 1800 BLOCK WALNUT ST 18 3400 BLOCK SPRUCE ST 17 200 BLOCK S 34TH ST 17 1600 BLOCK CHESTNUT ST 17 1100 BLOCK S BROAD ST 17 3900 BLOCK WALNUT ST 17 3900 BLOCK LOCUST WALK 16 1800 BLOCK N 10TH ST 15 1600 BLOCK WALNUT ST 15 1300 BLOCK CECIL B MOORE AV 15 3100 BLOCK LUDLOW ST 15 2000 BLOCK N BROAD ST 14 200 BLOCK S 36TH ST 14 4000 BLOCK WALNUT ST 14 dtype: int64
value = df[['Location','Value_Code']]
value.head()
Location | Value_Code | |
---|---|---|
0 | 2100 BLOCK S BROAD ST | 625 |
1 | S 2100 MCKEAN ST | 615 |
2 | 2700 BLOCK SNYDER AVE | 625 |
3 | 2100 BLOCK S GARNET ST | 615 |
4 | 2100 BLOCK S 15TH ST | 625 |
5 rows × 2 columns