In [319]:

import pandas as pd
import matplotlib.pyplot as plt

In [320]:

df = pd.read_csv('/Users/danielforsyth/Desktop/bikes.csv')
df.head()

Out[320]:

	DC_NUM	DC_KEY	LOCATION_B	THEFT_DATE	THEFT_YEAR	DC_DIST	STOLEN_VAL	THEFT_HOUR	UCR	LAT	LNG
0	14848	2.010010e+11	2100 BLOCK S BROAD ST	4/16/2010	2010	1	150	0	625	39.924273	-75.169775
1	16330	2.010010e+11	S 2100 MCKEAN ST	4/24/2010	2010	1	215	17	615	39.927074	-75.180905
2	23568	2.010010e+11	2700 BLOCK SNYDER AVE	6/6/2010	2010	1	120	11	625	39.927120	-75.191034
3	28556	2.010010e+11	2100 BLOCK S GARNET ST	7/8/2010	2010	1	200	15	615	39.925411	-75.178257
4	29047	2.010010e+11	2100 BLOCK S 15TH ST	7/11/2010	2010	1	75	11	625	39.924141	-75.171457

5 rows × 11 columns

In [321]:

df = df[['LOCATION_B','THEFT_DATE','THEFT_YEAR','THEFT_HOUR','STOLEN_VAL','UCR']]
df.columns = ['Location','Date','Year','Hour','Value','Value_Code']
df.head()

Out[321]:

	Location	Date	Year	Hour	Value	Value_Code
0	2100 BLOCK S BROAD ST	4/16/2010	2010	0	150	625
1	S 2100 MCKEAN ST	4/24/2010	2010	17	215	615
2	2700 BLOCK SNYDER AVE	6/6/2010	2010	11	120	625
3	2100 BLOCK S GARNET ST	7/8/2010	2010	15	200	615
4	2100 BLOCK S 15TH ST	7/11/2010	2010	11	75	625

5 rows × 6 columns

In [258]:

pd.options.display.mpl_style = 'default'
from matplotlib import rcParams
rcParams['figure.figsize'] = (20, 8)
rcParams['figure.dpi'] = 300

In [259]:

df['Date'] = pd.to_datetime(pd.Series(df['Date']))
df.set_index('Date', drop=False, inplace=True)

df['Thefts'] = df['Date'].value_counts()

plt.figure()
plt.ylim(0,20)

df['Thefts'].plot(x_compat=True)

Out[259]:

<matplotlib.axes.AxesSubplot at 0x1168429d0>

In [281]:

temp = pd.read_csv('/Users/danielforsyth/Desktop/climate.csv',
            parse_dates={'Date': ['MO','DAY','YEAR']},
            usecols=['MO', 'DAY', 'YEAR', 'HIGH'])
temp.head()

Out[281]:

	Date	HIGH
0	2010-01-01	42
1	2010-01-02	32
2	2010-01-03	27
3	2010-01-04	32
4	2010-01-05	33

5 rows × 2 columns

In [274]:

temp.plot()

Out[274]:

<matplotlib.axes.AxesSubplot at 0x11741f910>

In [284]:

#merge df and temp
merged = pd.merge(df, temp, on='Date', how='outer')
merged.columns = ['Location','Date','Year','Hour','Value','Value_Code','Thefts','Temp']

In [285]:

merged.head()

Out[285]:

	Location	Date	Year	Hour	Value	Value_Code	Thefts	Temp
0	2100 BLOCK S BROAD ST	2010-04-16	2010	0	150	625	4	75
1	7300 BLOCK CLARIDGE ST	2010-04-16	2010	19	700	615	4	75
2	1000 BLOCK MARKET ST	2010-04-16	2010	6	100	625	4	75
3	800 BLOCK S 48TH ST	2010-04-16	2010	19	150	625	4	75
4	S 2100 MCKEAN ST	2010-04-24	2010	17	215	615	7	68

5 rows × 8 columns

In [322]:

new = merged.sort('Thefts', ascending=False)
new = new.dropna()
new.head()

Out[322]:

	Location	Date	Year	Hour	Value	Value_Code	Thefts	Temp
6738	400 BLOCK N BROAD ST	2013-08-15	2013	14	400	615	20	78
6749	1000 BLOCK S 45TH ST	2013-08-15	2013	10	70	625	20	78
6748	200 BLOCK E ROCKLAND ST	2013-08-15	2013	17	100	625	20	78
6747	1700 BLOCK N BROAD ST	2013-08-15	2013	15	400	615	20	78
6742	2000 BLOCK CHANCELLOR ST	2013-08-15	2013	21	200	615	20	78

5 rows × 8 columns

In [289]:

thefts = merged['Date'].value_counts()

thefts.head(10)

Out[289]:

2013-08-15    20
2013-07-24    19
2011-06-15    19
2013-07-10    19
2013-08-14    19
2012-08-09    18
2013-07-13    18
2013-07-23    17
2010-08-26    17
2012-09-21    17
dtype: int64

In [146]:

#thefts per year 2013 , missing 3 months (october-december)
yearly = df.Year.value_counts()
yearly

Out[146]:

2012    1975
2011    1848
2010    1831
2013    1569
dtype: int64

In [147]:

#Thefts per Month
df['Date'] = df['Date'].map(lambda x: str(x)[:2])
df['Date'] = df['Date'].map(lambda x: x.rstrip('/'))

monthly = df.Date.value_counts()
monthly

Out[147]:

8     1182
7     1132
9      957
6      947
5      698
10     608
4      495
11     359
3      305
12     182
2      181
1      177
dtype: int64

In [323]:

df.Hour.value_counts()

Out[323]:

17    618
18    598
16    593
19    574
20    495
15    478
14    456
12    448
13    447
21    369
11    324
22    319
10    293
9     250
23    201
8     186
0     144
1     113
7      98
2      71
3      48
4      38
5      31
6      31
dtype: int64

In [325]:

df.Value.value_counts()

Out[325]:

200     821
100     636
300     595
500     482
150     477
400     413
250     292
600     254
0       192
350     175
1000    168
50      146
800     145
700     137
450     100
...
405    1
397    1
374    1
442    1
269    1
446    1
337    1
333    1
335    1
515    1
313    1
305    1
289    1
327    1
243    1
Length: 396, dtype: int64

In [151]:

df.Value_Code.value_counts()

Out[151]:

615    4887
625    2172
635     164
dtype: int64

In [327]:

loc = df.Location.value_counts()
loc.head(25)

Out[327]:

1500 BLOCK MARKET ST           28
200 BLOCK S BROAD ST           27
200 BLOCK S 33RD ST            24
1700 BLOCK N 10TH ST           22
200 BLOCK S JUNIPER ST         21
3100 BLOCK MARKET ST           20
1900 BLOCK VINE ST             20
200 BLOCK N 34TH ST            20
900 BLOCK SOUTH ST             19
1100 BLOCK FILBERT ST          18
3700 BLOCK WALNUT ST           18
1800 BLOCK WALNUT ST           18
3400 BLOCK SPRUCE ST           17
200 BLOCK S 34TH ST            17
1600 BLOCK CHESTNUT ST         17
1100 BLOCK S BROAD ST          17
3900 BLOCK WALNUT ST           17
3900 BLOCK LOCUST WALK         16
1800 BLOCK N 10TH ST           15
1600 BLOCK WALNUT ST           15
1300 BLOCK CECIL B MOORE AV    15
3100 BLOCK LUDLOW ST           15
2000 BLOCK N BROAD ST          14
200 BLOCK S 36TH ST            14
4000 BLOCK WALNUT ST           14
dtype: int64

In [153]:

value = df[['Location','Value_Code']]
value.head()

Out[153]:

	Location	Value_Code
0	2100 BLOCK S BROAD ST	625
1	S 2100 MCKEAN ST	615
2	2700 BLOCK SNYDER AVE	625
3	2100 BLOCK S GARNET ST	615
4	2100 BLOCK S 15TH ST	625

5 rows × 2 columns