import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the trip records from data/nyc_data.csv, parsing the pickup and
# dropoff timestamp columns as datetimes instead of leaving them as strings.
data = pd.read_csv('data/nyc_data.csv', parse_dates=['pickup_datetime',
'dropoff_datetime'])
# Load the fare table from data/nyc_fare.csv; only pickup_datetime is parsed
# as a datetime here.
fare = pd.read_csv('data/nyc_fare.csv', parse_dates=['pickup_datetime'])
# Indexing with a list of column names returns a DataFrame restricted to
# those columns; head(3) keeps its first three rows.
data[['trip_distance', 'trip_time_in_secs']].head(3)
#    trip_distance  trip_time_in_secs
# 0           0.61                300
# 1           3.28                960
# 2           1.50                386
# .loc selects by index label: fetch the row labelled 0.
data.loc[0]
# medallion             76942C3205E17D7E7FE5A9F709D16434
# hack_license          25BA06A87905667AA1FE5990E33F0E2E
# vendor_id             VTS
# rate_code             1
# store_and_fwd_flag    NaN
# pickup_datetime       2013-01-01 00:00:00
# dropoff_datetime      2013-01-01 00:05:00
# passenger_count       3
# trip_time_in_secs     300
# trip_distance         0.61
# pickup_longitude      -73.95592
# pickup_latitude       40.78189
# dropoff_longitude     -73.96318
# dropoff_latitude      40.77783
# Name: 0, dtype: object
# A list of labels selects several rows at once (here labels 0 and 100000).
data.loc[[0, 100000]]
# .loc also accepts a label slice with a step plus a list of columns:
# every 10th label from 1000 up to and including 2000, two columns only.
data.loc[1000:2000:10,
['trip_distance', 'trip_time_in_secs']]
#       trip_distance  trip_time_in_secs
# 1000           1.00                441
# 1010           3.80                691
# ....
# 1990           0.13                 60
# 2000           9.60                963
# Boolean-mask selection: keep only the rows whose trip_distance is
# strictly greater than 50.
data.loc[data.trip_distance>50]
from ipywidgets import interact
@interact
def show_nrows(distance_threshold=(0, 200)):
    """Return how many trips are longer than ``distance_threshold``.

    The ``(0, 200)`` default is an ipywidgets abbreviation: ``@interact``
    turns it into an integer slider ranging from 0 to 200 and calls this
    function with the slider's current value whenever it changes.

    Parameters
    ----------
    distance_threshold : number
        Lower bound compared against ``data.trip_distance`` with a strict
        ``>``, so trips exactly at the threshold are not counted.

    Returns
    -------
    int
        Number of rows of ``data`` whose ``trip_distance`` exceeds the
        threshold.
    """
    # Count the True entries of the boolean mask rather than building the
    # filtered DataFrame just to measure its length: the result is the same
    # integer, but no copy of the matching rows is made on every slider move.
    return int((data.trip_distance > distance_threshold).sum())
# Add a derived column: the trip duration converted from seconds to minutes.
# Dividing by the float 60.0 yields a float column.
data['trip_time_in_mins'] = data.trip_time_in_secs / 60.0
# Show the original and the derived duration columns side by side.
data[['trip_time_in_secs', 'trip_time_in_mins']].head(3)
#    trip_time_in_secs  trip_time_in_mins
# 0                300           5.000000
# 1                960          16.000000
# 2                386           6.433333
# First five entries of the trip_distance column; the slice keeps the
# original index labels on the resulting Series.
a = data.trip_distance[:5]
a
# 0    0.61
# 1    3.28
# 2    1.50
# 3    0.00
# 4    1.31
# Name: trip_distance, dtype: float64
# A second slice of the same column that only partly overlaps the slice
# [:5] (entries 2 through 5).
b = data.trip_distance[2:6]
b
# 2    1.50
# 3    0.00
# 4    1.31
# 5    5.81
# Name: trip_distance, dtype: float64
# Arithmetic between two Series aligns them on index labels first: labels
# present in both operands are summed, labels present in only one give NaN.
a + b
# 0     NaN
# 1     NaN
# 2    3.00
# 3    0.00
# 4    2.62
# 5     NaN
# Name: trip_distance, dtype: float64
# Peek at the first three values of the medallion column.
data.medallion.head(3)
# 0    76942C3205E17D7E7FE5A9F709D16434
# 1    517C6B330DBB3F055D007B07512628B3
# 2    ED15611F168E41B33619C83D900FE266
# Name: medallion, dtype: object
# The .str accessor applies string methods element-wise: keep the first
# four characters of every medallion value.
data.medallion.str.slice(0, 4).head(3)
# 0    7694
# 1    517C
# 2    ED15
# Name: medallion, dtype: object
# The .dt accessor exposes datetime components element-wise. dayofweek is
# an integer with Monday = 0 and Sunday = 6; [::200000] samples every
# 200000th entry to keep the display short.
data.pickup_datetime.dt.dayofweek[::200000]
# 0         1
# 200000    6
# 400000    5
# 600000    0
# 800000    1
# dtype: int64
# Day-of-month component of each trip's pickup and dropoff timestamps.
day_p = data.pickup_datetime.dt.day
day_d = data.dropoff_datetime.dt.day
# Boolean mask: True where pickup and dropoff have a different day of month.
selection = (day_p != day_d)
# Print how many trips match the mask, then preview the first three of them.
print(len(data.loc[selection]))
data.loc[selection].head(3)
# 7716