import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from matplotlib import pyplot as plt
%pylab inline
sns.set_style('darkgrid')
df = pd.read_csv('download1.csv')
del df['Unnamed: 0']
df.index = pd.to_datetime(df.time)
df = df[pd.isnull(df.index) == False]
df.head()
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['axes', 'grid'] `%matplotlib` prevents importing * from pylab and numpy
12TH_0 | 12TH_1 | 12TH_2 | 19TH_0 | 19TH_1 | 19TH_2 | CIVC_0 | CIVC_1 | CIVC_2 | EMBR_0 | ... | POWL_0 | POWL_1 | POWL_2 | ROCK_0 | ROCK_1 | ROCK_2 | WOAK_0 | WOAK_1 | WOAK_2 | time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
time | |||||||||||||||||||||
2015-03-02 11:48:04 | 11 | 26 | 41 | 10 | 25 | 40 | 0 | 12 | 27 | 8 | ... | 11 | 26 | 41 | 3 | 18 | 33 | 0 | 16 | 31 | 2015-03-02 11:48:04 |
2015-03-02 11:48:34 | 11 | 26 | 41 | 9 | 24 | 39 | 12 | 26 | 41 | 7 | ... | 11 | 25 | 40 | 2 | 18 | 33 | 0 | 15 | 30 | 2015-03-02 11:48:34 |
2015-03-02 11:49:04 | 10 | 25 | 40 | 9 | 24 | 39 | 11 | 26 | 41 | 6 | ... | 10 | 25 | 40 | 2 | 18 | 32 | 15 | 30 | 45 | 2015-03-02 11:49:04 |
2015-03-02 11:49:34 | 10 | 25 | 40 | 8 | 24 | 38 | 11 | 25 | 40 | 6 | ... | 9 | 24 | 39 | 1 | 17 | 32 | 14 | 29 | 44 | 2015-03-02 11:49:34 |
2015-03-02 11:50:06 | 9 | 24 | 39 | 8 | 23 | 38 | 10 | 25 | 40 | 5 | ... | 9 | 24 | 39 | 0 | 16 | 31 | 14 | 29 | 44 | 2015-03-02 11:50:06 |
5 rows × 28 columns
commute_df = df[(df.index.map(lambda x: x.time) >= datetime.time(6, 0, 0)) & (df.index.map(lambda x: x.time) < datetime.time(10, 0, 0))]
def get_arrival_times(df):
output = dict()
cols = [c for c in df.columns if c[-1] == '0']
for col in cols:
# temp0 = df[df[col] == 0].index.tolist()
# temp1 = (df[df[col] == 0].index - datetime.timedelta(minutes=1)).tolist()
# temp = temp0 + temp1
# temp.sort()
# temp = pd.Series(temp)
temp = pd.Series(df[df[col] == 0].index.tolist())
diffs = pd.Series(np.diff(temp), index=np.arange(1, len(temp)))
first_arrivals = temp[1:][diffs >= datetime.timedelta(seconds=60)]
output[col] = pd.Series([temp[0]] + first_arrivals.tolist())
return output
temp = pd.DataFrame(get_arrival_times(commute_df))
temp.head()
12TH_0 | 19TH_0 | CIVC_0 | EMBR_0 | MCAR_0 | MONT_0 | POWL_0 | ROCK_0 | WOAK_0 | |
---|---|---|---|---|---|---|---|---|---|
0 | 2015-03-03 06:13:19 | 2015-03-03 06:11:49 | 2015-03-03 06:14:34 | 2015-03-03 06:09:49 | 2015-03-03 06:08:04 | 2015-03-03 06:11:19 | 2015-03-03 06:12:49 | 2015-03-03 06:04:49 | 2015-03-03 06:03:04 |
1 | 2015-03-03 06:28:34 | 2015-03-03 06:27:34 | 2015-03-03 06:30:07 | 2015-03-03 06:24:34 | 2015-03-03 06:23:04 | 2015-03-03 06:26:19 | 2015-03-03 06:28:04 | 2015-03-03 06:20:07 | 2015-03-03 06:18:04 |
2 | 2015-03-03 06:43:22 | 2015-03-03 06:42:04 | 2015-03-03 06:44:34 | 2015-03-03 06:40:07 | 2015-03-03 06:38:34 | 2015-03-03 06:41:34 | 2015-03-03 06:43:04 | 2015-03-03 06:34:49 | 2015-03-03 06:32:52 |
3 | 2015-03-03 07:03:34 | 2015-03-03 07:01:34 | 2015-03-03 06:59:34 | 2015-03-03 06:55:06 | 2015-03-03 06:56:34 | 2015-03-03 06:56:34 | 2015-03-03 06:58:04 | 2015-03-03 06:53:04 | 2015-03-03 06:47:49 |
4 | 2015-03-03 07:13:34 | 2015-03-03 07:12:34 | 2015-03-03 07:20:07 | 2015-03-03 07:15:06 | 2015-03-03 07:08:04 | 2015-03-03 07:16:34 | 2015-03-03 07:18:04 | 2015-03-03 07:04:34 | 2015-03-03 07:08:04 |
output = []
cols_in_order = ['ROCK_0', 'MCAR_0', '19TH_0', '12TH_0', 'WOAK_0', 'EMBR_0', 'MONT_0', 'POWL_0', 'CIVC_0']
min_diff_seconds = {
'MCAR_0': 60,
'19TH_0': 60,
'12TH_0': 30,
'WOAK_0': 30,
'EMBR_0': 300,
'MONT_0': 30,
'POWL_0': 30,
'CIVC_0': 30,
}
for x in temp.dropna().ROCK_0:
if x.hour >= 9:
continue
this_row = dict()
this_row['ROCK_0'] = x
last_time = x
for col in cols_in_order[1:]:
next_time = temp[temp[col] >= last_time + datetime.timedelta(seconds=min_diff_seconds[col])][col].iloc[0]
this_row[col] = next_time
last_time = next_time
output.append(this_row)
all_trains = pd.DataFrame(output, columns=cols_in_order)
sns.distplot((all_trains['CIVC_0'] - all_trains['ROCK_0']) / np.timedelta64(1, 'm'), bins=np.arange(20, 51))
plt.gcf().set_size_inches(10, 5)
plt.xlabel('Trip Duration (Minutes)')
plt.title('Distribution of Trip Lengths')
<matplotlib.text.Text at 0x13436bf90>
whole_trip = (all_trains['CIVC_0'] - all_trains['ROCK_0']) / np.timedelta64(1, 'm')
for i in range(20, 50):
print i, "{:.3f}".format((whole_trip <= i).mean())
20 0.000 21 0.000 22 0.004 23 0.045 24 0.152 25 0.386 26 0.591 27 0.750 28 0.835 29 0.894 30 0.934 31 0.957 32 0.973 33 0.981 34 0.986 35 0.991 36 0.995 37 0.996 38 0.996 39 0.996 40 0.997 41 0.997 42 0.999 43 0.999 44 0.999 45 0.999 46 1.000 47 1.000 48 1.000 49 1.000
whole_trip.median()
25.716666666666665
for _, row in all_trains.iterrows():
pd.Series(np.diff(row) / np.timedelta64(1, 'm'), index=cols_in_order[1:]).plot(color='k', alpha=0.5)
all_trains[((all_trains['WOAK_0'] - all_trains['12TH_0']) / np.timedelta64(1, 'm')) >= 10]
ROCK_0 | MCAR_0 | 19TH_0 | 12TH_0 | WOAK_0 | EMBR_0 | MONT_0 | POWL_0 | CIVC_0 | |
---|---|---|---|---|---|---|---|---|---|
435 | 2015-04-14 07:05:19 | 2015-04-14 07:08:19 | 2015-04-14 07:12:19 | 2015-04-14 07:13:20 | 2015-04-14 07:24:49 | 2015-04-14 07:34:19 | 2015-04-14 07:36:22 | 2015-04-14 07:37:49 | 2015-04-14 07:39:19 |
477 | 2015-04-17 08:35:34 | 2015-04-17 08:39:07 | 2015-04-17 08:45:06 | 2015-04-17 08:46:34 | 2015-04-17 09:08:34 | 2015-04-17 09:16:04 | 2015-04-17 09:18:04 | 2015-04-17 09:19:34 | 2015-04-17 09:21:04 |
569 | 2015-04-27 07:05:22 | 2015-04-27 07:08:19 | 2015-04-27 07:11:51 | 2015-04-27 07:13:19 | 2015-04-27 07:23:19 | 2015-04-27 07:31:19 | 2015-04-27 07:33:19 | 2015-04-27 07:35:34 | 2015-04-27 07:37:34 |
571 | 2015-04-27 07:36:04 | 2015-04-27 07:39:19 | 2015-04-27 07:42:49 | 2015-04-27 07:44:04 | 2015-04-27 07:57:19 | 2015-04-27 08:05:22 | 2015-04-27 08:06:49 | 2015-04-27 08:08:49 | 2015-04-27 08:10:19 |
628 | 2015-05-02 07:18:48 | 2015-05-02 07:21:52 | 2015-05-02 07:25:19 | 2015-05-02 07:26:18 | 2015-05-02 07:46:03 | 2015-05-02 07:53:18 | 2015-05-02 07:54:33 | 2015-05-02 07:56:03 | 2015-05-02 07:58:03 |
693 | 2015-05-08 07:36:04 | 2015-05-08 07:39:04 | 2015-05-08 07:42:34 | 2015-05-08 07:44:04 | 2015-05-08 07:54:49 | 2015-05-08 08:02:37 | 2015-05-08 08:04:19 | 2015-05-08 08:06:19 | 2015-05-08 08:07:49 |
all_trains.iloc[433:438]
ROCK_0 | MCAR_0 | 19TH_0 | 12TH_0 | WOAK_0 | EMBR_0 | MONT_0 | POWL_0 | CIVC_0 | |
---|---|---|---|---|---|---|---|---|---|
433 | 2015-04-14 06:44:19 | 2015-04-14 06:47:19 | 2015-04-14 06:50:19 | 2015-04-14 06:51:49 | 2015-04-14 06:56:22 | 2015-04-14 07:03:19 | 2015-04-14 07:05:19 | 2015-04-14 07:16:49 | 2015-04-14 07:18:22 |
434 | 2015-04-14 06:50:19 | 2015-04-14 06:53:20 | 2015-04-14 06:57:22 | 2015-04-14 06:58:49 | 2015-04-14 07:04:19 | 2015-04-14 07:20:49 | 2015-04-14 07:22:52 | 2015-04-14 07:24:19 | 2015-04-14 07:25:49 |
435 | 2015-04-14 07:05:19 | 2015-04-14 07:08:19 | 2015-04-14 07:12:19 | 2015-04-14 07:13:20 | 2015-04-14 07:24:49 | 2015-04-14 07:34:19 | 2015-04-14 07:36:22 | 2015-04-14 07:37:49 | 2015-04-14 07:39:19 |
436 | 2015-04-14 07:22:52 | 2015-04-14 07:25:49 | 2015-04-14 07:28:49 | 2015-04-14 07:30:49 | 2015-04-14 07:34:49 | 2015-04-14 07:45:19 | 2015-04-14 07:47:22 | 2015-04-14 07:49:49 | 2015-04-14 07:51:19 |
437 | 2015-04-14 07:36:22 | 2015-04-14 07:39:19 | 2015-04-14 07:43:49 | 2015-04-14 07:45:19 | 2015-04-14 07:49:49 | 2015-04-14 07:59:19 | 2015-04-14 08:01:22 | 2015-04-14 08:03:51 | 2015-04-14 08:05:19 |
temp[temp['WOAK_0'] >= datetime.datetime(2015, 4, 14, 7, 13)]['WOAK_0']
586 2015-04-14 07:24:49 587 2015-04-14 07:34:49 588 2015-04-14 07:49:49 589 2015-04-14 08:05:49 590 2015-04-14 08:19:19 591 2015-04-14 08:35:49 592 2015-04-14 08:53:49 593 2015-04-14 09:05:19 594 2015-04-14 09:20:19 595 2015-04-14 09:32:22 596 2015-04-14 09:47:22 597 2015-04-15 06:02:49 598 2015-04-15 06:18:19 599 2015-04-15 06:32:49 600 2015-04-15 06:48:49 601 2015-04-15 07:03:49 602 2015-04-15 07:17:49 603 2015-04-15 07:32:49 604 2015-04-15 07:48:19 605 2015-04-15 08:02:52 606 2015-04-15 08:20:22 607 2015-04-15 08:33:49 608 2015-04-15 08:51:49 609 2015-04-15 09:03:04 610 2015-04-15 09:18:34 611 2015-04-15 09:34:07 612 2015-04-15 09:48:19 613 2015-04-16 06:03:04 614 2015-04-16 06:17:49 615 2015-04-16 06:33:04 ... 1047 2015-05-16 09:03:18 1048 2015-05-16 09:11:18 1049 2015-05-16 09:37:19 1050 2015-05-16 09:51:48 1051 2015-05-17 06:49:48 1052 2015-05-17 07:05:52 1053 2015-05-17 07:25:22 1054 2015-05-17 07:44:49 1055 2015-05-17 08:05:18 1056 2015-05-17 08:29:22 1057 2015-05-17 08:45:18 1058 2015-05-17 09:05:18 1059 2015-05-17 09:27:03 1060 2015-05-17 09:45:34 1061 2015-05-18 06:02:49 1062 2015-05-18 06:18:19 1063 2015-05-18 06:32:49 1064 2015-05-18 06:47:49 1065 2015-05-18 07:03:49 1066 2015-05-18 07:18:49 1067 2015-05-18 07:33:04 1068 2015-05-18 07:50:19 1069 2015-05-18 08:04:34 1070 2015-05-18 08:25:06 1071 2015-05-18 08:34:52 1072 2015-05-18 08:41:04 1073 2015-05-18 08:54:37 1074 2015-05-18 09:05:34 1075 2015-05-18 09:18:34 1076 2015-05-18 09:35:37 Name: WOAK_0, dtype: datetime64[ns]
df[df.index >= datetime.datetime(2015, 4, 17, 8, 46)]['WOAK_0']
time 2015-04-17 08:46:04 6 2015-04-17 08:46:34 5 2015-04-17 08:47:07 5 2015-04-17 08:47:34 4 2015-04-17 08:48:04 4 2015-04-17 08:48:34 4 2015-04-17 08:49:07 4 2015-04-17 08:49:37 4 2015-04-17 08:50:06 4 2015-04-17 08:50:34 4 2015-04-17 08:51:04 4 2015-04-17 08:51:34 4 2015-04-17 08:52:07 4 2015-04-17 08:52:34 4 2015-04-17 08:53:04 11 2015-04-17 08:53:34 11 2015-04-17 08:54:04 10 2015-04-17 08:54:34 10 2015-04-17 08:55:06 10 2015-04-17 08:55:34 9 2015-04-17 08:56:04 9 2015-04-17 08:56:34 9 2015-04-17 08:57:07 9 2015-04-17 08:57:34 9 2015-04-17 08:58:04 9 2015-04-17 08:58:34 9 2015-04-17 08:59:04 9 2015-04-17 08:59:34 8 2015-04-17 08:59:49 8 2015-04-17 09:00:34 7 .. 2015-05-18 21:36:06 10 2015-05-18 21:36:33 9 2015-05-18 21:37:03 9 2015-05-18 21:37:33 8 2015-05-18 21:38:03 8 2015-05-18 21:38:33 7 2015-05-18 21:39:03 7 2015-05-18 21:39:33 6 2015-05-18 21:40:06 6 2015-05-18 21:40:34 5 2015-05-18 21:41:03 5 2015-05-18 21:41:33 4 2015-05-18 21:42:03 3 2015-05-18 21:42:33 3 2015-05-18 21:43:03 2 2015-05-18 21:43:33 2 2015-05-18 21:44:03 0 2015-05-18 21:44:33 0 2015-05-18 21:45:06 0 2015-05-18 21:45:33 20 2015-05-18 21:46:03 20 2015-05-18 21:46:35 19 2015-05-18 21:47:03 19 2015-05-18 21:47:37 18 2015-05-18 21:48:03 18 2015-05-18 21:48:37 17 2015-05-18 21:49:03 17 2015-05-18 21:49:33 16 2015-05-18 21:50:06 16 2015-05-18 21:50:37 15 Name: WOAK_0, dtype: float64
sns.jointplot(all_trains.ROCK_0.map(lambda x: x.hour + x.minute/60.0), all_trains.CIVC_0.map(lambda x: x.hour + x.minute/60.0))
<seaborn.axisgrid.JointGrid at 0x1343f9790>
sns.distplot(all_trains.ROCK_0.map(lambda x: x.hour + x.minute/60.0), bins=np.linspace(6, 9, 180))
plt.gcf().set_size_inches(20, 10)
at_6 = all_trains[all_trains.ROCK_0.map(lambda x: x.hour) == 6]
at_7 = all_trains[all_trains.ROCK_0.map(lambda x: x.hour) == 7]
at_8 = all_trains[all_trains.ROCK_0.map(lambda x: x.hour) == 8]
_, axes = plt.subplots(3, sharex=True, sharey=True)
ax = sns.distplot((at_6['CIVC_0'] - at_6['ROCK_0']) / np.timedelta64(1, 'm'), ax=axes[0], bins=np.arange(20, 47))
ax.set_title('6am to 7am')
# plt.subplot('312')
ax = sns.distplot((at_7['CIVC_0'] - at_7['ROCK_0']) / np.timedelta64(1, 'm'), ax=axes[1], bins=np.arange(20, 47))
ax.set_title('7am to 8am')
# plt.subplot('313')
ax = sns.distplot((at_8['CIVC_0'] - at_8['ROCK_0']) / np.timedelta64(1, 'm'), ax=axes[2], bins=np.arange(20, 47))
ax.set_title('8am to 9am')
plt.gcf().set_size_inches(10, 5)
plt.xlim(20, 47)
plt.suptitle('Trip Time as a function of Hour of Departure')
plt.xlabel('Trip Duration (Minutes)')
<matplotlib.text.Text at 0x13a35bf50>
print (at_6.CIVC_0 - at_6.ROCK_0).median()
print (at_7.CIVC_0 - at_7.ROCK_0).median()
print (at_8.CIVC_0 - at_8.ROCK_0).median()
0 days 00:24:33 0 days 00:26:00 0 days 00:26:30
for i in range(20, 50):
print i, "{:.3f}".format(((at_8.CIVC_0 - at_8.ROCK_0)/np.timedelta64(1, 'm') <= i).mean())
20 0.000 21 0.000 22 0.000 23 0.050 24 0.104 25 0.218 26 0.429 27 0.625 28 0.754 29 0.829 30 0.900 31 0.943 32 0.961 33 0.975 34 0.982 35 0.986 36 0.993 37 0.996 38 0.996 39 0.996 40 0.996 41 0.996 42 0.996 43 0.996 44 0.996 45 0.996 46 1.000 47 1.000 48 1.000 49 1.000
plt.scatter(all_trains.ROCK_0.map(lambda x: x.hour + x.minute/60.0), (all_trains.CIVC_0 - all_trains.ROCK_0) / np.timedelta64(1, 'm'), color=sns.color_palette()[0])
plt.xlabel('Departure Hour (from Rockridge)')
plt.ylabel('Trip Length (Minutes)')
plt.gcf().set_size_inches(8, 5)
all_trains[whole_trip >= 38]
ROCK_0 | MCAR_0 | 19TH_0 | 12TH_0 | WOAK_0 | EMBR_0 | MONT_0 | POWL_0 | CIVC_0 | |
---|---|---|---|---|---|---|---|---|---|
73 | 2015-03-10 06:19:49 | 2015-03-10 06:38:19 | 2015-03-10 06:41:19 | 2015-03-10 06:42:49 | 2015-03-10 06:47:19 | 2015-03-10 06:54:19 | 2015-03-10 06:57:49 | 2015-03-10 06:59:22 | 2015-03-10 07:00:52 |
477 | 2015-04-17 08:35:34 | 2015-04-17 08:39:07 | 2015-04-17 08:45:06 | 2015-04-17 08:46:34 | 2015-04-17 09:08:34 | 2015-04-17 09:16:04 | 2015-04-17 09:18:04 | 2015-04-17 09:19:34 | 2015-04-17 09:21:04 |
628 | 2015-05-02 07:18:48 | 2015-05-02 07:21:52 | 2015-05-02 07:25:19 | 2015-05-02 07:26:18 | 2015-05-02 07:46:03 | 2015-05-02 07:53:18 | 2015-05-02 07:54:33 | 2015-05-02 07:56:03 | 2015-05-02 07:58:03 |
all_trains.iloc[475:480]
ROCK_0 | MCAR_0 | 19TH_0 | 12TH_0 | WOAK_0 | EMBR_0 | MONT_0 | POWL_0 | CIVC_0 | |
---|---|---|---|---|---|---|---|---|---|
475 | 2015-04-17 08:06:34 | 2015-04-17 08:09:34 | 2015-04-17 08:12:34 | 2015-04-17 08:14:34 | 2015-04-17 08:22:34 | 2015-04-17 08:30:06 | 2015-04-17 08:32:04 | 2015-04-17 08:33:34 | 2015-04-17 08:35:05 |
476 | 2015-04-17 08:20:34 | 2015-04-17 08:23:34 | 2015-04-17 08:28:04 | 2015-04-17 08:29:34 | 2015-04-17 08:35:05 | 2015-04-17 08:42:34 | 2015-04-17 08:44:34 | 2015-04-17 08:45:34 | 2015-04-17 08:47:34 |
477 | 2015-04-17 08:35:34 | 2015-04-17 08:39:07 | 2015-04-17 08:45:06 | 2015-04-17 08:46:34 | 2015-04-17 09:08:34 | 2015-04-17 09:16:04 | 2015-04-17 09:18:04 | 2015-04-17 09:19:34 | 2015-04-17 09:21:04 |
478 | 2015-04-17 08:51:04 | 2015-04-17 08:54:34 | 2015-04-17 09:02:04 | 2015-04-17 09:03:04 | 2015-04-17 09:08:34 | 2015-04-17 09:16:04 | 2015-04-17 09:18:04 | 2015-04-17 09:19:34 | 2015-04-17 09:21:04 |
479 | 2015-04-18 06:35:33 | 2015-04-18 06:38:19 | 2015-04-18 06:45:33 | 2015-04-18 06:46:18 | 2015-04-18 06:50:33 | 2015-04-18 06:57:18 | 2015-04-18 06:58:48 | 2015-04-18 07:00:18 | 2015-04-18 07:01:48 |
all_trains.head()
ROCK_0 | MCAR_0 | 19TH_0 | 12TH_0 | WOAK_0 | EMBR_0 | MONT_0 | POWL_0 | CIVC_0 | |
---|---|---|---|---|---|---|---|---|---|
0 | 2015-03-03 06:04:49 | 2015-03-03 06:08:04 | 2015-03-03 06:11:49 | 2015-03-03 06:13:19 | 2015-03-03 06:18:04 | 2015-03-03 06:24:34 | 2015-03-03 06:26:19 | 2015-03-03 06:28:04 | 2015-03-03 06:30:07 |
1 | 2015-03-03 06:20:07 | 2015-03-03 06:23:04 | 2015-03-03 06:27:34 | 2015-03-03 06:28:34 | 2015-03-03 06:32:52 | 2015-03-03 06:40:07 | 2015-03-03 06:41:34 | 2015-03-03 06:43:04 | 2015-03-03 06:44:34 |
2 | 2015-03-03 06:34:49 | 2015-03-03 06:38:34 | 2015-03-03 06:42:04 | 2015-03-03 06:43:22 | 2015-03-03 06:47:49 | 2015-03-03 06:55:06 | 2015-03-03 06:56:34 | 2015-03-03 06:58:04 | 2015-03-03 06:59:34 |
3 | 2015-03-03 06:53:04 | 2015-03-03 06:56:34 | 2015-03-03 07:01:34 | 2015-03-03 07:03:34 | 2015-03-03 07:08:04 | 2015-03-03 07:15:06 | 2015-03-03 07:16:34 | 2015-03-03 07:18:04 | 2015-03-03 07:20:07 |
4 | 2015-03-03 07:04:34 | 2015-03-03 07:08:04 | 2015-03-03 07:12:34 | 2015-03-03 07:13:34 | 2015-03-03 07:18:04 | 2015-03-03 07:26:04 | 2015-03-03 07:28:04 | 2015-03-03 07:30:07 | 2015-03-03 07:31:34 |
all_trains.to_csv('clean_trains.csv')
df.time.max()
'2015-05-18 21:50:37'