# Call plot_schedule_dist once per service-day dataset, pairing each dataset
# with the label passed alongside it.
for _day_stop_times, _day_label in (
    (weekday_stop_times, "Weekday"),
    (sat_stop_times, 'Saturday'),
    (sun_stop_times, "Sunday"),
):
    plot_schedule_dist(_day_stop_times, _day_label)
What are the factors that affect the punctuality of TTC vehicles?
# Histogram of the unfiltered punctuality values, using 300 bins.
n, bins, patches = plt.hist(punctualities_series_orig, bins=300)
t = plt.title("Punctuality of TTC Vehicles")
t = plt.xlabel("Minutes Late")
t = plt.ylabel("Number of Instances")

# Print the describe() summary, then one line per project statistic helper.
print(punctualities_series_orig.describe())
for _label, _stat in (
    ("Pearson's index: ", pearsons_index),
    ("Interquartile range: ", interquartile_range),
    ("Data range: ", data_range),
):
    print(_label, _stat(punctualities_series_orig))
count 222510.000000 mean 10.598743 std 55.564036 min 0.000000 25% 1.350000 50% 3.483333 75% 7.916667 max 1386.633333 dtype: float64 Pearson's index: 0.384173486614 Interquartile range: 6.56666666667 Data range: 1386.63333333
outlier_r = outlier_range(punctualities_series_orig)

# Keep only values between the lower outlier bound and a fixed cap of 40.
# NOTE(review): the filter iterates `punctualities`, not
# `punctualities_series_orig` -- confirm both hold the same data.
# NOTE(review): the upper bound is the literal 40; only outlier_r[0] is used
# from the computed range -- confirm this is intentional.
punctualities_series = pd.Series(
    x for x in punctualities if outlier_r[0] <= x <= 40
)

# Same summary as for the raw data, now on the filtered series.
print(punctualities_series.describe())
for _label, _stat in (
    ("Pearson's index: ", pearsons_index),
    ("Interquartile range: ", interquartile_range),
    ("Data range: ", data_range),
):
    print(_label, _stat(punctualities_series))
count 219122.000000 mean 5.580403 std 6.187868 min 0.000000 25% 1.350000 50% 3.416667 75% 7.466667 max 39.983333 dtype: float64 Pearson's index: 1.04902199392 Interquartile range: 6.11666666667 Data range: 39.9833333333
# Histogram of the filtered punctuality values, using 20 bins.
n, bins, patches = plt.hist(punctualities_series, bins=20)
t = plt.xlabel("Minutes Late")
t = plt.title("Punctuality of TTC Vehicles")
t = plt.ylabel("Number of Instances")

# Horizontal, notched box plot of the same series. whis=np.inf is meant to
# stretch the whiskers over the full data range; '+' is the flier marker.
plt.boxplot(
    punctualities_series,
    notch=True,
    sym='+',
    vert=False,
    whis=np.inf,
)
plt.xlim(0, 50)
t = plt.xlabel("Minutes Late")
t = plt.title("Punctuality of TTC Vehicles")
# Single box at position 1, labelled vertically.
t = plt.yticks((1,), ('TTC Vehicles',), rotation=90)
How does punctuality vary across routes?
# Bar chart with one bar per entry of rt_and_punc_medians
# (presumably route -> median late time in minutes; verify against where
# the mapping is built).
N = len(rt_and_punc_medians)
ind = np.arange(N)
# Transpose the (key, value) pairs: d[0] holds the keys, d[1] the values.
d = list(zip(*rt_and_punc_medians.items()))
# align='edge' pins each bar's left edge to its index. With the default bar
# width of 0.8, the `ind + 0.4` tick offset below then sits at the centre of
# each bar. Matplotlib >= 2.0 centres bars by default, which would leave the
# tick labels shifted by half a bar without this.
p = plt.bar(ind, d[1], align='edge')
t = plt.xlabel('Routes')
t = plt.ylabel('Median Late Time (minutes)')
t = plt.xticks(ind + 0.4, d[0], rotation=90)
t = plt.title('Median Late Times of Routes')

# Histogram of the same per-route values, using 30 bins.
p = plt.hist(d[1], bins=30)
t = plt.xlabel('Median Late Time (minutes)')
t = plt.title('Median Late Times of Routes')
Are the median velocities of routes correlated with the median punctuality of routes?
from pandas.stats.api import ols
# Least-squares fit of per-route punctuality (y) on per-route velocity (x).
# NOTE(review): `ols` is imported from pandas.stats.api, which newer pandas
# releases no longer ship -- confirm the pandas version this notebook pins.
res = ols(
    x=routes_puncs_df['vel'],
    y=routes_puncs_df['punc'],
)
print(res)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 172 Number of Degrees of Freedom: 2 R-squared: 0.0274 Adj R-squared: 0.0217 Rmse: 6.8197 F-stat (1, 170): 4.7845, p-value: 0.0301 Degrees of Freedom: model 1, resid 170 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x 0.8392 0.3836 2.19 0.0301 0.0872 1.5911 intercept 3.9364 1.9720 2.00 0.0475 0.0713 7.8014 ---------------------------------End of Summary---------------------------------
# Scatter plot of per-route punctuality against velocity, with the fitted
# regression line drawn over it in red.
#
# plt.hold() was deprecated in Matplotlib 2.0 and removed in 3.0, where
# successive plotting calls always draw onto the same axes. Only toggle it
# when the installed Matplotlib still provides it, so the cell behaves as
# before on old versions and no longer raises AttributeError on new ones.
_has_hold = hasattr(plt, 'hold')
if _has_hold:
    plt.hold(True)
p = plt.scatter(routes_puncs_df['vel'], routes_puncs_df['punc'])
t = plt.title('Median Punctuality vs. Median Velocity of Routes')
t = plt.xlabel('Median Velocity (m/s)')
t = plt.ylabel('Median Punctuality (minutes)')
# Slope and intercept copied from the regression summary printed above
# (x coefficient and intercept).
m, b = 0.8392, 3.9364
# Evaluate the line y = m*x + b at x = 1..10.
_line_x = np.arange(1, 11)
p = plt.plot(_line_x, np.poly1d((m, b))(_line_x), '-r')
if _has_hold:
    plt.hold(False)
Do routes with more trips tend to have better median punctuality?
# Least-squares fit of punctuality (y) on number of trips (x).
res = ols(
    x=puncs_trips_df['num_trips'],
    y=puncs_trips_df['punc'],
)
print(res)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 172 Number of Degrees of Freedom: 2 R-squared: 0.1862 Adj R-squared: 0.1814 Rmse: 6.2380 F-stat (1, 170): 38.9004, p-value: 0.0000 Degrees of Freedom: model 1, resid 170 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -0.0050 0.0008 -6.24 0.0000 -0.0065 -0.0034 intercept 11.4722 0.7205 15.92 0.0000 10.0601 12.8844 ---------------------------------End of Summary---------------------------------
# Draw number of trips against punctuality via the project's plot_with_reg
# helper, then label the axes and fix the visible window.
plot_with_reg(puncs_trips_df['num_trips'], puncs_trips_df['punc'])
t = plt.xlabel('Number of Trips (per six weeks)')
t = plt.ylabel('Punctuality (minutes)')
t = plt.title('Number of Trips vs. Punctuality')
# Restrict the view to 0-5000 trips and 0-40 minutes.
t = plt.xlim(0, 5000)
t = plt.ylim(0, 40)
Are there correlations between ridership and punctuality?
# Least-squares fit of punctuality (y) on ridership (x); print the summary.
print(
    ols(
        y=ridership_punc_df['punc'],
        x=ridership_punc_df['ridership'],
    )
)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 136 Number of Degrees of Freedom: 2 R-squared: 0.1538 Adj R-squared: 0.1475 Rmse: 5.0319 F-stat (1, 134): 24.3621, p-value: 0.0000 Degrees of Freedom: model 1, resid 134 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -0.0002 0.0000 -4.94 0.0000 -0.0002 -0.0001 intercept 9.0460 0.5849 15.47 0.0000 7.8996 10.1924 ---------------------------------End of Summary---------------------------------
# Draw ridership against punctuality via the project's plot_with_reg helper.
plot_with_reg(ridership_punc_df['ridership'], ridership_punc_df['punc'])
t = plt.title("Punctuality vs. Ridership")
t = plt.xlabel("Weekday Ridership")
# Restrict the y-axis to 0-30 minutes.
t = plt.ylim(0, 30)
t = plt.ylabel("Punctuality (minutes)")
Does having more riders slow down vehicles?
# Least-squares fit of velocity (y) on ridership (x); print the summary.
print(
    ols(
        y=ridership_punc_df['vel'],
        x=ridership_punc_df['ridership'],
    )
)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 136 Number of Degrees of Freedom: 2 R-squared: 0.0104 Adj R-squared: 0.0030 Rmse: 0.9908 F-stat (1, 134): 1.4055, p-value: 0.2379 Degrees of Freedom: model 1, resid 134 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -0.0000 0.0000 -1.19 0.2379 -0.0000 0.0000 intercept 4.8046 0.1152 41.72 0.0000 4.5789 5.0303 ---------------------------------End of Summary---------------------------------
# Draw ridership (x) against velocity (y) via the project's plot_with_reg
# helper, then label the figure.
plot_with_reg(
    x=ridership_punc_df['ridership'],
    y=ridership_punc_df['vel'],
)
t = plt.xlabel("Weekday Ridership")
t = plt.ylabel("Median Velocity (m/s)")
t = plt.title("Velocity vs. Ridership")
Is there any correlation between punctuality and temperature or pressure?
# Least-squares fit of punctuality (y) on temperature (x); print the summary.
print(
    ols(
        y=weather_punc_df['punc'],
        x=weather_punc_df['temp'],
    )
)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 491464 Number of Degrees of Freedom: 2 R-squared: 0.0003 Adj R-squared: 0.0003 Rmse: 5.5336 F-stat (1, 491462): 154.5662, p-value: 0.0000 Degrees of Freedom: model 1, resid 491462 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x 0.0241 0.0019 12.43 0.0000 0.0203 0.0279 intercept 4.5434 0.0241 188.32 0.0000 4.4961 4.5907 ---------------------------------End of Summary---------------------------------
# Draw temperature against punctuality via the project's plot_with_reg
# helper, then label the figure.
plot_with_reg(weather_punc_df['temp'], weather_punc_df['punc'])
t = plt.title('Punctuality and Temperature')
# Axis label spelling corrected: "Celsius" (was "Celcius").
t = plt.xlabel('Temperature (degrees Celsius)')
t = plt.ylabel('Punctuality (minutes)')
# Least-squares fit of punctuality (y) on pressure (x); print the summary.
print(
    ols(
        y=weather_punc_df['punc'],
        x=weather_punc_df['pressure'],
    )
)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 491464 Number of Degrees of Freedom: 2 R-squared: 0.0008 Adj R-squared: 0.0008 Rmse: 5.5324 F-stat (1, 491462): 375.3727, p-value: 0.0000 Degrees of Freedom: model 1, resid 491462 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -0.2921 0.0151 -19.37 0.0000 -0.3217 -0.2626 intercept 34.0656 1.5092 22.57 0.0000 31.1077 37.0235 ---------------------------------End of Summary---------------------------------
# Draw pressure against punctuality via the project's plot_with_reg helper,
# then label the figure.
plot_with_reg(weather_punc_df['pressure'], weather_punc_df['punc'])
t = plt.xlabel('Pressure (kPa)')
t = plt.title('Punctuality and Pressure')
t = plt.ylabel('Punctuality (minutes)')