How many routes have how many trips?
## Distribution of trips vs routes
# Pair each route's short name with the number of trips recorded for it.
trips_freq = [
    (route.route_short_name, len(route.trips))
    for route in routes_list.values()
]

# Persist the (short name, trip count) pairs to disk.
with open('trips_routes.pkl', 'wb') as trips_routes_file:
    pickle.dump(trips_freq, trips_routes_file)

# Summary statistics of the trip counts alone (second tuple element).
trips_series = pd.Series([count for _name, count in trips_freq])
print(trips_series.describe())
count 182.000000 mean 689.664835 std 622.333137 min 13.000000 25% 275.750000 50% 554.000000 75% 945.000000 max 4696.000000 dtype: float64
# Skew and spread measures for the trips-per-route series; the helper
# functions called here are defined outside this section.
skew_index = pearsons_index(trips_series)
full_range = data_range(trips_series)
iqr = interquartile_range(trips_series)
outlier_fences = outlier_range(trips_series)  # (lower, upper) pair

print("Pearson's: %s" % skew_index)
print("Range: %s" % full_range)
print("IQR: %s" % iqr)
print("Outliers lie outside: (%s,%s)" % outlier_fences)
Pearson's: 0.653981736266 Range: 4683 IQR: 669.25 Outliers lie outside: (-728.125,1948.875)
# Histogram of trips per route: 25 equal-width bins over 0-5000 trips.
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist(
    [x[1] for x in trips_freq],
    bins=25,
    align='mid',
    range=(0, 5000),
)
plt.xlabel('Number of Trips')
plt.ylabel('Number of Routes')
plt.title('Distribution of trips over routes')
# Put one tick on the left edge of every bin and label it with that edge's
# actual value (the previous labels were shifted by 0.5 and did not match
# the tick positions). Rotation is applied in the same call so it is
# guaranteed to affect the labels being set.
plt.xticks(bins[:-1], ['%.1f' % edge for edge in bins[:-1]], rotation=70)
# Leave room below the axes for the rotated labels.
plt.subplots_adjust(bottom=0.2)
# (lower, upper) fences outside of which a trip count is treated as an outlier.
outlier_r = outlier_range(trips_series)
# Materialise the outliers as a list: a bare filter() object is a one-shot
# iterator and would be empty after the print below consumed it.
outliers = [
    (name, trips)
    for name, trips in trips_freq
    if trips < outlier_r[0] or trips > outlier_r[1]
]
print(outliers)
[('29', 2224), ('32', 2011), ('501', 2705), ('504', 2025), ('509', 2059), ('510', 4696), ('52', 2043), ('BLR', 2181), ('YUS', 2267)]
# Rebuild the series keeping only trip counts inside the outlier fences
# (both fences inclusive), then repeat the summary statistics.
fence_low, fence_high = outlier_r
trips_series = pd.Series(
    [trips for _name, trips in trips_freq if fence_low <= trips <= fence_high]
)
print(trips_series.describe())
print("Pearson's: %s" % pearsons_index(trips_series))
print("IQR: %s" % interquartile_range(trips_series))
count 173.000000 mean 597.156069 std 445.952944 min 13.000000 25% 261.000000 50% 525.000000 75% 881.000000 max 1814.000000 dtype: float64 Pearson's: 0.485405940789 IQR: 620.0
# Histogram of the outlier-free trip counts: 25 equal-width bins over 0-2000.
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist(
    trips_series,
    bins=25,
    align='mid',
    range=(0, 2000),
)
plt.xlabel('Number of Trips')
plt.ylabel('Number of Routes')
plt.title('Distribution of trips over routes')
# Tick every bin's left edge and label it with the edge's real value; the
# earlier "x - 0.5" labels did not correspond to the tick positions.
plt.xticks(bins[:-1], ['%.1f' % edge for edge in bins[:-1]], rotation=70)
# Leave room below the axes for the rotated labels.
plt.subplots_adjust(bottom=0.2)
How is the daily cost of a route related to its ridership?
# Regress cost (dependent, y) on ridership (independent, x) and print the
# fitted model's summary. `ols` is provided outside this section.
print(ols(y=riders_costs_df['cost'], x=riders_costs_df['ridership']))
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 138 Number of Degrees of Freedom: 2 R-squared: 0.9276 Adj R-squared: 0.9270 Rmse: 7045.0030 F-stat (1, 136): 1741.2502, p-value: 0.0000 Degrees of Freedom: model 1, resid 136 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x 2.0056 0.0481 41.73 0.0000 1.9114 2.0998 intercept 2478.4971 815.2188 3.04 0.0028 880.6683 4076.3258 ---------------------------------End of Summary---------------------------------
# Plot cost against ridership via the project's plot_with_reg helper
# (presumably a scatter with a fitted line -- helper is defined elsewhere).
plot_with_reg(
    y=riders_costs_df['cost'],
    x=riders_costs_df['ridership'],
)
# Results are bound to `t` only to keep the notebook from echoing them.
t = plt.title("Cost of a Route vs. Number of Riders")
t = plt.xlabel("Number of Customers per Weekday")
t = plt.ylabel("Cost per Weekday ($)")
How are routes, trips, and types of vehicles related?
# Route type codes,
# from https://developers.google.com/transit/gtfs/reference#routes_fields
TYPE_STREETCAR = 0
TYPE_SUBWAY = 1
TYPE_BUS = 3

# Count the routes of each vehicle type; routes with any other type code
# are not counted anywhere.
buses = sum(1 for r in routes_list.values() if r.route_type == TYPE_BUS)
subways = sum(1 for r in routes_list.values() if r.route_type == TYPE_SUBWAY)
streetcars = sum(
    1 for r in routes_list.values() if r.route_type == TYPE_STREETCAR
)

print("Bus routes: %s" % buses)
print("Subway routes: %s" % subways)
print("Streetcar routes: %s" % streetcars)
Bus routes: 166 Subway routes: 4 Streetcar routes: 12
# Pie chart: share of routes operated by each vehicle type.
plt.figure(figsize=(6, 6))
p = plt.pie(
    (buses, streetcars, subways),
    labels=("Bus Routes", "Streetcar Routes", "Subway Routes"),
    startangle=90,
    autopct="%1.1f%%",
)
t = plt.title("Vehicles in the TTC by Number of Routes")

# Total the trips per vehicle type. Each route's trip count is looked up by
# its short name in the (short name, trip count) pairs held in trips_freq.
trips_map = dict(trips_freq)
trips_by_type = {TYPE_SUBWAY: 0, TYPE_BUS: 0, TYPE_STREETCAR: 0}
for route_info in routes_list.values():
    # Routes whose type is none of the three known codes are skipped.
    if route_info.route_type in trips_by_type:
        trips_by_type[route_info.route_type] += trips_map[route_info.route_short_name]
subways_trips = trips_by_type[TYPE_SUBWAY]
buses_trips = trips_by_type[TYPE_BUS]
streetcars_trips = trips_by_type[TYPE_STREETCAR]

print("Bus trips: %s" % buses_trips)
print("Subway trips: %s" % subways_trips)
print("Streetcar trips: %s" % streetcars_trips)

# Pie chart: share of trips run by each vehicle type.
plt.figure(figsize=(6, 6))
p = plt.pie(
    (buses_trips, streetcars_trips, subways_trips),
    labels=("Bus Trips", "Streetcar Trips", "Subway Trips"),
    startangle=90,
    autopct="%1.1f%%",
)
t = plt.title("Vehicles in the TTC by number of trips")
Bus trips: 101427 Subway trips: 7586 Streetcar trips: 16506
# Average number of trips per route for each vehicle type.
streetcar_trip_route = streetcars_trips / streetcars
buses_trip_route = buses_trips / buses
subway_trip_route = subways_trips / subways
print("Streetcars: %s trips per route" % streetcar_trip_route)
print("Buses: %s trips per route" % buses_trip_route)
print("Subways: %s trips per route" % subway_trip_route)

# Bar chart of the three averages, one bar per vehicle type.
index = np.arange(3)
bar_width = .35
# Centre each bar on its x position explicitly so the tick labels below sit
# under the middle of the bars regardless of the library's default alignment
# (the old "index + bar_width / 2" ticks only lined up with edge-aligned bars).
rect_buses = plt.bar(
    index,
    (buses_trip_route, streetcar_trip_route, subway_trip_route),
    bar_width,
    align='center',
)
plt.xlabel("Vehicle Type")
plt.ylabel("Average Trips per Route")
plt.title("Average Trips per Route by Vehicle Type")
ticks = plt.xticks(index, ("Buses", "Streetcars", "Subways"))
Streetcars: 1375.5 trips per route Buses: 611.0060240963855 trips per route Subways: 1896.5 trips per route
The TTC has over 10,000 stop locations. Where are they?
# Load every stop from the schedule's stops.txt into a dict keyed by stop id.
stops_list = {}
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for files passed to csv.reader.
with open('schedules/stops.txt', newline='') as stops_file:
    stops_csv = csv.reader(stops_file)
    # Skip the header row through the reader itself so the reader and the
    # underlying file never disagree about the current position.
    next(stops_csv)
    for stop in stops_csv:
        if not stop:
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            continue
        # Columns used: 0 = id, 2 = name, 4 = latitude, 5 = longitude.
        stop_id = int(stop[0])
        stop_name = stop[2]
        stop_lat = float(stop[4])
        stop_lon = float(stop[5])
        stops_list[stop_id] = Stop(stop_id, stop_name, stop_lat, stop_lon)
# Collect the coordinates of every loaded stop, in the same order for both lists.
all_stops = list(stops_list.values())
lats = [stop.stop_lat for stop in all_stops]
longs = [stop.stop_lon for stop in all_stops]

# Scatter the stops with longitude on x and latitude on y.
plt.figure(figsize=(8, 6))
plt.scatter(longs, lats, marker='.', c='b')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
t = plt.title('Geographic Distribution of All Stops')
# Series views of the stop coordinates for histogramming and summaries.
longs_pd = pd.Series(longs)
lats_pd = pd.Series(lats)

# Histogram of stop longitudes in 25 bins spanning the data's own range.
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist(longs_pd, align='mid', bins=25)
plt.xlabel('Longitude')
plt.ylabel('Number of stops')
plt.title('Longitude of Stops')
# Label each bin's left edge with its true longitude. The previous labels
# subtracted 0.5 from every edge, i.e. showed longitudes half a degree away
# from where the ticks actually were.
plt.xticks(bins[:-1], ['%.3f' % edge for edge in bins[:-1]], rotation=70)
# Leave room below the axes for the rotated labels.
plt.subplots_adjust(bottom=0.2)
print("Longitudes: ", longs_pd.describe())
Longitudes: count 10738.000000 mean -79.399878 std 0.110536 min -79.650347 25% -79.489651 50% -79.400623 75% -79.313075 max -79.123046 dtype: float64
# Histogram of stop latitudes in 25 bins spanning the data's own range.
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist(lats_pd, align='mid', bins=25)
plt.xlabel('Latitude')
plt.ylabel('Number of stops')
plt.title('Latitude of Stops')
# Label each bin's left edge with its true latitude. The previous labels
# subtracted 0.5 from every edge, so they were half a degree off the ticks.
plt.xticks(bins[:-1], ['%.3f' % edge for edge in bins[:-1]], rotation=70)
# Leave room below the axes for the rotated labels.
plt.subplots_adjust(bottom=0.2)
print("Latitudes: ", lats_pd.describe())
Latitudes: count 10738.000000 mean 43.725382 std 0.060305 min 43.591810 25% 43.677031 50% 43.721847 75% 43.771064 max 43.914362 dtype: float64
What does real-time vehicle location reveal?
# Drop the single largest radius value (first occurrence) and its matching
# angle so that one spike does not dominate the polar plot.
spike_idx = r.index(max(r))
r.pop(spike_idx)
theta.pop(spike_idx)

# Polar plot drawn compass-style: zero at the top ('N'), angles increasing
# clockwise.
ax = plt.subplot(111, polar=True)
ax.set_theta_zero_location('N')
ax.set_theta_direction(-1)
ax.grid(True)
ax.plot(theta, r)
ti = plt.title("Headings of TTC vehicles.")
# Summary statistics of vehicle headings, computed from a frequency table:
# each row of headings_dataframe holds a 'heading' value and the 'count' of
# observations with that heading. Column-wise pandas operations replace the
# earlier row-by-row loops.
heading_counts = headings_dataframe['count']
heading_values = headings_dataframe['heading']
total_headings = heading_counts.sum()

# median: first heading (in row order) at which the running count reaches
# half of all observations. Assumes rows are ordered by heading -- TODO confirm.
median_idx = total_headings // 2
median = heading_values[heading_counts.cumsum() >= median_idx].iloc[0]

# mean: count-weighted average of the heading values
mean = (heading_counts * heading_values).sum() / total_headings

# standard deviation: count-weighted population standard deviation
stdev = math.sqrt(
    (heading_counts * (heading_values - mean) ** 2).sum() / total_headings
)

# mode: every heading whose count equals the largest count
max_heading = heading_counts.max()
modes = heading_values[heading_counts == max_heading].tolist()

print("Mean: %.3f" % mean)
print("Standard deviation: %.3f" % stdev)
print("Median: %.0f" % median)
# Print all modes (comma separated) rather than only the first one; for a
# single mode the output is unchanged.
print("Modes: %s" % ", ".join("%.0f" % mode for mode in modes))
Mean: 197.525 Standard deviation: 102.488 Median: 179 Modes: 253
What happens to the number of active vehicles over time?
# Load the (timestamp, vehicle count) samples, skipping the first entry.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that were produced by this project.
with open('time_dist.pickle', 'rb') as time_dist_pickle:
    time_dist = pickle.load(time_dist_pickle)[1:]
times, dist = zip(*time_dist)
# note that the timestamp from the pickle file already has timezone data
date_times = [datetime.fromtimestamp(x) for x in times]

fig, axes = plt.subplots()
fig.set_size_inches(8, 6)
days = DayLocator()
hours = HourLocator(interval=8)
formatter = DateFormatter('%b %d')
# Axes.plot handles datetime x values directly; plot_date is deprecated in
# current matplotlib and added nothing here because the locators and the
# formatter are set explicitly below.
axes.plot(date_times, dist, 'b-')
# Major tick per day (labelled "Mon DD"), minor tick every 8 hours.
axes.xaxis.set_major_locator(days)
axes.xaxis.set_minor_locator(hours)
axes.xaxis.set_major_formatter(formatter)
axes.set_xlabel('Date and Time')
axes.set_ylabel('Number of vehicles on duty')
axes.set_title('Number of TTC vehicles on duty from April 27th to May 4th')
plt.show()
What velocities do TTC vehicles travel at, and what are some factors that affect them?
# Keep only velocities inside the outlier fences (both fences inclusive).
outlier_b, outlier_t = outlier_range(velocities_pd['velocity'])
velocity_column = pd.Series(velocities_pd['velocity'])
# Vectorised mask instead of a Python-level loop over every sample; NaN
# values compare False on both sides and are dropped, as before.
in_range = (velocity_column >= outlier_b) & (velocity_column <= outlier_t)
# Rebuild from the raw values so the result has a fresh 0..n-1 index and no
# name, exactly like a series constructed from a plain list.
velocities_series = pd.Series(velocity_column[in_range].values)

# Histogram of the retained velocities: 25 bins over 0-20 m/s.
n, bins, patchs = plt.hist(velocities_series, bins=25, range=(0, 20))
t = plt.title('Aggregated TTC Vehicles Velocities')
plt.xlabel('Velocity (m/s)')
plt.ylabel("Number of Vehicle-Time Points")

print(velocities_series.describe())
print("IQR:", interquartile_range(velocities_series))
print("Pearson's Index: ", pearsons_index(velocities_series))
print("Range: ", data_range(velocities_series))
count 8617369.000000 mean 4.550967 std 3.079863 min 0.000000 25% 2.187578 50% 4.475319 75% 6.643428 max 13.522408 dtype: float64 IQR: 4.45584987475 Pearson's Index: 0.0736866892983 Range: 13.5224079069
How does velocity of TTC vehicles change over time?
# velocity_moving_average[0] holds timestamps that are divided by 1000 before
# conversion (presumably milliseconds -- TODO confirm); [1] holds the
# matching average velocities.
date_times_gen = [datetime.fromtimestamp(x // 1000) for x in velocity_moving_average[0]]

# Average velocity across the whole measurement period.
fig, axes = plt.subplots()
fig.set_size_inches(8, 6)
days = DayLocator(interval=2)
formatter = DateFormatter('%b %d')
# Axes.plot handles datetime x values directly; plot_date is deprecated in
# current matplotlib and the locator/formatter are set explicitly below.
axes.plot(date_times_gen, velocity_moving_average[1], 'b-')
# Major tick every second day, labelled "Mon DD".
axes.xaxis.set_major_locator(days)
axes.xaxis.set_major_formatter(formatter)
axes.set_xlabel('Date')
axes.set_ylabel('Average Velocity (m/s)')
axes.set_title('Velocity of TTC Vehicles Measured Over Time')
plt.show()

# Same velocities folded onto a single day: keep only the time-of-day part
# of each timestamp and plot the samples as points.
times_gen = [x.time() for x in date_times_gen]
fig, axes = plt.subplots()
fig.set_size_inches(8, 6)
axes.plot(times_gen, velocity_moving_average[1], 'b.')
axes.set_xlabel('Time')
axes.set_ylabel('Average Velocity (m/s)')
axes.set_title('Velocity of TTC Vehicles Measured Over a Day')
plt.show()
Is there a correlation between velocity and busyness? (Measured by number of active vehicles.)
# bus_vel[0] is used as the number of active vehicles and bus_vel[1] as the
# corresponding average velocity.
active_counts, average_velocities = bus_vel[0], bus_vel[1]

# Scatter of average velocity against the number of active vehicles.
p = plt.plot(active_counts, average_velocities, 'b.')
plt.xlabel("Number of Active Vehicles")
plt.ylabel("Average Velocity of Active Vehicles (m/s)")
t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles")

# Regress velocity (y) on the number of active vehicles (x) and print the
# fitted model's summary. `ols` is provided outside this section.
velocity_active_df = pd.DataFrame({"active": active_counts, "vel": average_velocities})
res = ols(x=velocity_active_df['active'], y=velocity_active_df['vel'])
print(res)
-------------------------Summary of Regression Analysis------------------------- Formula: Y ~ <x> + <intercept> Number of Observations: 1273 Number of Degrees of Freedom: 2 R-squared: 0.2501 Adj R-squared: 0.2495 Rmse: 0.6348 F-stat (1, 1271): 423.9162, p-value: 0.0000 Degrees of Freedom: model 1, resid 1271 -----------------------Summary of Estimated Coefficients------------------------ Variable Coef Std Err t-stat p-value CI 2.5% CI 97.5% -------------------------------------------------------------------------------- x -0.0007 0.0000 -20.59 0.0000 -0.0008 -0.0007 intercept 5.4518 0.0381 143.14 0.0000 5.3771 5.5264 ---------------------------------End of Summary---------------------------------
# Scatter of average velocity against the number of active vehicles with the
# regression line drawn on top. The plt.hold(True)/plt.hold(False) calls were
# dropped: pyplot.hold no longer exists in matplotlib 3.x, and successive
# plot calls already draw onto the same axes.
p = plt.plot(bus_vel[0], bus_vel[1], 'b.')
plt.xlabel("Number of Active Vehicles")
plt.ylabel("Average Velocity of Active Vehicles (m/s)")
t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles")
# Slope and intercept as printed (rounded) in the regression summary.
m, b = -0.0007, 5.4518
# Evaluate the line y = m*x + b at x = 0, 100, ..., 1900.
x = np.arange(0, 2000, 100)
p = plt.plot(x, np.poly1d((m, b))(x), '-k')