Routes and Trips¶

How many routes have how many trips?

In [4]:

## Distribution of trips vs routes
# Pair every route's short name with the number of trips attached to it.
trips_freq = [(entry.route_short_name, len(entry.trips))
              for entry in routes_list.values()]

# Persist the (short name, trip count) pairs to disk.
with open('trips_routes.pkl', 'wb') as trips_routes_file:
    pickle.dump(trips_freq, trips_routes_file)

# Summary statistics over the trip counts alone.
trip_counts = [pair[1] for pair in trips_freq]
trips_series = pd.Series(trip_counts)
print(trips_series.describe())

count     182.000000
mean      689.664835
std       622.333137
min        13.000000
25%       275.750000
50%       554.000000
75%       945.000000
max      4696.000000
dtype: float64

In [6]:

# Skew and spread measures for the per-route trip counts; the four helpers
# are defined elsewhere in the notebook.
pearson_value = pearsons_index(trips_series)
range_value = data_range(trips_series)
iqr_value = interquartile_range(trips_series)
outlier_bounds = outlier_range(trips_series)  # used below as a 2-item format argument

print("Pearson's: %s" % pearson_value)
print("Range: %s" % range_value)
print("IQR: %s" % iqr_value)
print("Outliers lie outside: (%s,%s)" % outlier_bounds)

Pearson's: 0.653981736266
Range: 4683
IQR: 669.25
Outliers lie outside: (-728.125,1948.875)

In [7]:

# Histogram of trips per route (outliers included): 25 equal bins on [0, 5000].
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist([x[1] for x in trips_freq], bins=25,
                            align='mid', range=(0, 5000))
ax.set_xlabel('Number of Trips')
ax.set_ylabel('Number of Routes')
ax.set_title('Distribution of trips over routes')

# Put one tick on the left edge of every bin and label it with the real edge
# value. The rotation is passed together with the labels so it applies to
# every tick label that is created here.
ax.set_xticks(bins[:-1])
ax.set_xticklabels(['%.0f' % edge for edge in bins[:-1]], rotation=70)

# Leave room at the bottom for the rotated labels.
fig.subplots_adjust(bottom=0.2)

In [8]:

# Routes whose trip count falls strictly outside the outlier bounds.
outlier_r = outlier_range(trips_series)
outliers = (entry for entry in trips_freq
            if entry[1] < outlier_r[0] or entry[1] > outlier_r[1])
# `outliers` is a one-shot iterator; listing it here consumes it.
print(list(outliers))

[('29', 2224), ('32', 2011), ('501', 2705), ('504', 2025), ('509', 2059), ('510', 4696), ('52', 2043), ('BLR', 2181), ('YUS', 2267)]

In [9]:

# Rebuild trips_series keeping only counts inside the (inclusive) outlier
# bounds. NOTE: this rebinds trips_series, so cells above that use it will
# see the filtered data if they are re-run after this one.
kept_counts = [pair[1] for pair in trips_freq
               if outlier_r[0] <= pair[1] <= outlier_r[1]]
trips_series = pd.Series(kept_counts)

print(trips_series.describe())
print("Pearson's: %s" % pearsons_index(trips_series))
print("IQR: %s" % interquartile_range(trips_series))

count     173.000000
mean      597.156069
std       445.952944
min        13.000000
25%       261.000000
50%       525.000000
75%       881.000000
max      1814.000000
dtype: float64
Pearson's: 0.485405940789
IQR: 620.0

In [10]:

# Histogram of trips per route using the current trips_series:
# 25 equal bins on [0, 2000].
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist(trips_series, bins=25,
                            align='mid', range=(0, 2000))
ax.set_xlabel('Number of Trips')
ax.set_ylabel('Number of Routes')
ax.set_title('Distribution of trips over routes')

# Tick the left edge of every bin and label it with the real edge value;
# rotation is passed with the labels so it applies to all of them.
ax.set_xticks(bins[:-1])
ax.set_xticklabels(['%.0f' % edge for edge in bins[:-1]], rotation=70)

# Leave room at the bottom for the rotated labels.
fig.subplots_adjust(bottom=0.2)

Ridership and Costs¶

How is the daily cost of a route related to its ridership?

In [42]:

# Regress weekday cost (y) on weekday ridership (x) and print the fit summary.
ridership_cost_fit = ols(x=riders_costs_df['ridership'],
                         y=riders_costs_df['cost'])
print(ridership_cost_fit)

-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         138
Number of Degrees of Freedom:   2

R-squared:         0.9276
Adj R-squared:     0.9270

Rmse:           7045.0030

F-stat (1, 136):  1741.2502, p-value:     0.0000

Degrees of Freedom: model 1, resid 136

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x     2.0056     0.0481      41.73     0.0000     1.9114     2.0998
     intercept  2478.4971   815.2188       3.04     0.0028   880.6683  4076.3258
---------------------------------End of Summary---------------------------------

In [47]:

# Plot cost against ridership via plot_with_reg (a helper defined elsewhere
# in the notebook), then title and label the figure.
ridership_col = riders_costs_df['ridership']
cost_col = riders_costs_df['cost']
plot_with_reg(x=ridership_col, y=cost_col)

plt.title("Cost of a Route vs. Number of Riders")
plt.xlabel("Number of Customers per Weekday")
# Assigning the last call keeps its return value out of the cell output.
t = plt.ylabel("Cost per Weekday ($)")

Types of Vehicles¶

How are routes, trips, and types of vehicles related?

In [11]:

# from https://developers.google.com/transit/gtfs/reference#routes_fields
TYPE_SUBWAY = 1
TYPE_STREETCAR = 0
TYPE_BUS = 3

# Count routes per vehicle type; routes with any other route_type are ignored.
buses = sum(1 for r in routes_list.values() if r.route_type == TYPE_BUS)
subways = sum(1 for r in routes_list.values() if r.route_type == TYPE_SUBWAY)
streetcars = sum(1 for r in routes_list.values() if r.route_type == TYPE_STREETCAR)

print("Bus routes: %s" % buses)
print("Subway routes: %s" % subways)
print("Streetcar routes: %s" % streetcars)

Bus routes: 166
Subway routes: 4
Streetcar routes: 12

In [12]:

# Share of routes by vehicle type, drawn as a pie chart.
route_counts = (buses, streetcars, subways)
route_labels = ("Bus Routes", "Streetcar Routes", "Subway Routes")

plt.figure(figsize=(6, 6))
p = plt.pie(route_counts, labels=route_labels,
            startangle=90, autopct="%1.1f%%")
t = plt.title("Vehicles in the TTC by Number of Routes")

In [13]:

# Total trips per vehicle type. Trip counts are looked up by route short name
# in trips_map, which is built from the (short name, count) pairs in trips_freq.
trips_map = dict(trips_freq)

trips_by_type = {TYPE_BUS: 0, TYPE_STREETCAR: 0, TYPE_SUBWAY: 0}
for route in routes_list.values():
    # Routes of any other type are skipped without touching trips_map.
    if route.route_type in trips_by_type:
        trips_by_type[route.route_type] += trips_map[route.route_short_name]

buses_trips = trips_by_type[TYPE_BUS]
subways_trips = trips_by_type[TYPE_SUBWAY]
streetcars_trips = trips_by_type[TYPE_STREETCAR]

print("Bus trips: %s" % buses_trips)
print("Subway trips: %s" % subways_trips)
print("Streetcar trips: %s" % streetcars_trips)

# Share of trips by vehicle type, drawn as a pie chart.
trip_labels = ("Bus Trips", "Streetcar Trips", "Subway Trips")
plt.figure(figsize=(6, 6))
p = plt.pie((buses_trips, streetcars_trips, subways_trips), labels=trip_labels,
            startangle=90, autopct="%1.1f%%")
t = plt.title("Vehicles in the TTC by number of trips")

Bus trips: 101427
Subway trips: 7586
Streetcar trips: 16506

In [14]:

# Average number of trips per route for each vehicle type.
streetcar_trip_route = streetcars_trips / streetcars
buses_trip_route = buses_trips / buses
subway_trip_route = subways_trips / subways

print("Streetcars: %s trips per route" % streetcar_trip_route)
print("Buses: %s trips per route" % buses_trip_route)
print("Subways: %s trips per route" % subway_trip_route)

# Bar chart of the three averages, in the order buses, streetcars, subways.
bar_width = .35
index = np.arange(3)
bar_heights = (buses_trip_route, streetcar_trip_route, subway_trip_route)
rect_buses = plt.bar(index, bar_heights, bar_width)
plt.xlabel("Vehicle Type")
plt.ylabel("Average Trips per Route")
plt.title("Average Trips per Route by Vehicle Type")
# Ticks are placed half a bar width to the right of each bar position.
tick_positions = index + bar_width / 2
ticks = plt.xticks(tick_positions, ("Buses", "Streetcars", "Subways"))

Streetcars: 1375.5 trips per route
Buses: 611.0060240963855 trips per route
Subways: 1896.5 trips per route

Geographic Distribution of Bus Stops¶

The TTC has over 10,000 stop locations. Where are they?

In [15]:

# Load every stop from the stops file into a dict keyed by integer stop id.
stops_list = {}

# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are handled by the reader rather than by the file object.
with open('schedules/stops.txt', newline='') as stops_file:
    stops_csv = csv.reader(stops_file)
    # Skip the header through the reader so it is parsed as one CSV record.
    next(stops_csv)
    for stop in stops_csv:
        # Columns read: 0 = id, 2 = name, 4 = latitude, 5 = longitude.
        stop_id = int(stop[0])
        stop_name = stop[2]
        stop_lat = float(stop[4])
        stop_lon = float(stop[5])
        stops_list[stop_id] = Stop(stop_id, stop_name, stop_lat, stop_lon)

# Scatter all stops by longitude (x) and latitude (y).
lats = [x.stop_lat for x in stops_list.values()]
longs = [x.stop_lon for x in stops_list.values()]
plt.figure(figsize=(8,6))
plt.scatter(longs, lats, marker='.', c='b')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
t = plt.title('Geographic Distribution of All Stops')

In [16]:

# Wrap the stop coordinates in Series for describe() and plotting.
longs_pd = pd.Series(longs)
lats_pd = pd.Series(lats)

# Histogram of stop longitudes in 25 equal bins.
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist(longs_pd, align='mid', bins=25)
ax.set_xlabel('Longitude')
ax.set_ylabel('Number of stops')
ax.set_title('Longitude of Stops')

# Tick the left edge of every bin and label it with the real edge value
# (no offset), with the rotation applied to these labels.
ax.set_xticks(bins[:-1])
ax.set_xticklabels(['%.3f' % edge for edge in bins[:-1]], rotation=70)

# Leave room at the bottom for the rotated labels.
fig.subplots_adjust(bottom=0.2)
print("Longitudes: ", longs_pd.describe())

Longitudes:  count    10738.000000
mean       -79.399878
std          0.110536
min        -79.650347
25%        -79.489651
50%        -79.400623
75%        -79.313075
max        -79.123046
dtype: float64

In [17]:

# Histogram of stop latitudes in 25 equal bins.
fig, ax = plt.subplots(1, 1)
n, bins, patches = ax.hist(lats_pd, align='mid', bins=25)
ax.set_xlabel('Latitude')
ax.set_ylabel('Number of stops')
ax.set_title('Latitude of Stops')

# Tick the left edge of every bin and label it with the real edge value
# (no offset), with the rotation applied to these labels.
ax.set_xticks(bins[:-1])
ax.set_xticklabels(['%.3f' % edge for edge in bins[:-1]], rotation=70)

# Leave room at the bottom for the rotated labels.
fig.subplots_adjust(bottom=0.2)
print("Latitudes: ", lats_pd.describe())

Latitudes:  count    10738.000000
mean        43.725382
std          0.060305
min         43.591810
25%         43.677031
50%         43.721847
75%         43.771064
max         43.914362
dtype: float64

Live Vehicle Location Data¶

What does real-time vehicle location reveal?

In [21]:

# Drop the single largest radius (and its matching angle) before plotting.
# The trimmed data goes into new lists, so r and theta are left untouched and
# re-running this cell always removes the same one point.
# NOTE(review): r and theta are no longer mutated; confirm that no later cell
# relied on them having the spike removed.
spike_idx = r.index(max(r))
r_no_spike = r[:spike_idx] + r[spike_idx + 1:]
theta_no_spike = theta[:spike_idx] + theta[spike_idx + 1:]

ax = plt.subplot(111, polar=True)
ax.plot(theta_no_spike, r_no_spike)
ax.grid(True)

# Angles increase clockwise with zero at the top ('N').
ax.set_theta_direction(-1)
ax.set_theta_zero_location('N')
ti = plt.title("Headings of TTC vehicles.")

In [23]:

# Descriptive statistics of a frequency table: each row of headings_dataframe
# holds a 'heading' value and the 'count' of observations with that value.
# Everything is computed column-wise rather than row by row.
heading_counts = headings_dataframe['count']
heading_values = headings_dataframe['heading']
total_headings = heading_counts.sum()

# median: the first heading (in row order) at which the running total of
# counts reaches half of all observations.
# NOTE(review): this is only a true median if rows are sorted by heading.
median_idx = total_headings // 2
reached_half = (heading_counts.cumsum() >= median_idx).values
median = heading_values.values[reached_half.argmax()]

# mean: count-weighted average of the heading values.
mean = (heading_counts * heading_values).sum() / total_headings

# standard deviation: count-weighted, dividing by the total number of
# observations.
weighted_sq_dev = (heading_counts * (heading_values - mean) ** 2).sum()
stdev = math.sqrt(weighted_sq_dev / total_headings)

# mode: every heading whose count equals the largest count.
max_heading = heading_counts.max()
modes = list(heading_values[heading_counts == max_heading])

print("Mean: %.3f"% mean)
print("Standard deviation: %.3f" % stdev)
print("Median: %.0f"% median)
# Print all tied modes; with a single mode the line is the same as before.
print("Modes: %s" % ", ".join("%.0f" % mode for mode in modes))

Mean: 197.525
Standard deviation: 102.488
Median: 179
Modes: 253

Time Distribution¶

What happens to the number of active vehicles over time?

In [25]:

# Load the pickled (timestamp, active-vehicle count) pairs, dropping the
# first entry.
with open('time_dist.pickle', 'rb') as time_dist_pickle:
    time_dist = pickle.load(time_dist_pickle)[1:]

# Split the pairs into parallel sequences of timestamps and counts.
times, dist = zip(*time_dist)
# Per the original author's note, the pickled timestamps already carry the
# timezone information; datetime.fromtimestamp renders them in the local
# time of the machine running the notebook.
date_times = [datetime.fromtimestamp(stamp) for stamp in times]

# Major ticks at each day (labelled e.g. "Apr 28"), minor ticks every 8 hours.
days = DayLocator()
hours = HourLocator(interval=8)
formatter = DateFormatter('%b %d')

fig, axes = plt.subplots()
fig.set_size_inches(8,6)
axes.plot_date(date_times, dist, 'b-')

x_axis = axes.xaxis
x_axis.set_major_locator(days)
x_axis.set_minor_locator(hours)
x_axis.set_major_formatter(formatter)

axes.set_xlabel('Date and Time')
axes.set_ylabel('Number of vehicles on duty')
axes.set_title('Number of TTC vehicles on duty from April 27th to May 4th')
plt.show()

Note that the labels indicate the start of the day, so the data here starts the morning of Monday, April 28th and ends shortly after Monday, May 5th. Also note that the small gaps on April 30th can be attributed to a server error.

Even without looking at the exact times, there are many more buses and streetcars on the road during rush hour on weekdays than there are on weekends, and many more than there are at night. Weekdays follow a bimodal distribution, whereas weekends follow a unimodal distribution. Also, there are consistently more vehicles active during morning rush hour than evening rush hour on weekdays.

At the peak of Saturday service, there are only just as many vehicles as there are during the least busy hours of weekdays. There are even fewer active vehicles during peak hours on Sundays.

Velocity¶

What velocities do TTC vehicles travel at, and what are some factors that affect it?

In [30]:

# Keep only velocities inside the (inclusive) outlier bounds. A boolean mask
# does the selection column-wise; wrapping .values in a new Series gives the
# same unnamed, 0-indexed Series that a list-based construction would.
outlier_b, outlier_t = outlier_range(velocities_pd['velocity'])
all_velocities = velocities_pd['velocity']
within_bounds = (all_velocities >= outlier_b) & (all_velocities <= outlier_t)
velocities_series = pd.Series(all_velocities[within_bounds].values)

# Histogram of the retained velocities: 25 equal bins on [0, 20].
n, bins, patchs = plt.hist(velocities_series, bins=25, range=(0,20))
t = plt.title('Aggregated TTC Vehicles Velocities')
plt.xlabel('Velocity (m/s)')
plt.ylabel("Number of Vehicle-Time Points")

print(velocities_series.describe())
print("IQR:", interquartile_range(velocities_series))
print("Pearson's Index: ", pearsons_index(velocities_series))
print("Range: ", data_range(velocities_series))

count    8617369.000000
mean           4.550967
std            3.079863
min            0.000000
25%            2.187578
50%            4.475319
75%            6.643428
max           13.522408
dtype: float64
IQR: 4.45584987475
Pearson's Index:  0.0736866892983
Range:  13.5224079069

Velocity and Time¶

How does velocity of TTC vehicles change over time?

In [32]:

# velocity_moving_average[0] holds the time stamps and [1] the matching
# averaged velocities.
# The stamps are integer-divided by 1000 before conversion -- presumably they
# are in milliseconds; TODO confirm.
raw_stamps = velocity_moving_average[0]
date_times_gen = [datetime.fromtimestamp(stamp // 1000) for stamp in raw_stamps]

# One major tick every second day, labelled e.g. "Apr 28".
days = DayLocator(interval=2)
formatter = DateFormatter('%b %d')

fig, axes = plt.subplots()
fig.set_size_inches(8,6)
axes.plot_date(date_times_gen, velocity_moving_average[1], 'b-')

x_axis = axes.xaxis
x_axis.set_major_locator(days)
x_axis.set_major_formatter(formatter)

axes.set_xlabel('Date')
axes.set_ylabel('Average Velocity (m/s)')
axes.set_title('Velocity of TTC Vehicles Measured Over Time')
plt.show()

In [33]:

# Discard the date part so that all days overlay on one time-of-day axis.
times_gen = [moment.time() for moment in date_times_gen]
averaged_velocities = velocity_moving_average[1]

fig, axes = plt.subplots()
fig.set_size_inches(8,6)
axes.plot(times_gen, averaged_velocities, 'b.')
axes.set_xlabel('Time')
axes.set_ylabel('Average Velocity (m/s)')
axes.set_title('Velocity of TTC Vehicles Measured Over a Day')
plt.show()

Correlating Velocities and Busyness¶

Is there a correlation between velocity and busyness? (Measured by number of active vehicles.)

In [35]:

# Scatter of average velocity (bus_vel[1]) against the number of active
# vehicles (bus_vel[0]).
active_counts = bus_vel[0]
average_velocities = bus_vel[1]
p = plt.plot(active_counts, average_velocities, 'b.')
plt.xlabel("Number of Active Vehicles")
plt.ylabel("Average Velocity of Active Vehicles (m/s)")
t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles")

In [36]:

# Put both series in one frame, then regress velocity (y) on the number of
# active vehicles (x) and print the fit summary.
velocity_active_columns = {"active": bus_vel[0], "vel": bus_vel[1]}
velocity_active_df = pd.DataFrame(velocity_active_columns)
res = ols(y=velocity_active_df['vel'],
          x=velocity_active_df['active'])
print(res)

-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         1273
Number of Degrees of Freedom:   2

R-squared:         0.2501
Adj R-squared:     0.2495

Rmse:              0.6348

F-stat (1, 1271):   423.9162, p-value:     0.0000

Degrees of Freedom: model 1, resid 1271

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x    -0.0007     0.0000     -20.59     0.0000    -0.0008    -0.0007
     intercept     5.4518     0.0381     143.14     0.0000     5.3771     5.5264
---------------------------------End of Summary---------------------------------

In [37]:

# Scatter of velocity against active vehicles with the fitted regression line.
# NOTE(review): plt.hold only exists in old matplotlib releases (it was removed
# in 3.0); it is kept because the notebook targets that environment.
plt.hold(True)
p = plt.plot(bus_vel[0], bus_vel[1], 'b.')
plt.xlabel("Number of Active Vehicles")
plt.ylabel("Average Velocity of Active Vehicles (m/s)")
t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles")

# Read slope and intercept from the fitted model `res` instead of retyping the
# rounded numbers from its printed summary, so the line always matches the fit.
# NOTE(review): assumes `res` is a pandas OLS result whose `beta` Series is
# indexed by 'x' and 'intercept', as in its printed summary -- confirm.
m, b = res.beta['x'], res.beta['intercept']
x = np.arange(0, 2000, 100)
p = plt.plot(x, np.poly1d((m, b))(x), '-k')
plt.hold(False)