%matplotlib inline import csv, math, pickle import pandas as pd import numpy as np import matplotlib.pyplot as plt from classes import Route, Trip, Stop # just classes to store the data from pandas.stats.api import ols import pymongo from pymongo import MongoClient, ASCENDING, DESCENDING client = MongoClient() db = client.datasummative # my name for the database routes_list = {} with open('schedules/routes.txt') as routes_file: routes_csv = csv.reader(routes_file) next(routes_csv) # skip header for route in routes_csv: route_id =int(route[0]) route_short_name = route[2] route_long_name = route[3] route_type = int(route[5]) routes_list[route_id] = Route(route_id, route_short_name, route_long_name, route_type) with open('schedules/trips.txt') as trips_file: trips_csv = csv.reader(trips_file) next(trips_file) for trip in trips_csv: route_id = int(trip[0]) service_id =int(trip[1]) trip_id = int(trip[2]) trip_headsign = trip[3] route = routes_list[route_id] route.trips[trip_id] = Trip(trip_id, service_id, trip_headsign) ## Distribution of trips vs routes trips_freq = [] for route in routes_list.values(): trips_freq.append((route.route_short_name, len(route.trips))) with open('trips_routes.pkl', 'wb') as trips_routes_file: pickle.dump(trips_freq, trips_routes_file) trips_series = pd.Series([x[1] for x in trips_freq]) print(trips_series.describe()) def pearsons_index(series): return 3 * (series.mean() - series.median()) / series.std() def data_range(series): return series.quantile(1) - series.quantile(0) def interquartile_range(series): return series.quantile(.75) - series.quantile(.25) def outlier_range(series): iqr = interquartile_range(series) return (series.quantile(.25) - 1.5 * iqr, series.quantile(.75) + 1.5 * iqr) print("Pearson's: %s" % pearsons_index(trips_series)) print("Range: %s" % data_range(trips_series)) print("IQR: %s" % interquartile_range(trips_series)) print("Outliers lie outside: (%s,%s)" % outlier_range(trips_series)) fig, ax = plt.subplots(1,1) n, bins, patches = ax.hist([x[1] for x in trips_freq], bins=25, align='mid', range=(0, 5000)) plt.xlabel('Number of Trips') plt.ylabel('Number of Routes') plt.title('Distribution of trips over routes') plt.xticks(rotation=70) ax.set_xticks(bins[:-1]) plt.xticks(bins[:-1],['%.1f' % (x-0.5) for x in bins[:-1]]) plt.subplots_adjust(bottom=0.2) outlier_r = outlier_range(trips_series) outliers = filter(lambda x: x[1] < outlier_r[0] or x[1] > outlier_r[1], trips_freq) print(list(outliers)) trips_series = pd.Series([x[1] for x in trips_freq if x[1] >= outlier_r[0] and x[1] <= outlier_r[1]]) print(trips_series.describe()) print("Pearson's: %s" % pearsons_index(trips_series)) print("IQR: %s" % interquartile_range(trips_series)) fig, ax = plt.subplots(1,1) n, bins, patches = ax.hist(trips_series, bins=25, align='mid', range=(0, 2000)) plt.xlabel('Number of Trips') plt.ylabel('Number of Routes') plt.title('Distribution of trips over routes') plt.xticks(rotation=70) plt.xticks(bins[:-1],['%.1f' % (x-0.5) for x in bins[:-1]]) plt.subplots_adjust(bottom=0.2) ridership_collection = db.ridership rts = [] riders = [] costs = [] for ridership_row in ridership_collection.find(): rts.append(ridership_row['route']) riders.append(ridership_row['ridership']) costs.append(ridership_row['cost']) riders_costs_df = pd.DataFrame({'route': rts, 'ridership': riders, 'cost': costs}, index=rts) del rts del costs del riders print(ols(x=riders_costs_df['ridership'], y=riders_costs_df['cost'])) # define a convenient way to graph some stuff, find its regression, and graph that too def plot_with_reg(x, y): plt.hold(True) plt.plot(x,y, 'b.') res = ols(y=y, x=x) m = res.beta['x'] b = res.beta['intercept'] x_fit = np.linspace(min(x), max(x)) y_fit = np.poly1d((m,b)) plt.plot(x_fit, y_fit(x_fit), '-k') plt.hold(False) plot_with_reg(x=riders_costs_df['ridership'], y=riders_costs_df['cost']) t = plt.title("Cost of a Route vs. Number of Riders") t = plt.xlabel("Number of Customers per Weekday") t = plt.ylabel("Cost per Weekday ($)") # from https://developers.google.com/transit/gtfs/reference#routes_fields TYPE_SUBWAY = 1 TYPE_STREETCAR = 0 TYPE_BUS = 3 buses = 0 subways = 0 streetcars = 0 for route in routes_list.values(): if route.route_type == TYPE_SUBWAY: subways += 1 elif route.route_type == TYPE_STREETCAR: streetcars += 1 elif route.route_type == TYPE_BUS: buses += 1 print("Bus routes: %s" % buses) print("Subway routes: %s" % subways) print("Streetcar routes: %s" % streetcars) plt.figure(figsize = (6,6)) p = plt.pie((buses, streetcars, subways), labels=("Bus Routes", "Streetcar Routes", "Subway Routes"), startangle=90, autopct="%1.1f%%") t = plt.title("Vehicles in the TTC by Number of Routes") subways_trips = 0 buses_trips = 0 streetcars_trips = 0 trips_map = dict(trips_freq) for route in routes_list: route_type = routes_list[route].route_type route_name = routes_list[route].route_short_name if route_type == TYPE_SUBWAY: subways_trips += trips_map[route_name] elif route_type == TYPE_BUS: buses_trips += trips_map[route_name] elif route_type == TYPE_STREETCAR: streetcars_trips += trips_map[route_name] print("Bus trips: %s" % buses_trips) print("Subway trips: %s" % subways_trips) print("Streetcar trips: %s" % streetcars_trips) plt.figure(figsize=(6,6)) p = plt.pie((buses_trips, streetcars_trips, subways_trips), labels=("Bus Trips", "Streetcar Trips", "Subway Trips"), startangle=90, autopct="%1.1f%%") t = plt.title("Vehicles in the TTC by number of trips") streetcar_trip_route = streetcars_trips / streetcars buses_trip_route = buses_trips / buses subway_trip_route = subways_trips / subways print("Streetcars: %s trips per route" % streetcar_trip_route) print("Buses: %s trips per route" % buses_trip_route) print("Subways: %s trips per route" % subway_trip_route) index = np.arange(3) bar_width = .35 rect_buses = plt.bar(index, (buses_trip_route, streetcar_trip_route, subway_trip_route), bar_width) plt.xlabel("Vehicle Type") plt.ylabel("Average Trips per Route") plt.title("Average Trips per Route by Vehicle Type") ticks = plt.xticks(index + bar_width / 2, ("Buses", "Streetcars", "Subways")) stops_list = {} with open('schedules/stops.txt') as stops_file: stops_csv = csv.reader(stops_file) next(stops_file) for stop in stops_csv: stop_id = int(stop[0]) stop_name = stop[2] stop_lat = float(stop[4]) stop_lon = float(stop[5]) stops_list[stop_id] = Stop(stop_id, stop_name, stop_lat, stop_lon) lats = [x.stop_lat for x in stops_list.values()] longs = [x.stop_lon for x in stops_list.values()] plt.figure(figsize=(8,6)) plt.scatter(longs, lats, marker='.', c='b') plt.xlabel('Longitude') plt.ylabel('Latitude') t = plt.title('Geographic Distribution of All Stops') longs_pd = pd.Series(longs) lats_pd = pd.Series(lats) fig, ax = plt.subplots(1,1) n, bins, patches = ax.hist(longs_pd, align='mid', bins=25) plt.xlabel('Longitude') plt.ylabel('Number of stops') plt.title('Longitude of Stops') plt.xticks(rotation=70) plt.xticks(bins[:-1],['%.3f' % (x-0.5) for x in bins[:-1]]) plt.subplots_adjust(bottom=0.2) print("Longitudes: ", longs_pd.describe()) fig, ax = plt.subplots(1,1) n, bins, patches = ax.hist(lats_pd, align='mid', bins=25) plt.xlabel('Latitude') plt.ylabel('Number of stops') plt.title('Latitude of Stops') plt.xticks(rotation=70) plt.xticks(bins[:-1],['%.3f' % (x-0.5) for x in bins[:-1]]) plt.subplots_adjust(bottom=0.2) print("Latitudes: ", lats_pd.describe()) collection = db.vehicles # the vehicles collection in the database print("Number of points: %s" % collection.count()) print("An arbitrarily selected point: %s" % collection.find_one()) headings = collection.aggregate([{"$group":{ '_id': '$heading', 'count': {'$sum': 1}}}, {"$match": {'_id': {'$gte': 0}}}]) headings = headings['result'] theta = [] r = [] for heading in sorted(headings, key=lambda x: x['_id']): theta.append(heading['_id'] * np.pi / 180) r.append(heading['count']) ax = plt.subplot(111, polar=True) ax.plot(theta, r) ax.grid(True) ax.set_theta_direction(-1) ax.set_theta_zero_location('N') plt.title("Headings of TTC vehicles.") print("Maximum: %s"% max(r)) print("Values on two sides of maximum:",r[r.index(max(r)) -1 ], r[r.index(max(r)) + 1]) print("Angle of maximum (radians): ", theta[r.index(max(r))]) print("Total count: ", sum(r)) spike_idx = r.index(max(r)) del r[spike_idx] del theta[spike_idx] ax = plt.subplot(111, polar=True) ax.plot(theta, r) ax.grid(True) ax.set_theta_direction(-1) ax.set_theta_zero_location('N') ti = plt.title("Headings of TTC vehicles.") headings_dataframe = pd.DataFrame(data={"heading":[x * 180 / np.pi for x in theta], "count": r}) print(headings_dataframe[:10]) total_headings = headings_dataframe['count'].sum() # median median_idx = total_headings // 2 current_idx = 0 for heading in range(len(headings_dataframe)): current_idx += headings_dataframe.iloc[heading]['count'] if current_idx >= median_idx: median = headings_dataframe.iloc[heading]['heading'] break # mean s = 0 for heading in range(len(headings_dataframe)): s += headings_dataframe.iloc[heading]['count'] * headings_dataframe.iloc[heading]['heading'] mean = s / total_headings # standard deviation s = 0 for heading in headings_dataframe.iterrows(): n = heading[1]['count'] h = heading[1]['heading'] s += n * (h - mean) ** 2 stdev = math.sqrt((1/total_headings) * s) # mode modes = [] max_heading = max(headings_dataframe['count']) modes = [x[1]['heading'] for x in headings_dataframe.iterrows() if x[1]['count'] == max_heading] print("Mean: %.3f"% mean) print("Standard deviation: %.3f" % stdev) print("Median: %.0f"% median) print("Modes: %.0f" % modes[0]) import pickle from matplotlib.dates import HourLocator, DateFormatter, DayLocator from datetime import datetime from pytz import timezone import pytz tz = timezone('America/Toronto') with open('time_dist.pickle', 'rb') as time_dist_pickle: time_dist = pickle.load(time_dist_pickle)[1:] times, dist = zip(*time_dist) # note that the timestamp from the pickle file already has timezone data date_times = [datetime.fromtimestamp(x) for x in times] fig, axes = plt.subplots() fig.set_size_inches(8,6) days = DayLocator() hours = HourLocator(interval=8) formatter = DateFormatter('%b %d') axes.plot_date(date_times,dist, 'b-') axes.xaxis.set_major_locator(days) axes.xaxis.set_minor_locator(hours) axes.xaxis.set_major_formatter(formatter) axes.set_xlabel('Date and Time') axes.set_ylabel('Number of vehicles on duty') axes.set_title('Number of TTC vehicles on duty from April 27th to May 4th') plt.show() #velocity_collection = db.velocity #velocities = [] #for idx, velocity_point in enumerate(velocity_collection.find(sort=[('timestamp_orig', ASCENDING)])): # if idx % 1000 == 0: # print(idx, end='\r') # timestamp = velocity_point['timestamp_orig'] # v = math.sqrt(velocity_point['vx'] ** 2 + velocity_point['vy'] ** 2) # velocities.append((timestamp, v)) with open('velocities.pkl', 'rb') as velocities_file: velocities = pickle.load(velocities_file) date_times, dist = zip(*velocities) del velocities n, bins, patchs = plt.hist(dist, bins=25, range=(0,20)) t = plt.title('Aggregated TTC Vehicles Velocities') plt.xlabel('Velocity (m/s)') plt.ylabel("Number of Vehicle-Time Points") velocities_pd = pd.DataFrame({'date_times': date_times, 'velocity': dist}) print(velocities_pd['velocity'].describe()) print("IQR:", interquartile_range(velocities_pd['velocity'])) print("Pearson's Index: ", pearsons_index(velocities_pd['velocity'])) print("Range: ", data_range(velocities_pd['velocity'])) outlier_b, outlier_t = outlier_range(velocities_pd['velocity']) velocities_series = pd.Series( [x for x in velocities_pd['velocity'] if x >= outlier_b and x <= outlier_t]) n, bins, patchs = plt.hist(velocities_series, bins=25, range=(0,20)) t = plt.title('Aggregated TTC Vehicles Velocities') plt.xlabel('Velocity (m/s)') plt.ylabel("Number of Vehicle-Time Points") print(velocities_series.describe()) print("IQR:", interquartile_range(velocities_series)) print("Pearson's Index: ", pearsons_index(velocities_series)) print("Range: ", data_range(velocities_series)) velocity_moving_average = ([],[]) i = 0 while i < len(velocities_pd): start_point = velocities_pd.iloc[i] t_start = start_point['date_times'] t_end = t_start + 1000 * 60 * 15 # fifteen minute moving averages s = 0 n = 0 while i < len(velocities_pd) and velocities_pd['date_times'][i] < t_end: this_point = velocities_pd.iloc[i] v = this_point['velocity'] if v <= outlier_t and v >= outlier_b: s += v n += 1 i += 100 # sample every 100th point for speed if n != 0: avg_vel = s / n velocity_moving_average[0].append(t_start) velocity_moving_average[1].append(avg_vel) i += 1 date_times_gen = [datetime.fromtimestamp(x // 1000) for x in velocity_moving_average[0]] fig, axes = plt.subplots() fig.set_size_inches(8,6) days = DayLocator(interval=2) formatter = DateFormatter('%b %d') axes.plot_date(date_times_gen,velocity_moving_average[1],'b-') axes.xaxis.set_major_locator(days) axes.xaxis.set_major_formatter(formatter) axes.set_xlabel('Date') axes.set_ylabel('Average Velocity (m/s)') axes.set_title('Velocity of TTC Vehicles Measured Over Time') plt.show() times_gen = [x.time() for x in date_times_gen] fig, axes = plt.subplots() fig.set_size_inches(8,6) axes.plot(times_gen,velocity_moving_average[1], 'b.') axes.set_xlabel('Time') axes.set_ylabel('Average Velocity (m/s)') axes.set_title('Velocity of TTC Vehicles Measured Over a Day') plt.show() active_vehicles = db.active_vehicles bus_vel = ([], []) for i in range(len(velocity_moving_average[0])): ts = velocity_moving_average[0][i] // 1000 vel = velocity_moving_average[1][i] number_active = active_vehicles.find_one({'timestamp': {'$gte': ts}}, sort=[('timestamp',ASCENDING)]) n = number_active['count'] bus_vel[0].append(n) bus_vel[1].append(vel) p = plt.plot(bus_vel[0], bus_vel[1], 'b.') plt.xlabel("Number of Active Vehicles") plt.ylabel("Average Velocity of Active Vehicles (m/s)") t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles") velocity_active_df = pd.DataFrame({"active": bus_vel[0], "vel": bus_vel[1]}) res = ols(y=velocity_active_df['vel'], x = velocity_active_df['active']) print(res) plt.hold(True) p = plt.plot(bus_vel[0], bus_vel[1], 'b.') plt.xlabel("Number of Active Vehicles") plt.ylabel("Average Velocity of Active Vehicles (m/s)") t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles") m, b = -0.0007, 5.4518 x = np.arange(0, 2000, 100) p = plt.plot(x, np.poly1d((m, b))(x), '-k') plt.hold(False)