# Exploratory analysis of TTC vehicle punctuality (notebook export).
#
# Sections, in order: scheduled stop-time distributions, punctuality
# distribution and summary statistics, per-route median punctuality, and
# regressions of punctuality against velocity, trip count, ridership and
# weather.  Inputs are local pickle caches plus the `datasummative` MongoDB
# database reached through a default-constructed MongoClient.
#
# NOTE: pickle.load can execute arbitrary code embedded in the file it reads;
# only run this against cache files you generated yourself.

import math  # only referenced by the commented-out velocity cache builder
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pymongo import MongoClient, ASCENDING, DESCENDING
from pytz import timezone

try:
    # Equivalent of the `%matplotlib inline` magic, written as plain Python
    # so the file also parses outside IPython/Jupyter.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # No IPython shell (plain `python`): keep matplotlib's default backend.
    pass

# Punctuality values are divided by this before being labelled as minutes.
MS_PER_MINUTE = 1000 * 60


# ---------------------------------------------------------------------------
# Scheduled stop-time distributions
# ---------------------------------------------------------------------------

with open('schedules_times.pickle', 'rb') as schedule_times_file:
    weekday_stop_times, sat_stop_times, sun_stop_times = pickle.load(
        schedule_times_file)


def plot_schedule_dist(times, schedule_str="Weekday"):
    """Plot a 100-bin histogram of stop times as a line on a new figure.

    Args:
        times: sequence of numeric timestamps; each bin edge is integer-divided
            by 1000 and interpreted as a UTC POSIX timestamp.
        schedule_str: schedule name inserted into the plot title.
    """
    vals, bin_edges = np.histogram(times, 100)
    utc = timezone('UTC')
    # np.histogram returns one more edge than bins; use the left edge of each
    # bin as its x position.
    left_edges = [datetime.fromtimestamp(edge // 1000, tz=utc)
                  for edge in bin_edges[:-1]]
    fig, ax = plt.subplots()
    ax.plot(left_edges, vals)
    plt.title("Distribution of %s Stop Times" % schedule_str)
    plt.xlabel("Time")
    plt.ylabel("Number of Stops")
    fig.autofmt_xdate()


plot_schedule_dist(weekday_stop_times, "Weekday")
plot_schedule_dist(sat_stop_times, 'Saturday')
plot_schedule_dist(sun_stop_times, "Sunday")


# ---------------------------------------------------------------------------
# Punctuality distribution
# ---------------------------------------------------------------------------

client = MongoClient()
db = client.datasummative
punctuality_collection = db.punctuality

# The projection is passed positionally: the keyword was `fields` in
# PyMongo 2.x and `projection` from 3.0 on, but the position is the same.
punctualities = [
    punc_row['punctuality'] / MS_PER_MINUTE
    for punc_row in punctuality_collection.find({}, {'punctuality': 1})
]
punctualities_series_orig = pd.Series(punctualities)

plt.figure()
n, bins, patches = plt.hist(punctualities_series_orig, bins=300)
t = plt.title("Punctuality of TTC Vehicles")
t = plt.xlabel("Minutes Late")
t = plt.ylabel("Number of Instances")


def pearsons_index(series):
    """Return 3 * (mean - median) / standard deviation of *series*."""
    return 3 * (series.mean() - series.median()) / series.std()


def data_range(series):
    """Return the spread between the 0th and 100th percentiles of *series*."""
    return series.quantile(1) - series.quantile(0)


def interquartile_range(series):
    """Return the distance between the 25th and 75th percentiles."""
    return series.quantile(.75) - series.quantile(.25)


def outlier_range(series):
    """Return the (lower, upper) fences at 1.5 * IQR beyond the quartiles."""
    iqr = interquartile_range(series)
    return (series.quantile(.25) - 1.5 * iqr,
            series.quantile(.75) + 1.5 * iqr)


print(punctualities_series_orig.describe())
print("Pearson's index: ", pearsons_index(punctualities_series_orig))
print("Interquartile range: ", interquartile_range(punctualities_series_orig))
print("Data range: ", data_range(punctualities_series_orig))

outlier_r = outlier_range(punctualities_series_orig)
# Lower bound is the computed fence; the upper bound is a fixed 40 minutes
# rather than outlier_r[1].
punctualities_series = pd.Series(
    [x for x in punctualities if outlier_r[0] <= x <= 40])

print(punctualities_series.describe())
print("Pearson's index: ", pearsons_index(punctualities_series))
print("Interquartile range: ", interquartile_range(punctualities_series))
print("Data range: ", data_range(punctualities_series))

plt.figure()
n, bins, patches = plt.hist(punctualities_series, bins=20)
t = plt.title("Punctuality of TTC Vehicles")
t = plt.xlabel("Minutes Late")
t = plt.ylabel("Number of Instances")

plt.figure()
# Positional arguments are notch=True and sym='+'.
plt.boxplot(punctualities_series, True, '+', vert=False, whis=np.inf)
plt.xlim(0, 50)
t = plt.title("Punctuality of TTC Vehicles")
t = plt.xlabel("Minutes Late")
t = plt.yticks((1,), ('TTC Vehicles',), rotation=90)


# ---------------------------------------------------------------------------
# Median punctuality per route
# ---------------------------------------------------------------------------

# Kept for reference: how median_punctuality.pkl was built.
# rt_and_punc = {}
# for punc_row in punctuality_collection.find(fields={'punctuality': 1,
#                                                     'rt_tag': 1}):
#     rt_tag = punc_row['rt_tag']
#     if rt_tag in rt_and_punc:
#         rt_and_punc[rt_tag].append(punc_row['punctuality'] / (1000 * 60))
#     else:
#         rt_and_punc[rt_tag] = [punc_row['punctuality'] / (1000 * 60)]
# ## convert to panda series, then find median
# rt_and_punc_medians = {}
# for rt in rt_and_punc:
#     s = pd.Series(rt_and_punc[rt])
#     rt_and_punc_medians[rt] = s.median()

with open('median_punctuality.pkl', 'rb') as median_punc_file:
    rt_and_punc_medians = pickle.load(median_punc_file)

N = len(rt_and_punc_medians)
ind = np.arange(N)
# d[0] holds the route keys, d[1] the matching median values.
d = list(zip(*rt_and_punc_medians.items()))

plt.figure()
p = plt.bar(ind, d[1])
t = plt.xlabel('Routes')
t = plt.ylabel('Median Late Time (minutes)')
t = plt.xticks(ind + 0.4, d[0], rotation=90)
t = plt.title('Median Late Times of Routes')

plt.figure()
p = plt.hist(d[1], bins=30)
t = plt.xlabel('Median Late Time (minutes)')
t = plt.title('Median Late Times of Routes')


# ---------------------------------------------------------------------------
# Punctuality vs. velocity
# ---------------------------------------------------------------------------

# Kept for reference: how median_velocities.pkl was built.
# velocity_collection = db.velocity
# routes_velocity = {}
# for velocity in velocity_collection.find(fields={'route_tag': 1,
#                                                  'vx': 1,
#                                                  'vy': 1}):
#     rt_tag = velocity['route_tag']
#     vel = math.hypot(velocity['vx'], velocity['vy'])
#     if rt_tag in routes_velocity:
#         routes_velocity[rt_tag].append(vel)
#     else:
#         routes_velocity[rt_tag] = [vel]
# routes_velocity_median = {}
# for rt in routes_velocity:
#     s = pd.Series(routes_velocity[rt])
#     routes_velocity_median[rt] = s.median()

# with open('median_velocities.pkl', 'wb') as median_vels_file:
#     pickle.dump(routes_velocity_median, median_vels_file)
# with open('median_punctuality.pkl', 'wb') as median_punc_file:
#     pickle.dump(rt_and_punc_medians, median_punc_file)

with open('median_velocities.pkl', 'rb') as median_vels_file:
    routes_velocity_median = pickle.load(median_vels_file)

# Keep only routes present in both the punctuality and velocity caches.
routes = []
puncs = []
velocities = []
for rt in rt_and_punc_medians:
    if rt in routes_velocity_median:
        routes.append(rt)
        puncs.append(rt_and_punc_medians[rt])
        velocities.append(routes_velocity_median[rt])
routes_puncs_df = pd.DataFrame({'route': routes,
                                'punc': puncs,
                                'vel': velocities},
                               index=routes)
del routes, velocities, puncs

plt.figure()
p = plt.scatter(routes_puncs_df['vel'], routes_puncs_df['punc'])
t = plt.title('Median Punctuality vs. Median Velocity of Routes')
t = plt.xlabel('Median Velocity (m/s)')
t = plt.ylabel('Median Punctuality (minutes)')

# NOTE(review): pandas.stats was removed in pandas 0.20, so this import (and
# everything below that uses `ols`) only works on older pandas.  The documented
# replacement is statsmodels' OLS, which would be a new dependency.  The import
# is deliberately left here rather than hoisted so the sections above still
# run when it fails.
from pandas.stats.api import ols

res = ols(y=routes_puncs_df['punc'], x=routes_puncs_df['vel'])
print(res)

plt.figure()
p = plt.scatter(routes_puncs_df['vel'], routes_puncs_df['punc'])
t = plt.title('Median Punctuality vs. Median Velocity of Routes')
t = plt.xlabel('Median Velocity (m/s)')
t = plt.ylabel('Median Punctuality (minutes)')
# Hard-coded slope and intercept; presumably copied from the regression
# printed above -- TODO confirm they still match the data.
m, b = 0.8392, 3.9364
fit_x = np.arange(1, 11)
p = plt.plot(fit_x, np.poly1d((m, b))(fit_x), '-r')


# ---------------------------------------------------------------------------
# Punctuality vs. number of trips
# ---------------------------------------------------------------------------

with open('trips_routes.pkl', 'rb') as trips_routes_file:
    routes_trips = pickle.load(trips_routes_file)

rts = []
num_trips_s = []
for rt, num_trips in routes_trips:
    rts.append(rt)
    num_trips_s.append(num_trips)
routes_trips_df = pd.DataFrame({'route': rts, 'num_trips': num_trips_s},
                               index=rts)
del rts, num_trips_s, routes_trips

puncs_trips_df = routes_puncs_df.merge(routes_trips_df,
                                       right_index=True,
                                       left_index=True)
res = ols(y=puncs_trips_df['punc'], x=puncs_trips_df['num_trips'])
print(res)


def plot_with_reg(x, y):
    """Plot *y* against *x* as blue dots plus the fitted OLS line in black.

    Draws on the current axes, so callers that want a separate plot create a
    figure first.  The line is evaluated between min(x) and max(x) using the
    'x' and 'intercept' coefficients of the `ols` result.

    Args:
        x: independent variable values.
        y: dependent variable values.
    """
    # No plt.hold() calls: the function was removed in Matplotlib 3.0 and
    # successive plot calls already overlay on the same axes by default.
    plt.plot(x, y, 'b.')
    res = ols(y=y, x=x)
    m = res.beta['x']
    b = res.beta['intercept']
    x_fit = np.linspace(min(x), max(x))
    y_fit = np.poly1d((m, b))
    plt.plot(x_fit, y_fit(x_fit), '-k')


plt.figure()
plot_with_reg(puncs_trips_df['num_trips'], puncs_trips_df['punc'])
t = plt.title('Number of Trips vs. Punctuality')
t = plt.xlabel('Number of Trips (per six weeks)')
t = plt.ylabel('Punctuality (minutes)')
t = plt.xlim(0, 5000)
t = plt.ylim(0, 40)


# ---------------------------------------------------------------------------
# Punctuality and velocity vs. ridership
# ---------------------------------------------------------------------------

ridership_collection = db.ridership
rts = []
costs = []
riders = []
for r in ridership_collection.find():
    rts.append(r['route'])
    costs.append(r['cost'])
    riders.append(r['ridership'])
ridership_df = pd.DataFrame({'route': rts, 'cost': costs, 'ridership': riders},
                            index=rts)
ridership_punc_df = routes_puncs_df.merge(ridership_df,
                                          right_index=True,
                                          left_index=True)
del rts, costs, riders

print(ridership_punc_df.shape)
print(ols(x=ridership_punc_df['ridership'], y=ridership_punc_df['punc']))

plt.figure()
plot_with_reg(ridership_punc_df['ridership'], ridership_punc_df['punc'])
t = plt.ylim(0, 30)
t = plt.title("Punctuality vs. Ridership")
t = plt.xlabel("Weekday Ridership")
t = plt.ylabel("Punctuality (minutes)")

print(ols(x=ridership_punc_df['ridership'], y=ridership_punc_df['vel']))

plt.figure()
plot_with_reg(x=ridership_punc_df['ridership'], y=ridership_punc_df['vel'])
t = plt.ylabel("Median Velocity (m/s)")
t = plt.xlabel("Weekday Ridership")
t = plt.title("Velocity vs. Ridership")


# ---------------------------------------------------------------------------
# Punctuality vs. weather
# ---------------------------------------------------------------------------

# Kept for reference: how weather_punc.pkl was built.
# weather_collection = db.weather
# ts = []
# ps = []
# temps = []
# pressures = []
# for punctuality_row in punctuality_collection.find():
#     t = punctuality_row['datetime_real']
#     p = punctuality_row['punctuality'] / (1000* 60)
#     if p > 40:
#         continue
#     weather_row = weather_collection.find_one({'datetime': {'$lte': t}},
#                                               sort=[('datetime', DESCENDING)])
#     ts.append(t)
#     ps.append(p)
#     temps.append(weather_row['temp'])
#     pressures.append(weather_row['pressure'])
# di = {'time': ts, 'punc': ps, 'temp': temps, 'pressure': pressures}
# with open('weather_punc.pkl', 'wb') as weather_punc_file:
#     pickle.dump(di, weather_punc_file)
# del ts
# del ps
# del temps
# del pressures
# del di

with open('weather_punc.pkl', 'rb') as weather_punc_file:
    di = pickle.load(weather_punc_file)
weather_punc_df = pd.DataFrame(di, index=di['time'])
del di

print(ols(x=weather_punc_df['temp'], y=weather_punc_df['punc']))

plt.figure()
plot_with_reg(weather_punc_df['temp'], weather_punc_df['punc'])
t = plt.title('Punctuality and Temperature')
t = plt.xlabel('Temperature (degrees Celsius)')
t = plt.ylabel('Punctuality (minutes)')

print(ols(x=weather_punc_df['pressure'], y=weather_punc_df['punc']))

plt.figure()
plot_with_reg(weather_punc_df['pressure'], weather_punc_df['punc'])
t = plt.title('Punctuality and Pressure')
t = plt.xlabel('Pressure (kPa)')
t = plt.ylabel('Punctuality (minutes)')