%matplotlib inline
import csv, math, pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from classes import Route, Trip, Stop # just classes to store the data
from pandas.stats.api import ols
import pymongo
from pymongo import MongoClient, ASCENDING, DESCENDING

client = MongoClient()
db = client.datasummative # my name for the database

routes_list = {}
with open('schedules/routes.txt') as routes_file: 
    routes_csv = csv.reader(routes_file)
    next(routes_csv)  # skip header
    for route in routes_csv:
        route_id =int(route[0])
        route_short_name = route[2]
        route_long_name = route[3]
        route_type = int(route[5])
        routes_list[route_id] = Route(route_id, route_short_name,
                                      route_long_name, route_type)

with open('schedules/trips.txt') as trips_file:
    trips_csv = csv.reader(trips_file)
    next(trips_file)
    for trip in trips_csv:
        route_id = int(trip[0])
        service_id =int(trip[1])
        trip_id = int(trip[2])
        trip_headsign = trip[3]
        route = routes_list[route_id]
        route.trips[trip_id] = Trip(trip_id, service_id, trip_headsign)

## Distribution of trips vs routes
trips_freq = []
for route in routes_list.values():
    trips_freq.append((route.route_short_name, len(route.trips)))
with open('trips_routes.pkl', 'wb') as trips_routes_file:
    pickle.dump(trips_freq, trips_routes_file)
trips_series = pd.Series([x[1] for x in trips_freq])
print(trips_series.describe())

def pearsons_index(series):
    return 3 * (series.mean() - series.median()) / series.std()
def data_range(series):
    return series.quantile(1) - series.quantile(0)
def interquartile_range(series):
    return series.quantile(.75) - series.quantile(.25)
def outlier_range(series):
    iqr = interquartile_range(series)
    return (series.quantile(.25) - 1.5 * iqr, series.quantile(.75) + 1.5 * iqr)

print("Pearson's: %s" % pearsons_index(trips_series))
print("Range: %s" % data_range(trips_series))
print("IQR: %s" % interquartile_range(trips_series))
print("Outliers lie outside: (%s,%s)" % outlier_range(trips_series))

fig, ax = plt.subplots(1,1)
n, bins, patches = ax.hist([x[1] for x in trips_freq], bins=25,
                            align='mid', range=(0, 5000))
plt.xlabel('Number of Trips')
plt.ylabel('Number of Routes')
plt.title('Distribution of trips over routes')
plt.xticks(rotation=70)
ax.set_xticks(bins[:-1])
plt.xticks(bins[:-1],['%.1f' % (x-0.5) for x in bins[:-1]])
plt.subplots_adjust(bottom=0.2)

outlier_r = outlier_range(trips_series)
outliers = filter(lambda x: x[1] < outlier_r[0] or x[1] > outlier_r[1], trips_freq)
print(list(outliers))

trips_series = pd.Series([x[1] for x in trips_freq if x[1] >= outlier_r[0] and x[1] <= outlier_r[1]])
print(trips_series.describe())
print("Pearson's: %s" % pearsons_index(trips_series))
print("IQR: %s" % interquartile_range(trips_series))

fig, ax = plt.subplots(1,1)
n, bins, patches = ax.hist(trips_series, bins=25,
                            align='mid', range=(0, 2000))
plt.xlabel('Number of Trips')
plt.ylabel('Number of Routes')
plt.title('Distribution of trips over routes')
plt.xticks(rotation=70)
plt.xticks(bins[:-1],['%.1f' % (x-0.5) for x in bins[:-1]])
plt.subplots_adjust(bottom=0.2)

ridership_collection = db.ridership
rts = []
riders = []
costs = []
for ridership_row in ridership_collection.find():
    rts.append(ridership_row['route'])
    riders.append(ridership_row['ridership'])
    costs.append(ridership_row['cost'])
riders_costs_df = pd.DataFrame({'route': rts, 'ridership': riders, 'cost': costs},
                            index=rts)
del rts
del costs
del riders

print(ols(x=riders_costs_df['ridership'], y=riders_costs_df['cost']))

# define a convenient way to graph some stuff, find its regression, and graph that too
def plot_with_reg(x, y):
    plt.hold(True)
    plt.plot(x,y, 'b.')
    res = ols(y=y, x=x)
    m = res.beta['x']
    b = res.beta['intercept']
    x_fit = np.linspace(min(x), max(x))
    y_fit = np.poly1d((m,b))
    plt.plot(x_fit, y_fit(x_fit), '-k')
    plt.hold(False)

plot_with_reg(x=riders_costs_df['ridership'],  y=riders_costs_df['cost'])
t = plt.title("Cost of a Route vs. Number of Riders")
t = plt.xlabel("Number of Customers per Weekday")
t = plt.ylabel("Cost per Weekday ($)")

# from https://developers.google.com/transit/gtfs/reference#routes_fields
TYPE_SUBWAY = 1
TYPE_STREETCAR = 0
TYPE_BUS = 3

buses = 0
subways = 0
streetcars = 0
for route in routes_list.values():
    if route.route_type == TYPE_SUBWAY:
        subways += 1
    elif route.route_type == TYPE_STREETCAR:
        streetcars += 1
    elif route.route_type == TYPE_BUS:
        buses += 1
print("Bus routes: %s" % buses)
print("Subway routes: %s" % subways)
print("Streetcar routes: %s" % streetcars)

plt.figure(figsize = (6,6))
p = plt.pie((buses, streetcars, subways), labels=("Bus Routes", "Streetcar Routes", "Subway Routes"),
    startangle=90, autopct="%1.1f%%")
t = plt.title("Vehicles in the TTC by Number of Routes")

subways_trips = 0
buses_trips = 0
streetcars_trips = 0

trips_map = dict(trips_freq)
for route in routes_list:
    route_type = routes_list[route].route_type
    route_name = routes_list[route].route_short_name
    if route_type == TYPE_SUBWAY:
        subways_trips += trips_map[route_name]
    elif route_type == TYPE_BUS:
        buses_trips += trips_map[route_name]
    elif route_type == TYPE_STREETCAR:
        streetcars_trips += trips_map[route_name]
print("Bus trips: %s" % buses_trips)
print("Subway trips: %s" % subways_trips)
print("Streetcar trips: %s" % streetcars_trips)
plt.figure(figsize=(6,6))
p = plt.pie((buses_trips, streetcars_trips, subways_trips), labels=("Bus Trips", "Streetcar Trips",
                                                                    "Subway Trips"),
            startangle=90, autopct="%1.1f%%")
t = plt.title("Vehicles in the TTC by number of trips")

streetcar_trip_route = streetcars_trips / streetcars
buses_trip_route = buses_trips / buses
subway_trip_route = subways_trips / subways
print("Streetcars: %s trips per route" % streetcar_trip_route)
print("Buses: %s trips per route" % buses_trip_route)
print("Subways: %s trips per route" % subway_trip_route)
index = np.arange(3)
bar_width = .35
rect_buses = plt.bar(index, (buses_trip_route, streetcar_trip_route, subway_trip_route), bar_width)
plt.xlabel("Vehicle Type")
plt.ylabel("Average Trips per Route")
plt.title("Average Trips per Route by Vehicle Type")
ticks = plt.xticks(index + bar_width / 2, ("Buses", "Streetcars", "Subways"))

stops_list = {}

with open('schedules/stops.txt') as stops_file:
    stops_csv = csv.reader(stops_file)
    next(stops_file)
    for stop in stops_csv:
        stop_id = int(stop[0])
        stop_name = stop[2]
        stop_lat = float(stop[4])
        stop_lon = float(stop[5])
        stops_list[stop_id] = Stop(stop_id, stop_name, stop_lat, stop_lon)

lats = [x.stop_lat for x in stops_list.values()]
longs = [x.stop_lon for x in stops_list.values()]
plt.figure(figsize=(8,6))
plt.scatter(longs, lats, marker='.', c='b')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
t = plt.title('Geographic Distribution of All Stops')

longs_pd = pd.Series(longs)
lats_pd = pd.Series(lats)
fig, ax = plt.subplots(1,1)
n, bins, patches = ax.hist(longs_pd, align='mid', bins=25)
plt.xlabel('Longitude')
plt.ylabel('Number of stops')
plt.title('Longitude of Stops')
plt.xticks(rotation=70)
plt.xticks(bins[:-1],['%.3f' % (x-0.5) for x in bins[:-1]])
plt.subplots_adjust(bottom=0.2)
print("Longitudes: ", longs_pd.describe())

fig, ax = plt.subplots(1,1)
n, bins, patches = ax.hist(lats_pd, align='mid', bins=25)
plt.xlabel('Latitude')
plt.ylabel('Number of stops')
plt.title('Latitude of Stops')
plt.xticks(rotation=70)
plt.xticks(bins[:-1],['%.3f' % (x-0.5) for x in bins[:-1]])
plt.subplots_adjust(bottom=0.2)
print("Latitudes: ", lats_pd.describe())

collection = db.vehicles # the vehicles collection in the database
print("Number of points: %s" % collection.count())
print("An arbitrarily selected point: %s" % collection.find_one())

headings = collection.aggregate([{"$group":{ '_id': '$heading',
                                'count': {'$sum': 1}}},
                      {"$match": {'_id': {'$gte': 0}}}])
headings = headings['result']

theta = []
r = []
for heading in sorted(headings, key=lambda x: x['_id']):
    theta.append(heading['_id'] * np.pi / 180)
    r.append(heading['count'])

ax = plt.subplot(111, polar=True)
ax.plot(theta, r)
ax.grid(True)

ax.set_theta_direction(-1)
ax.set_theta_zero_location('N')
plt.title("Headings of TTC vehicles.")
print("Maximum: %s"% max(r))
print("Values on two sides of maximum:",r[r.index(max(r)) -1 ], r[r.index(max(r)) + 1])
print("Angle of maximum (radians): ", theta[r.index(max(r))])
print("Total count: ", sum(r))

spike_idx = r.index(max(r))
del r[spike_idx]
del theta[spike_idx]

ax = plt.subplot(111, polar=True)
ax.plot(theta, r)
ax.grid(True)

ax.set_theta_direction(-1)
ax.set_theta_zero_location('N')
ti = plt.title("Headings of TTC vehicles.")

headings_dataframe = pd.DataFrame(data={"heading":[x * 180 / np.pi for x in theta],
                                        "count": r})
print(headings_dataframe[:10])

total_headings = headings_dataframe['count'].sum()

# median
median_idx = total_headings // 2
current_idx = 0
for heading in range(len(headings_dataframe)):
    current_idx += headings_dataframe.iloc[heading]['count']
    if current_idx >= median_idx:
        median = headings_dataframe.iloc[heading]['heading']
        break

# mean
s = 0
for heading in range(len(headings_dataframe)):
    s += headings_dataframe.iloc[heading]['count'] * headings_dataframe.iloc[heading]['heading']
mean = s / total_headings

# standard deviation
s = 0
for heading in headings_dataframe.iterrows():
    n = heading[1]['count']
    h = heading[1]['heading']
    s += n * (h - mean) ** 2
stdev = math.sqrt((1/total_headings) * s)

# mode
modes = []
max_heading = max(headings_dataframe['count'])
modes = [x[1]['heading'] for x in headings_dataframe.iterrows() if x[1]['count'] == max_heading]

print("Mean: %.3f"% mean)
print("Standard deviation: %.3f" % stdev)
print("Median: %.0f"% median)
print("Modes: %.0f" % modes[0])

import pickle
from matplotlib.dates import HourLocator, DateFormatter, DayLocator
from datetime import datetime
from pytz import timezone
import pytz
tz = timezone('America/Toronto')

with open('time_dist.pickle', 'rb') as time_dist_pickle:
    time_dist = pickle.load(time_dist_pickle)[1:]
    times, dist = zip(*time_dist)
    # note that the timestamp from the pickle file already has timezone data
    date_times = [datetime.fromtimestamp(x) for x in times]
fig, axes = plt.subplots()
fig.set_size_inches(8,6)
days = DayLocator()
hours = HourLocator(interval=8)
formatter = DateFormatter('%b %d')
axes.plot_date(date_times,dist, 'b-')
axes.xaxis.set_major_locator(days)
axes.xaxis.set_minor_locator(hours)
axes.xaxis.set_major_formatter(formatter)
axes.set_xlabel('Date and Time')
axes.set_ylabel('Number of vehicles on duty')
axes.set_title('Number of TTC vehicles on duty from April 27th to May 4th')
plt.show()

#velocity_collection = db.velocity
#velocities = []
#for idx, velocity_point in enumerate(velocity_collection.find(sort=[('timestamp_orig', ASCENDING)])):
#    if idx % 1000 == 0:
#        print(idx, end='\r')
#    timestamp = velocity_point['timestamp_orig']
#    v = math.sqrt(velocity_point['vx'] ** 2 + velocity_point['vy'] ** 2)
#    velocities.append((timestamp, v))

with open('velocities.pkl', 'rb') as velocities_file:
    velocities = pickle.load(velocities_file)

date_times, dist = zip(*velocities)
del velocities

n, bins, patchs = plt.hist(dist, bins=25, range=(0,20))
t = plt.title('Aggregated TTC Vehicles Velocities')
plt.xlabel('Velocity (m/s)')
plt.ylabel("Number of Vehicle-Time Points")

velocities_pd = pd.DataFrame({'date_times': date_times,
                              'velocity': dist})
print(velocities_pd['velocity'].describe())
print("IQR:", interquartile_range(velocities_pd['velocity']))
print("Pearson's Index: ", pearsons_index(velocities_pd['velocity']))
print("Range: ", data_range(velocities_pd['velocity']))

outlier_b, outlier_t = outlier_range(velocities_pd['velocity'])
velocities_series = pd.Series(
    [x for x in velocities_pd['velocity'] if x >= outlier_b and x <= outlier_t])

n, bins, patchs = plt.hist(velocities_series, bins=25, range=(0,20))
t = plt.title('Aggregated TTC Vehicles Velocities')
plt.xlabel('Velocity (m/s)')
plt.ylabel("Number of Vehicle-Time Points")

print(velocities_series.describe())
print("IQR:", interquartile_range(velocities_series))
print("Pearson's Index: ", pearsons_index(velocities_series))
print("Range: ", data_range(velocities_series))

velocity_moving_average = ([],[])
i = 0
while i < len(velocities_pd):
    start_point = velocities_pd.iloc[i]
    t_start = start_point['date_times']
    t_end = t_start + 1000 * 60 * 15  # fifteen minute moving averages
    s = 0
    n = 0
    while i < len(velocities_pd) and velocities_pd['date_times'][i] < t_end:
        this_point = velocities_pd.iloc[i]
        v = this_point['velocity']
        if v <= outlier_t and v >= outlier_b:
            s += v
            n += 1
        i += 100  # sample every 100th point for speed
    if n != 0:
        avg_vel = s / n
        velocity_moving_average[0].append(t_start)
        velocity_moving_average[1].append(avg_vel)
    i += 1    

date_times_gen = [datetime.fromtimestamp(x // 1000) for x in velocity_moving_average[0]]

fig, axes = plt.subplots()
fig.set_size_inches(8,6)
days = DayLocator(interval=2)
formatter = DateFormatter('%b %d')
axes.plot_date(date_times_gen,velocity_moving_average[1],'b-')
axes.xaxis.set_major_locator(days)
axes.xaxis.set_major_formatter(formatter)
axes.set_xlabel('Date')
axes.set_ylabel('Average Velocity (m/s)')
axes.set_title('Velocity of TTC Vehicles Measured Over Time')
plt.show()

times_gen = [x.time() for x in date_times_gen]
fig, axes = plt.subplots()
fig.set_size_inches(8,6)
axes.plot(times_gen,velocity_moving_average[1], 'b.')
axes.set_xlabel('Time')
axes.set_ylabel('Average Velocity (m/s)')
axes.set_title('Velocity of TTC Vehicles Measured Over a Day')
plt.show()

active_vehicles = db.active_vehicles

bus_vel = ([], [])
for i in range(len(velocity_moving_average[0])):
    ts = velocity_moving_average[0][i] // 1000
    vel = velocity_moving_average[1][i]
    number_active = active_vehicles.find_one({'timestamp':
                                              {'$gte': ts}},
                                             sort=[('timestamp',ASCENDING)])
    n = number_active['count']
    bus_vel[0].append(n)
    bus_vel[1].append(vel)

p = plt.plot(bus_vel[0], bus_vel[1], 'b.')
plt.xlabel("Number of Active Vehicles")
plt.ylabel("Average Velocity of Active Vehicles (m/s)")
t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles")

velocity_active_df = pd.DataFrame({"active": bus_vel[0], "vel": bus_vel[1]})
res = ols(y=velocity_active_df['vel'], x = velocity_active_df['active'])
print(res)

plt.hold(True)
p = plt.plot(bus_vel[0], bus_vel[1], 'b.')
plt.xlabel("Number of Active Vehicles")
plt.ylabel("Average Velocity of Active Vehicles (m/s)")
t = plt.title("Average Velocity of Active Vehicles vs Number of Active Vehicles")
m, b = -0.0007, 5.4518
x = np.arange(0, 2000, 100)
p = plt.plot(x, np.poly1d((m, b))(x), '-k')
plt.hold(False)