#!/usr/bin/env python
# coding: utf-8

# ![alt text](http://library.columbia.edu/content/libraryweb/locations/business/_jcr_content/layout_featuredcontent/image.img.jpg/1369336790208.jpg)

# #What do we know about Density's data?

# In[2]:

import pandas as pd
import datetime
import seaborn as sns
import pylab as plt
import numpy as np
import requests
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams.update({'font.size': 28})
sns.set_context("talk")
sns.set_style('darkgrid')

# ##[CUIT](https://cuit.columbia.edu/) has supplied us with our data set
#
# ###We receive device counts from specific routers, aggregated into 15-minute intervals
#
# Devices connect to wifi routers across Columbia's campus. CUIT aggregates the device counts by building and by floor and supplies us with a live count of the number of devices connected at each location every 15 minutes.
#
# Below is an example query from the API showing how data is shared publicly

# In[3]:

prms = {'auth_token': 'OCAPDPVGRZ9B7L0YJ4F44VJCYTN6QIKM'}
url = 'http://density.adicu.com/window/2014-10-10T08:00/2014-10-10T21:30/building/75'
r = requests.get(url, params=prms)

# Each response holds one page of data plus a link to the next page
jj = pd.DataFrame(r.json()['data'])
nxt = requests.get(r.json()['next_page'])
jj = jj.append(nxt.json()['data']).reset_index(drop=True)
jj['dump_time'] = pd.to_datetime(jj['dump_time'])
jj = jj.set_index('dump_time')
jj.head()
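# The example above follows `next_page` once by hand. The same idea generalizes
# to a loop that keeps requesting pages until the API stops offering one. This
# is just a minimal sketch (the helper name is ours), and it assumes the final
# page returns a null or missing `next_page` field; check the API docs for the
# exact convention.

# In[ ]:

def fetch_window(url, params):
    # Accumulate every page of a /window query by following next_page links
    frames = []
    r = requests.get(url, params=params)
    while True:
        payload = r.json()
        frames.append(pd.DataFrame(payload['data']))
        if not payload.get('next_page'):
            break
        # As in the single-step example above, next_page already embeds the
        # query, so no extra params are passed
        r = requests.get(payload['next_page'])
    out = pd.concat(frames, ignore_index=True)
    out['dump_time'] = pd.to_datetime(out['dump_time'])
    return out.set_index('dump_time')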
# We'll load our demo data in its original format to analyze long-term trends

# In[4]:

raw = pd.read_csv('../scripts/demo_data.csv')
buildings = pd.read_csv('../scripts/buildings.csv', index_col=0, header=None, squeeze=True)

# In[5]:

raw['dump_time'] = pd.to_datetime(raw['dump_time'])
raw = raw.set_index('dump_time')

# In[1]:

raw.head()

# In[ ]:

len(raw.index)

# ---
#
# #What spaces do we have?
#
# ##We have various libraries, John Jay, and Lerner

# In[ ]:

print(buildings)

# ---
#
# #Just how full is a study space?
#
# ![alt text](http://www.hercampus.com/sites/default/files/2013/12/02/4192655241_df21ba1706.jpg)
#
# ##There's an obvious difference between a space that is unusually crowded, like during Orgo Night, and a typical busy day
#
# ![alt text](http://library.columbia.edu/content/libraryweb/locations/butler/_jcr_content/layout_featuredcontent/image.img.jpg/1368472856821.jpg)
#
# ##Density assumes that capacity is relative to fullness on an average day, not on outlier days
#
# ##How do we do that? By considering what the average day is like at each space

# In[ ]:

# Convert each timestamp's time of day to a fractional hour (e.g. 14:45 -> 14.75)
raw['hrfrac'] = raw.index.hour + raw.index.minute/60.

# In[ ]:

def dayplot(data, title):
    # Plot a time-of-day series (or one column per space) on a shared axis
    f, ax = plt.subplots(figsize=(9, 7))
    data.plot(ax=ax)
    plt.title(title)
    plt.tight_layout()
    plt.xticks([0, 3, 6, 9, 12, 15, 18, 21])
    plt.ylabel('Devices')
    plt.xlabel('Time of Day')
    return ax

# ##On an average day at Columbia, device count grows until it stabilizes for the day

# In[ ]:

totalday = pd.crosstab(raw.hrfrac, raw.index.date, values=raw.client_count, aggfunc=pd.Series.sum)
total = raw.reset_index().groupby('dump_time').agg({'client_count': 'sum', 'hrfrac': 'mean'}).reset_index()
ax = dayplot(totalday.mean(axis=1), 'Average Number of Devices at Columbia')
palette = sns.color_palette()
# Shade the interquartile range around the average day
low = total.groupby("hrfrac")['client_count'].apply(np.percentile, 25)
high = total.groupby("hrfrac")['client_count'].apply(np.percentile, 75)
ax.fill_between(low.index, low, high, alpha=.2, color=palette.pop(0))

# #Capacity is the level at which population growth stabilizes
#
# ###For the whole campus of Columbia, the capacity would be the following

# In[ ]:

def find_capacity_overall(total_count):
    # Average the daily profiles across days, then take the peak of that average day
    avg_day = total_count.mean(axis=1)
    return avg_day.max()

# In[ ]:

print(find_capacity_overall(totalday))
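# The same definition extends to individual spaces. Here's a minimal sketch,
# assuming a space's capacity is the peak of its average daily profile,
# mirroring `find_capacity_overall` above (the function name is ours, not part
# of the Density codebase):

# In[ ]:

def find_capacity_by_space(df):
    # Mean device count per 15-minute slot for each space (its "average day"),
    # then the peak of that average day
    avg_profiles = pd.crosstab(df.hrfrac, df.group_name,
                               values=df.client_count, aggfunc=pd.Series.mean)
    return avg_profiles.max()

find_capacity_by_space(raw).head()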
# ---
#
# #Different spaces have different properties

# In[ ]:

tabs = pd.crosstab(raw.hrfrac, raw.group_name, raw.client_count, aggfunc=pd.Series.mean)

# In[ ]:

def bldg_plot(bldg, ci=False):
    # Plot the average daily profile for every floor of a building,
    # optionally shading each floor's interquartile range
    floors = list(buildings[buildings == bldg].index)
    title = 'Average Daily Device Count in %s' % bldg
    ax = dayplot(tabs[floors], title)
    palette = sns.color_palette()
    if ci:
        for f in floors:
            low = raw[raw['group_name'] == f].groupby('hrfrac')['client_count'].apply(np.percentile, 25)
            high = raw[raw['group_name'] == f].groupby('hrfrac')['client_count'].apply(np.percentile, 75)
            ax.fill_between(low.index, low, high, alpha=.2, color=palette.pop(0))

# ##For example, Butler Library behaves very differently from John Jay:
#
# ###This is Butler:

# In[ ]:

bldg_plot('Butler')

# People tend to come in after lunch, take a break for dinner, and work late into the night.
#
# ###This is John Jay:

# In[ ]:

bldg_plot('John Jay')

# The dining halls are most crowded around mealtimes.

# ---
#
# #How does campus population vary over time?
#
# ##We can find differences in population based on changes in semesters, exams, and holidays

# In[ ]:

f, ax = plt.subplots(figsize=(10, 7))
raw.groupby(raw.index.date).client_count.sum().plot(ax=ax)
plt.tight_layout()
plt.title('Total Number of Devices Across Columbia')
plt.ylabel('Number of Devices')
plt.xlabel('Date')

# There's a clear weekly cycle in the number of devices connected across campus.
#
# ##Let's remove the weekly cycle by grouping the data into weeks

# In[ ]:

raw['date'] = raw.index.date
raw['weekofyear'] = raw.index.weekofyear
weeks = raw['2014-06-30':'2015-04-19'].groupby('weekofyear').client_count.sum()
# Weeks 1-26 fall in 2015, so shift them past 2014's weeks to sort the year chronologically
weeks.index = list(weeks[weeks.index < 27].index + 52) + list(weeks.index[16:])
weeks = weeks.sort_index()

# ###Smoothing out the data by grouping it by week lets us see differences between semesters and holidays more clearly
#
# E.g. the big drop around Thanksgiving and the following spike for finals

# In[ ]:

f, ax = plt.subplots(figsize=(9, 6))
# Relabel the week numbers with the Monday of each week
weeks.index = raw[raw.index.dayofweek == 0]['2014-06-30':'2015-04-19']['date'].drop_duplicates()
weeks.plot(ax=ax)
plt.tight_layout()
plt.title('Weekly Device Count Across Columbia')
plt.ylabel('Total Number of Devices')
plt.savefig('weekly_count.png')

# Aggregating the data by week shows a jump in the number of devices when each semester starts. We also see dips for fall break, spring break, and Thanksgiving; Thanksgiving is followed by a spike for finals. Winter break is also visible as a dramatic drop in device count.
#
# ---
#
# ##How does day of the week affect study habits?
#
# ###People don't study on weekends:

# In[ ]:

f, ax = plt.subplots(figsize=(9, 7))
dailycounts = raw.groupby(raw.date).client_count.sum()
dailycounts.index = pd.to_datetime(dailycounts.index)
dailycounts.groupby(dailycounts.index.dayofweek).mean().plot(ax=ax)
plt.tight_layout()
plt.title('Average Count Across Columbia')
plt.ylabel('Average Daily Device Count')
plt.xlabel('Day of Week')
weekday = 'Mon Tue Wed Thu Fri Sat Sun'.split()
plt.xticks(range(7), weekday)

# In[ ]:

reshaped = pd.crosstab(raw.index, raw.group_name, raw.client_count, aggfunc=pd.Series.sum)

# ##We can also find days with an unusually high number of devices

# In[ ]:

# For each space, find the date of its single highest 15-minute count,
# then pull that whole day's profile into a table
max_days = [d.strftime('%Y-%m-%d') for d in reshaped.apply(lambda x: x.idxmax())]
max_days = pd.Series(max_days, index=reshaped.columns)
max_day_table = pd.DataFrame(columns=max_days.index, index=sorted(raw.hrfrac.drop_duplicates()))
for d in max_days.index:
    data = reshaped[d].loc[max_days[d]]
    data.index = data.index.hour + data.index.minute / 60.
    max_day_table[d] = data

# In[ ]:

max_days.loc['Roone Arledge Auditorium']

# In[ ]:

def maxplot(bldg):
    # Plot the busiest recorded day at a location, labeled with its date
    dayplot(max_day_table[bldg], 'Max Device Counts in %s' % bldg)
    labels = str(max_days.loc[bldg])
    print(labels)
    plt.legend([labels], loc=2)

# These plots show the day with the maximum number of devices ever recorded at a single time at that location. Using these graphs, we can identify special days during the school year.
#
# ##Like NSOP
#
# We see multiple spikes in Roone Arledge Auditorium around the same time as orientation.

# In[ ]:

maxplot('Roone Arledge Auditorium')
plt.savefig('nsop.png')

# ##And Orgo Night!
#
# Notice the spike in Butler 2 around midnight.

# In[ ]:

maxplot('Butler Library 2')
plt.savefig('orgonight.png')

# ##For more tools built on the Density API, check out Chris Mulligan's [Relative Density App!](https://chmullig.shinyapps.io/relative_density/)
#
# #Check out our [API](http://density.adicu.com/docs) to use our data!
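# As a parting sketch, here's a starting point that reuses the `fetch_window`
# helper from earlier to pull a full day of counts for one building and plot
# each of its spaces. (We're assuming the API rows carry the same `dump_time`,
# `group_name`, and `client_count` fields as our demo CSV, and we're reusing
# the building ID and token from the example query above.)

# In[ ]:

day = fetch_window('http://density.adicu.com/window/2014-10-10T00:00/2014-10-10T23:45/building/75', prms)
day.groupby('group_name').client_count.plot(legend=True)
plt.title('Devices in Building 75 on 2014-10-10')
plt.ylabel('Devices')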