#!/usr/bin/env python
# coding: utf-8

# ![alt text](http://library.columbia.edu/content/libraryweb/locations/business/_jcr_content/layout_featuredcontent/image.img.jpg/1369336790208.jpg)

# #What do we know about Density's data?

# In[2]:

import pandas as pd
import datetime
import seaborn as sns
import pylab as plt
import numpy as np
import requests
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams.update({'font.size': 28})
sns.set_context("talk")
sns.set_style('darkgrid')

# ##[CUIT](https://cuit.columbia.edu/) has supplied us with our data set
#
# ###We receive device counts from specific routers, aggregated into 15-minute intervals
#
# Devices connect to wifi routers across Columbia's campus. CUIT aggregates the device counts by building and by floor and supplies us with a live count of the number of devices connected at each location every 15 minutes.
#
# Below is an example query from the API showing how data is shared publicly

# In[3]:

prms = {'auth_token': 'OCAPDPVGRZ9B7L0YJ4F44VJCYTN6QIKM'}
url = 'http://density.adicu.com/window/2014-10-10T08:00/2014-10-10T21:30/building/75'
r = requests.get(url, params=prms)

# Each response holds one page of data plus a link to the next page
jj = pd.DataFrame(r.json()['data'])
nxt = requests.get(r.json()['next_page'])
jj = jj.append(nxt.json()['data']).reset_index(drop=True)
jj['dump_time'] = pd.to_datetime(jj['dump_time'])
jj = jj.set_index('dump_time')
jj.head()
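# The example above follows `next_page` once by hand. The same idea generalizes
# to a loop that keeps requesting pages until the API stops offering one. This
# is just a minimal sketch (the helper name is ours), and it assumes the final
# page returns a null or missing `next_page` field; check the API docs for the
# exact convention.

# In[ ]:

def fetch_window(url, params):
    # Accumulate every page of a /window query by following next_page links
    frames = []
    r = requests.get(url, params=params)
    while True:
        payload = r.json()
        frames.append(pd.DataFrame(payload['data']))
        if not payload.get('next_page'):
            break
        # As in the single-step example above, next_page already embeds the
        # query, so no extra params are passed
        r = requests.get(payload['next_page'])
    out = pd.concat(frames, ignore_index=True)
    out['dump_time'] = pd.to_datetime(out['dump_time'])
    return out.set_index('dump_time')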
# We'll load our demo data in its original format to analyze long-term trends

# In[4]:

raw = pd.read_csv('../scripts/demo_data.csv')
buildings = pd.read_csv('../scripts/buildings.csv', index_col=0, header=None, squeeze=True)

# In[5]:

raw['dump_time'] = pd.to_datetime(raw['dump_time'])
raw = raw.set_index('dump_time')

# In[1]:

raw.head()

# In[ ]:

len(raw.index)

# ---
#
# #What spaces do we have?
#
# ##We have various libraries, John Jay, and Lerner

# In[ ]:

print(buildings)

# ---
#
# #Just how full is a study space?
#
# ![alt text](http://www.hercampus.com/sites/default/files/2013/12/02/4192655241_df21ba1706.jpg)
#
# ##There's an obvious difference between a space that is unusually crowded, like during Orgo Night, and a typical busy day
#
# ![alt text](http://library.columbia.edu/content/libraryweb/locations/butler/_jcr_content/layout_featuredcontent/image.img.jpg/1368472856821.jpg)
#
# ##Density assumes that capacity is relative to fullness on an average day, not on outlier days
#
# ##How do we do that? By considering what the average day is like at each space

# In[ ]:

# Convert each timestamp's time of day to a fractional hour (e.g. 14:45 -> 14.75)
raw['hrfrac'] = raw.index.hour + raw.index.minute/60.

# In[ ]:

def dayplot(data, title):
    # Plot a time-of-day series (or one column per space) on a shared axis
    f, ax = plt.subplots(figsize=(9, 7))
    data.plot(ax=ax)
    plt.title(title)
    plt.tight_layout()
    plt.xticks([0, 3, 6, 9, 12, 15, 18, 21])
    plt.ylabel('Devices')
    plt.xlabel('Time of Day')
    return ax

# ##On an average day at Columbia, device count grows until it stabilizes for the day

# In[ ]:

totalday = pd.crosstab(raw.hrfrac, raw.index.date, values=raw.client_count, aggfunc=pd.Series.sum)
total = raw.reset_index().groupby('dump_time').agg({'client_count': 'sum', 'hrfrac': 'mean'}).reset_index()
ax = dayplot(totalday.mean(axis=1), 'Average Number of Devices at Columbia')
palette = sns.color_palette()
# Shade the interquartile range around the average day
low = total.groupby("hrfrac")['client_count'].apply(np.percentile, 25)
high = total.groupby("hrfrac")['client_count'].apply(np.percentile, 75)
ax.fill_between(low.index, low, high, alpha=.2, color=palette.pop(0))

# #Capacity is the level at which population growth stabilizes
#
# ###For the whole campus of Columbia, the capacity would be the following

# In[ ]:

def find_capacity_overall(total_count):
    # Average the daily profiles across days, then take the peak of that average day
    avg_day = total_count.mean(axis=1)
    return avg_day.max()

# In[ ]:

print(find_capacity_overall(totalday))
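# The same definition extends to individual spaces. Here's a minimal sketch,
# assuming a space's capacity is the peak of its average daily profile,
# mirroring `find_capacity_overall` above (the function name is ours, not part
# of the Density codebase):

# In[ ]:

def find_capacity_by_space(df):
    # Mean device count per 15-minute slot for each space (its "average day"),
    # then the peak of that average day
    avg_profiles = pd.crosstab(df.hrfrac, df.group_name,
                               values=df.client_count, aggfunc=pd.Series.mean)
    return avg_profiles.max()

find_capacity_by_space(raw).head()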
# ---
#
# #Different spaces have different properties

# In[ ]:

tabs = pd.crosstab(raw.hrfrac, raw.group_name, raw.client_count, aggfunc=pd.Series.mean)

# In[ ]:

def bldg_plot(bldg, ci=False):
    # Plot the average daily profile for every floor of a building,
    # optionally shading each floor's interquartile range
    floors = list(buildings[buildings == bldg].index)
    title = 'Average Daily Device Count in %s' % bldg
    ax = dayplot(tabs[floors], title)
    palette = sns.color_palette()
    if ci:
        for f in floors:
            low = raw[raw['group_name'] == f].groupby('hrfrac')['client_count'].apply(np.percentile, 25)
            high = raw[raw['group_name'] == f].groupby('hrfrac')['client_count'].apply(np.percentile, 75)
            ax.fill_between(low.index, low, high, alpha=.2, color=palette.pop(0))

# ##For example, Butler Library behaves very differently from John Jay:
#
# ###This is Butler:

# In[ ]:

bldg_plot('Butler')

# People tend to come in after lunch, take a break for dinner, and work late into the night.
#
# ###This is John Jay:

# In[ ]:

bldg_plot('John Jay')

# The dining halls are most crowded around mealtimes.

# ---
#
# #How does campus population vary over time?
#
# ##We can find differences in population based on changes in semesters, exams, and holidays

# In[ ]:

f, ax = plt.subplots(figsize=(10, 7))
raw.groupby(raw.index.date).client_count.sum().plot(ax=ax)
plt.tight_layout()
plt.title('Total Number of Devices Across Columbia')
plt.ylabel('Number of Devices')
plt.xlabel('Date')

# There's a clear weekly cycle in the number of devices connected across campus.
#
# ##Let's remove the weekly cycle by grouping the data into weeks

# In[ ]:

raw['date'] = raw.index.date
raw['weekofyear'] = raw.index.weekofyear
weeks = raw['2014-06-30':'2015-04-19'].groupby('weekofyear').client_count.sum()
# Weeks 1-26 fall in 2015, so shift them past 2014's weeks to sort the year chronologically
weeks.index = list(weeks[weeks.index < 27].index + 52) + list(weeks.index[16:])
weeks = weeks.sort_index()

# ###Smoothing out the data by grouping it by week lets us see differences between semesters and holidays more clearly
#
# E.g. the big drop around Thanksgiving and the following spike for finals

# In[ ]:

f, ax = plt.subplots(figsize=(9, 6))
# Relabel the week numbers with the Monday of each week
weeks.index = raw[raw.index.dayofweek == 0]['2014-06-30':'2015-04-19']['date'].drop_duplicates()
weeks.plot(ax=ax)
plt.tight_layout()
plt.title('Weekly Device Count Across Columbia')
plt.ylabel('Total Number of Devices')
plt.savefig('weekly_count.png')

# Aggregating the data by week shows a jump in the number of devices when each semester starts. We also see dips for fall break, spring break, and Thanksgiving; Thanksgiving is followed by a spike for finals. Winter break is also visible as a dramatic drop in device count.
#
# ---
#
# ##How does day of the week affect study habits?
#
# ###People don't study on weekends:

# In[ ]:

f, ax = plt.subplots(figsize=(9, 7))
dailycounts = raw.groupby(raw.date).client_count.sum()
dailycounts.index = pd.to_datetime(dailycounts.index)
dailycounts.groupby(dailycounts.index.dayofweek).mean().plot(ax=ax)
plt.tight_layout()
plt.title('Average Count Across Columbia')
plt.ylabel('Average Daily Device Count')
plt.xlabel('Day of Week')
weekday = 'Mon Tue Wed Thu Fri Sat Sun'.split()
plt.xticks(range(7), weekday)

# In[ ]:

reshaped = pd.crosstab(raw.index, raw.group_name, raw.client_count, aggfunc=pd.Series.sum)

# ##We can also find days with an unusually high number of devices

# In[ ]:

# For each space, find the date of its single highest 15-minute count,
# then pull that whole day's profile into a table
max_days = [d.strftime('%Y-%m-%d') for d in reshaped.apply(lambda x: x.idxmax())]
max_days = pd.Series(max_days, index=reshaped.columns)
max_day_table = pd.DataFrame(columns=max_days.index, index=sorted(raw.hrfrac.drop_duplicates()))
for d in max_days.index:
    data = reshaped[d].loc[max_days[d]]
    data.index = data.index.hour + data.index.minute / 60.
    max_day_table[d] = data

# In[ ]:

max_days.loc['Roone Arledge Auditorium']

# In[ ]:

def maxplot(bldg):
    # Plot the busiest recorded day at a location, labeled with its date
    dayplot(max_day_table[bldg], 'Max Device Counts in %s' % bldg)
    labels = str(max_days.loc[bldg])
    print(labels)
    plt.legend([labels], loc=2)

# These plots show the day with the maximum number of devices ever recorded at a single time at that location. Using these graphs, we can identify special days during the school year.
#
# ##Like NSOP
#
# We see multiple spikes in Roone Arledge Auditorium around the same time as orientation.

# In[ ]:

maxplot('Roone Arledge Auditorium')
plt.savefig('nsop.png')

# ##And Orgo Night!
#
# Notice the spike in Butler 2 around midnight.

# In[ ]:

maxplot('Butler Library 2')
plt.savefig('orgonight.png')

# ##For more tools built on the Density API, check out Chris Mulligan's [Relative Density App!](https://chmullig.shinyapps.io/relative_density/)
#
# #Check out our [API](http://density.adicu.com/docs) to use our data!
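# As a parting sketch, here's a starting point that reuses the `fetch_window`
# helper from earlier to pull a full day of counts for one building and plot
# each of its spaces. (We're assuming the API rows carry the same `dump_time`,
# `group_name`, and `client_count` fields as our demo CSV, and we're reusing
# the building ID and token from the example query above.)

# In[ ]:

day = fetch_window('http://density.adicu.com/window/2014-10-10T00:00/2014-10-10T23:45/building/75', prms)
day.groupby('group_name').client_count.plot(legend=True)
plt.title('Devices in Building 75 on 2014-10-10')
plt.ylabel('Devices')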