import pandas as pd
import datetime
import seaborn as sns
import pylab as plt
import numpy as np
import requests
%matplotlib inline
plt.rcParams.update({'font.size': 28})
sns.set_context("talk")
sns.set_style('darkgrid')
Devices connect to wifi routers across Columbia's campus. CUIT aggregates device collection data by building and by floor and supplies with a live count of the number of devices connected at each location every 15 minutes.
Below is an example query from the API showing how data is shared publicly
prms = {'auth_token': 'OCAPDPVGRZ9B7L0YJ4F44VJCYTN6QIKM'}
url = 'http://density.adicu.com/window/2014-10-10T08:00/2014-10-10T21:30/building/75'
r = requests.get(url, params=prms)
data = pd.DataFrame().from_dict(r.json()['data'])
jj = pd.DataFrame().append(data)
nxt = requests.get(r.json()['next_page'])
jj = jj.append(nxt.json()['data']).reset_index(drop=True)
jj['dump_time'] = pd.to_datetime(jj['dump_time'])
jj = jj.set_index('dump_time')
jj.head()
client_count | group_id | group_name | parent_id | parent_name | percent_full | |
---|---|---|---|---|---|---|
dump_time | ||||||
2014-10-10 21:15:00 | 3 | 125 | John Jay Dining Hall | 75 | John Jay | 1.500000 |
2014-10-10 21:15:00 | 4 | 155 | JJ's Place | 75 | John Jay | 3.100775 |
2014-10-10 21:00:00 | 1 | 125 | John Jay Dining Hall | 75 | John Jay | 0.500000 |
2014-10-10 21:00:00 | 2 | 155 | JJ's Place | 75 | John Jay | 1.550388 |
2014-10-10 20:45:00 | 8 | 125 | John Jay Dining Hall | 75 | John Jay | 4.000000 |
We'll load our demo data in its original format to analyze long-term trends
raw = pd.read_csv('../scripts/demo_data.csv')
buildings = pd.read_csv('../scripts/buildings.csv', index_col = 0, header = None, squeeze = True)
raw['dump_time'] = pd.to_datetime(raw['dump_time'])
raw = raw.set_index('dump_time')
raw.head()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-526849a83e1d> in <module>() ----> 1 raw.head() NameError: name 'raw' is not defined
len(raw.index)
print buildings
raw['hrfrac'] = raw.index.hour + raw.index.minute/60.
def dayplot(plotfn, title):
f, ax = plt.subplots(figsize=(9, 7))
plotfn.plot(ax=ax)
plt.title(title)
plt.tight_layout()
plt.xticks([0,3,6,9,12,15,18,21])
plt.ylabel('Devices')
plt.xlabel('Time of Day')
return ax
totalday = pd.crosstab(raw.hrfrac, raw.index.date, values=raw.client_count, aggfunc=pd.Series.sum)
total = raw.reset_index().groupby('dump_time').agg({'client_count':'sum', 'hrfrac':'mean'}).reset_index()
ax = dayplot(totalday.mean(axis=1), 'Average Number of Devices at Columbia')
palette = sns.color_palette()
low = total.groupby("hrfrac")['client_count'].apply(np.percentile, 25)
high = total.groupby("hrfrac")['client_count'].apply(np.percentile, 75)
ax.fill_between(low.index, low, high, alpha=.2, color=palette.pop(0))
def find_capacity_overall(total_count):
avg_day = total_count.mean(axis=1)
return avg_day.max()
print find_capacity_overall(totalday)
tabs = pd.crosstab(raw.hrfrac, raw.group_name, raw.client_count, aggfunc=pd.Series.mean)
def bldg_plot(bldg, ci=False):
floors = list(buildings[buildings == bldg].index)
title = 'Average Daily Device Count in %s' % bldg
ax = dayplot(tabs[floors], title)
palette = sns.color_palette()
if ci:
for f in floors:
low = raw[raw['group_name'] == f].groupby('hrfrac')['client_count'].apply(np.percentile, 25)
high = raw[raw['group_name'] == f].groupby('hrfrac')['client_count'].apply(np.percentile, 75)
ax.fill_between(low.index, low, high, alpha=.2, color=palette.pop(0))
bldg_plot('Butler')
People tend to come in after lunch, take a break for dinner, and work late into the night.
bldg_plot('John Jay')
The dining halls are most crowded around mealtimes.
f, ax = plt.subplots(figsize=(10, 7))
raw.groupby(raw.index.date).client_count.sum().plot(ax=ax)
plt.tight_layout()
plt.title('Total Number of Devices Across Columbia')
plt.ylabel('Number of Devices')
plt.xlabel('Date')
There's a clear weekly cycle in the number of devices connected across campus.
raw['date'] = raw.index.date
raw['weekofyear'] = raw.index.weekofyear
weeks = raw['2014-06-30':'2015-4-19'].groupby('weekofyear').client_count.sum()
weeks.index = list(weeks[weeks.index < 27].index + 52) + list(weeks.index[16:])
weeks = weeks.sort_index()
E.g. the big drop around Thanksgiving and the following spike for Finals
f, ax = plt.subplots(figsize=(9, 6))
weeks.index = raw[raw.index.dayofweek == 0]['2014-06-30':'2015-04-19']['date'].drop_duplicates()
weeks.plot(ax=ax)
plt.tight_layout()
plt.title('Weekly Device Count Across Comumbia')
plt.ylabel('Total Number of Devices')
plt.savefig('weekly_count.png')
Aggregating the data by week allows us to see that there's a jump in the number of devices when the semester starts. We also see dips for fall break, spring break, and Thanksgiving. Thanksgiving is followed by a spike for finals. Winter break is also visible following the dramatic drop in device count.
f, ax = plt.subplots(figsize=(9, 7))
dailycounts = raw.groupby(raw.date).client_count.sum()
dailycounts.index = pd.to_datetime(dailycounts.index)
dailycounts.groupby(dailycounts.index.dayofweek).mean().plot(ax=ax)
plt.tight_layout()
plt.title('Average Count Across Columbia')
plt.ylabel('Average Daily Device Count')
plt.xlabel('Day of Week')
weekday = 'Mon Tue Wed Thu Fri Sat Sun'.split()
plt.xticks(range(7), weekday)
reshaped = pd.crosstab(raw.index, raw.group_name, raw.client_count, aggfunc=pd.Series.sum)
max_days = [d.strftime('%Y-%m-%d') for d in reshaped.apply(lambda x: x.argmax())]
max_days = pd.Series(max_days, index = reshaped.columns)
max_day_table = pd.DataFrame(columns = max_days.index, index = sorted(raw.hrfrac.drop_duplicates()))
for d in max_days.index:
data = reshaped[d].ix[max_days[d]]
data = reshaped[d].ix[max_days[d]]
data.index = data.index.hour + data.index.minute / 60.
max_day_table[d] = data
str(max_days.ix['Roone Arledge Auditorium'])
def maxplot(bldg):
dayplot(max_day_table[bldg], 'Max Device Counts in %s' % (bldg))
labels = str(max_days.ix[bldg])
print labels
plt.legend([labels], loc = 2)
Our plots show the days with the max. number of devices recorded at any time at that location. Using these graphs, we can identify special days during the school year.
We see multiple spikes in Roone Arledge Auditorium around the same time as orientation.
maxplot('Roone Arledge Auditorium')
plt.savefig('nsop.png')
Notice the spike in Butler 2 around midnight.
maxplot('Butler Library 2')
plt.savefig('orgonight.png')