This code generated "Figure 4" in Three Years of Logging My Inbox Count.
To use it, first run compute_time_in_inbox_by_date_received.py
from gmail-graphs on the directory containing the JSON output files of gmail-logger. This will generate a JSON object which maps keys like "2015-04-20"
to arrays like [0.5, 0.5, 5.0, 17.5, 24.0, 327.0]:
python compute_time_in_inbox_by_date_received.py /path/to/your/json/log/files/dir/ > tenure.json
Then set TENURE_FILE to the path of that output file. Change other constants as desired.
(For inline graphs, start IPython Notebook with: ipython notebook --pylab inline)
# Pandas/Matplotlib initialization, originally copied from an example IPython notebook.
import matplotlib
import pandas as pd

# Widen pandas' console output.
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 400)
# NOTE(review): the 'display.mpl_style' option was dropped from later pandas
# releases -- confirm it exists in the pandas version this runs against.
pd.set_option('display.mpl_style', 'default')

# Reach rcParams through the matplotlib module: the bare name `rcParams` only
# exists when the notebook was started with --pylab, and raises NameError otherwise.
matplotlib.rcParams['figure.figsize'] = (12, 5)

# Default font for every text element on the figure.
font = {
    'family': 'sans-serif',
    'weight': 'normal',
    'size': 14,
}
matplotlib.rc('font', **font)
# Path of the JSON produced by compute_time_in_inbox_by_date_received.py
TENURE_FILE = 'tenure.json'

# Inclusive date bounds, formatted 'YYYY-MM-DD'. Use None to leave a side unbounded.
MIN_DATE = '2015-02-01'
MAX_DATE = '2015-05-02'

TITLE = 'Length of time a message spends in inbox, by date of arrival, 2015'

# Tenure buckets as inclusive (low, high) pairs, in hours.
RANGES = [
    (0, 5.99999),
    (6, 11.99999),
    (12, 23.999999),
    (24, 47.99999),
    (48, 96 - 0.00001),
    (96, 192 - 0.00001),
    (192, 9999999),
]

# The "PRGn" palette from http://colorbrewer2.org/, taken in reverse order and
# scaled from 0-255 integers to 0-1 floats.
COLORS = [
    [channel / 255. for channel in rgb]
    for rgb in reversed([
        (118, 42, 131),
        (175, 141, 195),
        (231, 212, 232),
        (247, 247, 247),
        (217, 240, 211),
        (127, 191, 123),
        (27, 120, 55),
    ])
]
##################
import datetime
from collections import defaultdict
import matplotlib.patches
import simplejson
# Load the mapping of 'YYYY-MM-DD' -> list of tenures. The `with` block makes
# sure the file handle is closed as soon as the JSON has been parsed.
with open(TENURE_FILE) as tenure_file:
    tenure_by_day = simplejson.load(tenure_file)

# Sorted day keys, limited to [MIN_DATE, MAX_DATE]. ISO-formatted dates compare
# correctly as plain strings; a falsy bound (e.g. None) disables that side.
index = sorted(
    day for day in tenure_by_day
    if (not MIN_DATE or day >= MIN_DATE) and (not MAX_DATE or day <= MAX_DATE)
)

# The same days as datetime objects, used as the DataFrame index.
date_index = [datetime.datetime.strptime(day, '%Y-%m-%d') for day in index]
# Build one count series per range (0-6 hours, 6-12 hours, etc.):
# series_by_range[bucket][i] is how many of the messages received on index[i]
# have a tenure inside that bucket (both bounds inclusive).
series_by_range = defaultdict(list)
for day in index:
    day_tenures = tenure_by_day[day]
    for bucket in RANGES:
        low, high = bucket[0], bucket[1]
        matching = sum(1 for tenure in day_tenures if low <= tenure <= high)
        series_by_range[bucket].append(matching)
# Turn the counts into proportions, so proportions_by_range[(0, 5.99999)][i] is the
# share of the email counted on index[i] that stayed in the inbox for 0-6 hours.
# (There is certainly a better way to do this using Pandas...)
proportions_by_range = defaultdict(list)
for i in range(len(index)):
    total = sum(counts[i] for counts in series_by_range.values())
    for r, counts in series_by_range.items():
        # A day can total zero (an empty tenure list, or tenures that fall
        # between two buckets). Record NaN for it instead of dividing by zero.
        proportion = counts[i] / float(total) if total else float('nan')
        proportions_by_range[r].append(proportion)
# Make a Pandas DataFrame out of the series: one row per day, one column per range.
df = pd.DataFrame(data=proportions_by_range, index=date_index)

# Re-index at daily frequency so the index also includes days without data
# (their rows are filled with NaN). asfreq() is used rather than resample('D')
# because resample() only returns a DataFrame directly on old pandas releases;
# with one row per day the two produce the same frame.
resampled = df.asfreq('D')

# Any date present after re-indexing but absent before is a day with no data.
has_missing_data = bool(set(resampled.index) - set(df.index))
# Stacked bar chart: one bar per calendar day, one coloured segment per range.
ax = resampled.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 3),
    width=0.8,
    linewidth=0,
    color=COLORS,
    edgecolor=[(0.6, 0.6, 0.6)],
)

# x axis: grid off; only the 1st and 15th of each month get a tick label.
ax.xaxis.grid(False)
ax.xaxis.set_ticklabels(
    ["" if day.day not in (1, 15) else day.strftime('%b %d')
     for day in resampled.index],
    rotation=90,
    size=12,
)

# y axis: a tick at every quarter, labelled as a percentage, bounded to [0, 1].
yticks = [0, .25, .5, .75, 1]
ax.yaxis.set_ticks(yticks)
ax.yaxis.set_ticklabels(["%s%%" % int(tick * 100) for tick in yticks], size=12)
ax.yaxis.grid(None)
ax.set_ybound(0, 1)
# Legend labels
def format_range_name(r):
    """Return the legend label for one (low, high) tenure range given in hours.

    Ranges that start below 24 hours are labelled in hours ("0-6 hours").
    The range whose upper bound is 9999999 is labelled as open-ended
    ("8+ days"). Every other range is labelled in whole days ("2-4 days").
    """
    low, high = r[0], r[1]
    if low < 24:
        return "%s-%s hours" % (low, int(round(high)))

    # Floor division keeps the day count an integer ("2", not "2.0") whether
    # `/` means integer or true division in the running interpreter.
    low_days = low // 24
    if high == 9999999:
        return "%s+ days" % low_days
    return "%s-%s days" % (low_days, int(round(high / 24)))
# Maybe add a "no data" label to the legend
handles, labels = ax.get_legend_handles_labels()
# A fully transparent patch is the handle of the extra first entry, so that the
# handle and label lists stay the same length.
handles = [matplotlib.patches.Patch(alpha=0)] + handles
# The labels from the axes are replaced by one formatted name per range.
# list(...) keeps the concatenation valid where map() returns an iterator.
labels = ['no data' if has_missing_data else ''] + list(map(format_range_name, RANGES))
# The default draw order of the legend is top->bottom then left->right.
# Make it go left->right then top->bottom, and reverse the labels to match the order of the graph.
def cols_to_rows(arr, ncol=4):
    """Reorder legend entries so a column-filled legend reads row by row.

    The entries are reversed, laid out conceptually row by row on a grid that
    is `ncol` wide, and then emitted column by column (skipping grid cells past
    the last entry).

    arr:  the legend entries (handles or labels), in plot order.
    ncol: number of legend columns; must match the `ncol` given to ax.legend().

    Returns a new list. For 8 entries and ncol=4 the order of original indices
    is 7, 3, 6, 2, 5, 1, 4, 0.

    Raises ValueError if ncol is smaller than 1.
    """
    if ncol < 1:
        raise ValueError("ncol must be at least 1")

    items = list(arr)[::-1]
    count = len(items)
    nrows = (count + ncol - 1) // ncol  # ceiling division
    return [
        items[row * ncol + col]
        for col in range(ncol)
        for row in range(nrows)
        if row * ncol + col < count
    ]
# Draw the re-ordered legend in four frameless columns, anchored below the axes.
ax.legend(
    cols_to_rows(handles),
    cols_to_rows(labels),
    loc='lower center',
    bbox_to_anchor=(0.5, -0.62),
    ncol=4,
    columnspacing=1,
    fontsize=12,
    framealpha=0,
)
# Title
ax.set_title(TITLE, fontsize=14, ha='center', va='top', position=(0.5, 1.1), color='#333333')

# Clean up the background and borders.
# The face colour is set on the axes' background patch because
# Axes.set_axis_bgcolor() no longer exists in newer Matplotlib releases;
# setting it on the patch works on old and new versions alike.
ax.patch.set_facecolor('white')
ax.spines['right'].set_visible(False)
# Make both the axes background and the figure background fully transparent.
ax.patch.set_alpha(0)
ax.figure.patch.set_alpha(0)
# Put a small dot at the foot of every day that has no data.
if has_missing_data:
    # Bar positions (0-based, in plot order) of the dates that only exist in
    # the daily index and not in the original frame.
    missing_days_x = [
        position
        for position, day in enumerate(resampled.index)
        if day not in df.index
    ]
    for x_position in missing_days_x:
        ax.text(x_position, 0.018, ".", fontdict={'size': 14})

    # Dot for the "no data" legend entry. These coordinates are hand-tuned and
    # may need tweaking for the dot to land in the right spot in the legend.
    #
    # The correct way would be to subclass matplotlib.patches.Patch:
    # http://matplotlib.org/api/patches_api.html
    #
    # NOTE(review): assumed to belong inside this `if`, since the legend only
    # carries a "no data" label when data is missing -- confirm.
    ax.text(63.2, -0.52, ".", fontdict={'size': 14}, zorder=99)

# End on a statement so the notebook does not echo the value of the last call.
pass