# Some Pandas/Matplotlib initialization which I more or less blindly copied from an example
# ipython notebook.
import datetime
from collections import defaultdict

import matplotlib
import matplotlib.patches
import pandas as pd
import simplejson

pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 400)
# NOTE(review): 'display.mpl_style' is believed to have been removed from newer pandas
# releases -- confirm it is still available in the pandas version this runs on.
pd.set_option('display.mpl_style', 'default')
# Go through the matplotlib module rather than a bare `rcParams` name, which only exists
# when the session was started with pylab's star-import.
matplotlib.rcParams['figure.figsize'] = (12, 5)
font = {'family': 'sans-serif',
        'weight': 'normal',
        'size': 14}
matplotlib.rc('font', **font)

# File containing the output of compute_time_in_inbox_by_date_received.py
TENURE_FILE = 'tenure.json'

# 'YYYY-MM-DD'. If you don't want a min/max date, set the value to None
MIN_DATE = '2015-02-01'
MAX_DATE = '2015-05-02'

TITLE = 'Length of time a message spends in inbox, by date of arrival, 2015'

# (low, high) bounds in hours; both ends are inclusive when counting below.
# NOTE: a value that lands in the tiny gap between one range's upper bound and the next
# range's lower bound (e.g. 5.999995) is counted in no range at all.
RANGES = [
    (0, 5.99999),
    (6, 11.99999),
    (12, 23.999999),
    (24, 47.99999),
    (2*24, 4*24-0.00001),
    (4*24, 8*24-0.00001),
    (8*24, 9999999)
]

# "PRGn" from http://colorbrewer2.org/
COLORS = list(reversed([
    [x/255. for x in c]
    for c in [(118,42,131),(175,141,195),(231,212,232),(247,247,247),(217,240,211),(127,191,123),(27,120,55)]
]))

##################

# Read the whole file inside a context manager so the handle is closed deterministically.
with open(TENURE_FILE) as tenure_file:
    tenure_by_day = simplejson.loads(tenure_file.read())

# Keys are 'YYYY-MM-DD' strings, so lexicographic comparison is also chronological.
index = sorted(tenure_by_day.keys())
if MIN_DATE:
    index = [k for k in index if k >= MIN_DATE]
if MAX_DATE:
    index = [k for k in index if k <= MAX_DATE]
date_index = [datetime.datetime.strptime(k, '%Y-%m-%d') for k in index]

# Create a data series (x is date, y is count) for each of the ranges (0-6 hours, 6-12 hours, etc.)
series_by_range = defaultdict(list)
for day in index:
    tenures = tenure_by_day[day]
    for r in RANGES:
        # Summing booleans counts how many tenures fall inside this range.
        series_by_range[r].append(sum(r[0] <= tenure <= r[1] for tenure in tenures))

# Transform the above into proportions, so proportions_by_range[(0, 5.99999)][i] is the
# proportion of all email received on date_index[i] which stayed in the inbox for 0-6 hours.
# (There is certainly a better way to do this using Pandas...)
proportions_by_range = defaultdict(list)
for i in range(len(index)):
    total = sum(series_by_range[r][i] for r in series_by_range)
    for r in series_by_range:
        # A day with no tenures inside any range has total == 0; record an empty bar
        # for it instead of dying with ZeroDivisionError.
        share = series_by_range[r][i] / float(total) if total else 0.0
        proportions_by_range[r].append(share)

# Make a Pandas DataFrame out of the series
df = pd.DataFrame(data=proportions_by_range, index=date_index)

# Re-index as daily so that the index includes days without data (those rows are NaN).
# asfreq() is used instead of resample('D') because the latter returns a lazy Resampler
# object on newer pandas; with one row per day the two give the same frame.
resampled = df.asfreq('D')

# If resampling added dates to the index that weren't there before, some dates are missing data
has_missing_data = bool(set(resampled.index) - set(df.index))

# Plot as stacked bar
ax = resampled.plot(kind='bar', stacked=True, figsize=(12, 3), width=0.8, linewidth=0,
                    color=COLORS, edgecolor=[(0.6,) * 3])

# x axis
ax.xaxis.grid(False)
# Only label the 1st and 15th of each month; every other tick gets an empty label.
ax.xaxis.set_ticklabels(
    [d.strftime('%b %d') if d.day in (1, 15) else "" for d in resampled.index],
    rotation=90, size=12)

# y axis
yticks = [0, .25, .5, .75, 1]
ax.yaxis.set_ticks(yticks)
ax.yaxis.set_ticklabels(["%s%%" % int(y * 100) for y in yticks], size=12)
ax.yaxis.grid(None)
ax.set_ybound(0, 1)


# Legend labels
def format_range_name(r):
    """Return the legend label for one (low, high) entry of RANGES.

    Bounds are in hours. Ranges starting below 24 are labelled in hours, the
    open-ended range (upper bound 9999999) as "N+ days", and the rest as
    "N-M days".
    """
    if r[0] < 24:
        return "%s-%s hours" % (r[0], int(round(r[1])))
    elif r[1] == 9999999:
        # Floor division keeps the day count an int on Python 3 as well ("8+", not "8.0+").
        return "%s+ days" % (r[0] // 24)
    else:
        return "%s-%s days" % (r[0] // 24, int(round(r[1] / 24)))


# Maybe add a "no data" label to the legend
handles, labels = ax.get_legend_handles_labels()
# Invisible patch that serves as the handle for the optional "no data" entry.
handles = [matplotlib.patches.Patch(alpha=0)] + handles
# Build a real list: on Python 3, map() returns an iterator that cannot be added to a list.
labels = ['no data' if has_missing_data else ''] + [format_range_name(r) for r in RANGES]

# The default draw order of the legend is top->bottom then left->right.
# Make it go left->right then top->bottom, and reverse the labels to match the order of the graph.
def cols_to_rows(arr):
    """Return the 8 legend entries of *arr* reordered for a 4-column, 2-row legend.

    The fixed index order interleaves the two halves of *arr* and the result
    is then reversed. A list is returned (rather than a one-shot ``reversed``
    iterator) so the caller can take its length or iterate it more than once.
    """
    return list(reversed([arr[i] for i in (0, 4, 1, 5, 2, 6, 3, 7)]))


ax.legend(cols_to_rows(handles), cols_to_rows(labels), loc='lower center', fontsize=12,
          ncol=4, columnspacing=1, framealpha=0, bbox_to_anchor=(0.5, -0.62))

# Title
ax.set_title(TITLE, fontsize=14, ha='center', va='top', position=(0.5, 1.1), color='#333333')

# Clean up the background and borders
# Set the colour on the axes patch directly: Axes.set_axis_bgcolor() no longer exists in
# newer matplotlib, while the patch API is available in old and new releases alike.
ax.patch.set_facecolor('white')
ax.spines['right'].set_visible(False)
ax.patch.set_alpha(0)
ax.figure.patch.set_alpha(0)

# Add dots for dates with missing data
if has_missing_data:
    # Bar plots place bars at integer x positions, so the row number is the x coordinate.
    missing_days_x = [i for i, date in enumerate(resampled.index) if date not in df.index]
    for m in missing_days_x:
        ax.text(m, 0.018, ".", fontdict={'size': 14})

    # May have to tweak these coordinates to make the "no data" dot show in the right spot in the legend.
    #
    # The correct way would be to subclass matplotlib.patches.Patch:
    # http://matplotlib.org/api/patches_api.html
    ax.text(63.2, -0.52, ".", fontdict={'size': 14}, zorder=99)

# don't show the result of evaluating the last command
pass