#!/usr/bin/env python
# coding: utf-8

# ### Telemetry Analysis
#
# Notebook export: counts how many Firefox nightly clients set the
# `layout.css.devPixelsPerPx` user preference and plots the values they use.
# NOTE(review): `sc` and `get_ipython` are not defined in this file;
# presumably they are injected by the Spark-enabled notebook environment.

# In[140]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client

get_ipython().run_line_magic('pylab', 'inline')


# How many parallel workers do we have?

# In[141]:

sc.defaultParallelism


# Fetch all the submissions for the nightly build made on 2015-07-15 to 2015-07-25:

# In[149]:

pings = get_pings(
    sc,
    app="Firefox",
    channel="nightly",
    build_id=("20150715000000", "20150725999999"),
    fraction=1.0,
)


# Extract the device pixels per virtual pixel setting from each ping (one per user) and filter out the unspecified values:

# In[150]:

subset = get_pings_properties(
    pings,
    ["clientID", "environment/settings/userPrefs/layout.css.devPixelsPerPx"],
)
subset = get_one_ping_per_client(subset)

# Keep only the pings where the preference is present. `is not None` is the
# identity test, so a value with a custom `__ne__` cannot confuse the filter.
valid_subset = subset.filter(
    lambda x: x['environment/settings/userPrefs/layout.css.devPixelsPerPx'] is not None
)


# What percentage of users have this setting enabled?
# In[153]:

valid_subset.first()


# In[152]:

valid_count, total_count = valid_subset.count(), subset.count()
# An empty fetch would otherwise raise ZeroDivisionError; report 0% instead.
valid_percentage = 100.0 * valid_count / total_count if total_count else 0.0
"{}% ({} of {})".format(valid_percentage, valid_count, total_count)


# Caching is fundamental as it allows for an iterative, real-time development workflow:

# In[154]:

cached = valid_subset.cache()


# Aggregate the settings by their value:

# In[159]:

# Emit (value, 1) per client and sum per value; the result is collected to
# the driver as a plain {value: client count} mapping.
settings = cached.map(
    lambda x: (x['environment/settings/userPrefs/layout.css.devPixelsPerPx'], 1)
).reduceByKey(lambda a, b: a + b).collectAsMap()
settings


# And finally plot the data:

# In[161]:

def _plot_setting_counts(pairs, width=0.8):
    """Draw a bar chart of client counts per preference value.

    Args:
        pairs: sequence of ``(setting value, client count)`` tuples, in the
            order the bars should appear.
        width: width of each bar, in x-axis units.
    """
    # Build real lists: on Python 3 `map` returns a lazy iterator, which
    # matplotlib cannot consume as bar heights or tick labels.
    labels = [value for value, _ in pairs]
    counts = [count for _, count in pairs]
    positions = np.arange(len(pairs))

    plt.figure(figsize=(15, 7))
    # Bars are explicitly edge-aligned so that the `+ width / 2` tick offset
    # below lands on each bar's centre regardless of matplotlib's default.
    plt.bar(positions, counts, width=width, align="edge")
    ax = plt.gca()
    ax.set_xticks(positions + width / 2)
    ax.set_xticklabels(labels, rotation=90)
    plt.xlabel("layout.css.devPixelsPerPx Value")
    plt.ylabel("Number of client IDs")
    plt.show()


pairs = sorted(settings.items(), key=lambda x: x[0])
width = 0.8
_plot_setting_counts(pairs, width=width)


# Some of these values are duplicated. To group settings by their numerical values, we parse the labels:

# In[164]:

from collections import defaultdict

numerical_settings = defaultdict(int)
for setting, count in settings.items():
    try:
        numerical_settings[float(setting)] += count
    except ValueError:
        # Labels that do not parse as a number are deliberately left out of
        # the numeric grouping.
        pass

pairs = sorted(numerical_settings.items(), key=lambda x: x[0])
width = 0.8
_plot_setting_counts(pairs, width=width)


# In[ ]: