import numpy import ujson as json import matplotlib.pyplot as plt import pandas as pd import random import re import IPython import time from moztelemetry.spark import get_pings from moztelemetry.histogram import Histogram IPython.core.pylabtools.figsize(18, 7) sc.defaultParallelism %%capture pings = get_pings(sc, "Firefox", "nightly", "*", "*", ("20150101", "20150109")) pings.count() %%capture name_version_regexp = re.compile("^([-a-zA-Z\s\d]*[a-zA-Z])([\d.]*)$") def extract_name_version(raw): m = re.match(name_version_regexp, raw) if not m: # missing plugin name return [None, None] else: return m.groups() def parse(ping): return json.loads(ping) def get_keyed_histograms(name, container, os, async): histograms = container.get(name, None) if not histograms: return [] res = [] for k, v in histograms.iteritems(): plugin_name, version = extract_name_version(k) res.append([name, os, async, plugin_name, version, Histogram(name, v).get_values()]) return res def extract(ping): output = [] os = ping["info"].get("OS") keyed = ping.get("keyedHistograms", None) if not os or not keyed: return output async_surrogate = keyed.get("BLOCKED_ON_PLUGINASYNCSURROGATE_WAITFORINIT_MS", None) if async_surrogate is not None and len(async_surrogate) == 0: # Ignore existing but empty histogram return output async = bool(async_surrogate) output += get_keyed_histograms("BLOCKED_ON_PLUGIN_MODULE_INIT_MS", keyed, os, async) output += get_keyed_histograms("BLOCKED_ON_PLUGIN_INSTANCE_INIT_MS", keyed, os, async) output += get_keyed_histograms("BLOCKED_ON_PLUGIN_STREAM_INIT_MS", keyed, os, async) output += get_keyed_histograms("BLOCKED_ON_PLUGIN_INSTANCE_DESTROY_MS", keyed, os, async) return output data = pings.map(parse).flatMap(extract) %%capture collected_data = data.collect() len(collected_data) %%capture frame = pd.DataFrame(data.collect(), columns=["histogram", "os", "async", "plugin", "version", "values"]) top_plugins = frame["plugin"].value_counts()[:5].index top_frame = frame[frame["plugin"].isin(top_plugins)] top_frame["ping"] = 1 top_frame.to_json("1118762.json") #top_frame = pd.read_json("1118762.json") grouped = top_frame.groupby(["plugin", "os", "histogram", "async"]) aggregates = grouped[["values", "ping"]].apply(lambda x: pd.Series({"values": x["values"].sum().values, "pings": x["ping"].sum()})) def plot(data): for plugin in top_plugins: # Sorted by popularity plot_os(plugin, data.ix[plugin]) def plot_os(plugin, data): for os in ["WINNT", "Darwin", "Linux"]: plot_histograms(plugin, os, data.ix[os]) def plot_histograms(plugin, os, data): print "Operating System: " + os # Useful for grepping print "Plugin: " + plugin fig = plt.figure(figsize=(30, 5)) fig.subplots_adjust(hspace=0.3) for idx, histogram in enumerate(data.index.levels[0]): ax = fig.add_subplot(1, 4, idx + 1) plot_comparison(plugin, os, histogram, ax, data.ix[histogram]) plt.show() def plot_comparison(plugin, os, histogram, ax, data): hsync = Histogram(histogram, data.ix[False]["values"]) if False in data.index else None hasync = Histogram(histogram, data.ix[True]["values"]) if True in data.index else None ax2 = ax.twinx() width = 0.4 if hasync is not None: print "async {} ({} submissions): Q1 {:.2f} - Q2 {:.2f} - Q3 {:.2f}".format(histogram, data.ix[True]["pings"], hasync.percentile(25), hasync.percentile(50), hasync.percentile(75)) async = hasync.get_values() async = async/async.sum() async.plot(kind="bar", color="green", ax=ax, width=width, position=0, label="async") if hsync is not None: print "sync {} ({} submissions): Q1 {:.2f} - Q2 {:.2f} - Q3 {:.2f}\n".format(histogram, data.ix[False]["pings"], hsync.percentile(25), hsync.percentile(50), hsync.percentile(75)) sync = hsync.get_values() sync = sync/sync.sum() sync.plot(kind="bar", color="blue", ax=ax2, width=width, position=1, grid=False, label="sync", ylim=ax.get_ylim()) plt.title("{} - {} - {}".format(os, plugin, histogram)) plt.ylabel("Normalized count") if hasync and hsync: ax.legend(ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0], ["async", "sync"]) elif hasync: ax.legend(ax.get_legend_handles_labels()[0], ["async"]) else: ax.legend(ax2.get_legend_handles_labels()[0], ["sync"]) plot(aggregates)