import numpy import simplejson as json import matplotlib.pyplot as plt import pandas as pd import IPython import re from moztelemetry.spark import get_pings from __future__ import division IPython.core.pylabtools.figsize(18, 7) sc.defaultParallelism %%capture pings = get_pings(sc, "Firefox", "nightly", "*", "*", ("20150218", "20150224")) filtered = pings.filter(lambda x: "JS_TELEMETRY_ADDON_EXCEPTIONS" in x) cached = filtered.map(lambda x: json.loads(x)["keyedHistograms"].get("JS_TELEMETRY_ADDON_EXCEPTIONS", None)).filter(lambda x: x).cache() %%capture n_pings = cached.count() n_pings %%capture def clean(key): return re.sub("\?[0-9]+", "", key) exceptions = cached.flatMap(lambda excs: [(clean(key), 1) for key in excs]) grouped = exceptions.countByKey() exception_counts = pd.Series(grouped) exception_counts.sort(ascending=False) for key, value in exception_counts[:100].iteritems(): print key, "{0:.2f}%".format(100*value/n_pings) %%capture exceptions = cached.flatMap(lambda excs: [(clean(key), value["sum"]) for key, value in excs.items()]) exception_counts = pd.Series(exceptions.groupByKey().map(lambda x: (x[0], np.sum(list(x[1])))).collectAsMap()) exception_counts.sort(ascending=False) total = exception_counts.sum() for key, value in exception_counts[:100].iteritems(): print key, "{0:.2f}%".format(100*value/total)