#!/usr/bin/env python
# coding: utf-8

# ### E10S Experiment Beta, EVENTLOOP_UI_ACTIVITY_EXP_MS >= 50ms

# In[1]:

# A __future__ import has to be the first statement of a module, so it is
# placed ahead of every other import.
from __future__ import division

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython

from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test

get_ipython().run_line_magic('pylab', 'inline')
IPython.core.pylabtools.figsize(16, 7)


# In[2]:

sc.defaultParallelism


# In[18]:

def chi2_distance(xs, ys, eps=1e-10, normalize=True):
    """Return the chi-squared distance between two groups of histograms.

    Each group is first collapsed into a single histogram with
    ``.sum(axis=0)``; the distance is then
    ``0.5 * sum((a - b)**2 / (a + b + eps))`` over the paired bins.

    xs, ys    -- groups of histograms; each must support ``.sum(axis=0)``.
    eps       -- small constant added to the denominator so that bins which
                 are empty in both histograms do not divide by zero.
    normalize -- when true, each collapsed histogram is divided by its own
                 total before the comparison.
    """
    histA = xs.sum(axis=0)
    histB = ys.sum(axis=0)

    if normalize:
        histA = histA / histA.sum()
        histB = histB / histB.sum()

    # Bins are paired positionally, exactly as zip() yields them.
    terms = [((a - b) ** 2) / (a + b + eps) for (a, b) in zip(histA, histB)]
    return 0.5 * np.sum(terms)


def compare_histogram(histogram, e10s, none10s, from_bin=50):
    """Plot and test the e10s vs. non-e10s distributions of one histogram.

    histogram -- label used for the plot title, the x axis and the printed
                 summary.
    e10s      -- histograms of the e10s group; each element is indexed by
                 bin and supports ``x[x.index >= from_bin]``.
    none10s   -- histograms of the non-e10s group, same layout as ``e10s``.
    from_bin  -- only bins whose index is >= this value are considered.

    Shows a bar chart of the per-bin sums of both groups and prints the
    value returned by ``grouped_permutation_test`` for ``chi2_distance``.
    """
    # Drop the bins below the threshold in every individual histogram.
    e10s = e10s.map(lambda x: x[x.index >= from_bin])
    none10s = none10s.map(lambda x: x[x.index >= from_bin])

    pvalue = grouped_permutation_test(chi2_distance, [e10s, none10s], num_samples=100)

    eTotal = e10s.sum()
    nTotal = none10s.sum()

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.3)

    ax = fig.add_subplot(1, 1, 1)
    ax2 = ax.twinx()
    width = 0.4
    ylim = max(eTotal.max(), nTotal.max())

    eTotal.plot(kind="bar", alpha=0.5, color="green", label="e10s", ax=ax,
                width=width, position=0, ylim=(0, ylim + 1))
    # The twin axis reuses the first axis' y limits so both bar series are
    # drawn on the same scale.
    nTotal.plot(kind="bar", alpha=0.5, color="blue", label="non e10s", ax=ax2,
                width=width, position=1, grid=False, ylim=ax.get_ylim())

    handles = ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0]
    labels = ["e10s ({} samples)".format(len(e10s)),
              "non e10s ({} samples)".format(len(none10s))]
    ax.legend(handles, labels)

    plt.title(histogram)
    plt.xlabel(histogram)
    plt.ylabel("Frequency %")
    plt.show()

    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("The probability that the distributions for {} are differing by chance is {:.2f}.".format(histogram, pvalue))


def compare_histograms(pings, *histogram_names):
    """Compare each requested histogram between e10s and non-e10s pings.

    pings           -- collection of pings accepted by
                       ``get_pings_properties``.
    histogram_names -- property paths of the histograms to compare.

    The properties (plus the "e10s" flag) are collected into a DataFrame
    with ``with_processes=True``. For every base histogram column, the
    e10s side is compared against the non-e10s base column up to three
    times: aggregated, parent-only and children-only, depending on which
    of the "_parent" / "_children" columns hold data for the e10s group.
    """
    properties = get_pings_properties(pings, histogram_names + ("e10s",),
                                      with_processes=True)
    frame = pd.DataFrame(properties.collect())

    # Explicit comparisons against True/False build the boolean row masks.
    e10s = frame[frame["e10s"] == True]
    none10s = frame[frame["e10s"] == False]

    for histogram in none10s.columns:
        # Only base histogram columns drive a comparison; the flag and the
        # per-process columns are looked up from the base name below.
        if histogram == "e10s" or histogram.endswith(("_parent", "_children")):
            continue

        has_children = e10s[histogram + "_children"].notnull().any()
        has_parent = e10s[histogram + "_parent"].notnull().any()

        if has_children and has_parent:
            compare_histogram(histogram + " (parent + children)",
                              e10s[histogram].dropna(),
                              none10s[histogram].dropna())

        if has_parent:
            compare_histogram(histogram + " (parent)",
                              e10s[histogram + "_parent"].dropna(),
                              none10s[histogram].dropna())

        if has_children:
            compare_histogram(histogram + " (children)",
                              e10s[histogram + "_children"].dropna(),
                              none10s[histogram].dropna())


# #### Get e10s and non-e10s partitions

# In[4]:

dataset = sqlContext.read.load("s3://telemetry-parquet/e10s-experiment/e10s-enabled-beta-20151214@experiments.mozilla.org/generationDate=20160106", "parquet")


# Only consider builds with [bug 1234618](https://bugzilla.mozilla.org/show_bug.cgi?id=1234618) BHR fix:

# In[5]:

dataset = dataset.filter(dataset.buildId >= '20151228134903')


# Sample by clientId:

# In[6]:

sampled = dataset.filter(dataset.sampleId <= 50)


# In[7]:

sampled.count()


# Transform Dataframe to RDD of pings

# In[8]:

def row_2_ping(row):
    """Build a ping-shaped dict from one row of the dataset.

    The JSON-encoded columns of ``row`` are decoded and nested under
    "payload"; "e10s" is True exactly when the row's ``experimentBranch``
    equals "experiment".
    """
    payload = {
        "simpleMeasurements": json.loads(row.simpleMeasurements),
        "histograms": json.loads(row.histograms),
        "keyedHistograms": json.loads(row.keyedHistograms),
        "childPayloads": json.loads(row.childPayloads),
        "threadHangStats": json.loads(row.threadHangStats),
    }
    return {"payload": payload,
            "e10s": row.experimentBranch == "experiment"}


# In[9]:

subset = sampled.rdd.map(row_2_ping)


# #### Event processing

# In[19]:

compare_histograms(subset, "payload/histograms/EVENTLOOP_UI_ACTIVITY_EXP_MS")