import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython
from __future__ import division
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test
%pylab inline
IPython.core.pylabtools.figsize(16, 7)
Unable to parse whitelist (/home/hadoop/anaconda2/lib/python2.7/site-packages/moztelemetry/bucket-whitelist.json). Assuming all histograms are acceptable. Populating the interactive namespace from numpy and matplotlib
# Display the default parallelism of `sc` (presumably the SparkContext
# provided by the notebook environment -- it is not defined in this file).
sc.defaultParallelism
16
def chi2_distance(xs, ys, eps = 1e-10, normalize = True):
    """Return the chi-squared distance between two aggregated histograms.

    Each argument is first collapsed with ``.sum(axis=0)``; the distance is
    then ``0.5 * sum((a - b) ** 2 / (a + b + eps))`` over the resulting bins.

    Args:
        xs: Object exposing ``.sum(axis=0)`` (presumably a 2-D array or a
            collection with one histogram per row -- confirm against caller).
        ys: Same kind of object as ``xs``.
        eps: Small constant added to the denominator so that bins that are
            empty in both histograms do not divide by zero.
        normalize: When true, each aggregated histogram is divided by its
            own total before the comparison.

    Returns:
        The distance as a floating point number.
    """
    # Converting to float arrays makes the divisions below true divisions
    # regardless of whether ``from __future__ import division`` is active.
    hist_a = np.asarray(xs.sum(axis=0), dtype=float)
    hist_b = np.asarray(ys.sum(axis=0), dtype=float)

    if normalize:
        total_a = hist_a.sum()
        total_b = hist_b.sum()
        # An all-zero histogram is left untouched: dividing by a zero total
        # would turn every bin (and hence the distance) into NaN.
        if total_a:
            hist_a = hist_a / total_a
        if total_b:
            hist_b = hist_b / total_b

    # Compare only the bins both histograms have, as pairing the bins with
    # zip() did in the original implementation.
    common = min(len(hist_a), len(hist_b))
    hist_a = hist_a[:common]
    hist_b = hist_b[:common]

    d = 0.5 * np.sum((hist_a - hist_b) ** 2 / (hist_a + hist_b + eps))
    return d
def compare_histogram(histogram, e10s, none10s, from_bin=50):
    """Plot the e10s and non-e10s totals of a histogram and print a p-value.

    Args:
        histogram: Text used for the plot title, the x-axis label and the
            printed message.
        e10s: Collection of histograms for the e10s group. It must support
            ``.map``, ``.sum`` and ``len``; each element is indexed with
            ``x[x.index >= from_bin]`` (presumably a pandas Series of
            per-ping Series keyed by bucket -- confirm against caller).
        none10s: Same kind of collection for the non-e10s group.
        from_bin: Only index labels greater than or equal to this value are
            kept in every histogram.

    Returns:
        None. The function shows a figure and prints one line.
    """
    # Drop everything below `from_bin` from each individual histogram.
    e10s = e10s.map(lambda x: x[x.index >= from_bin])
    none10s = none10s.map(lambda x: x[x.index >= from_bin])

    pvalue = grouped_permutation_test(chi2_distance, [e10s, none10s], num_samples=100)

    eTotal = e10s.sum()
    nTotal = none10s.sum()

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.3)
    ax = fig.add_subplot(1, 1, 1)
    # A twin axis lets the two bar series sit next to each other.
    ax2 = ax.twinx()
    width = 0.4
    # Both axes get the same y-range so the bars are directly comparable.
    ylim = max(eTotal.max(), nTotal.max())

    eTotal.plot(kind="bar", alpha=0.5, color="green", label="e10s", ax=ax, width=width, position=0, ylim=(0, ylim + 1))
    nTotal.plot(kind="bar", alpha=0.5, color="blue", label="non e10s", ax=ax2, width=width, position=1, grid=False, ylim=ax.get_ylim())

    # Merge the handles of both axes into a single legend.
    handles = ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0]
    labels = ["e10s ({} samples)".format(len(e10s)),
              "non e10s ({} samples)".format(len(none10s))]
    ax.legend(handles, labels)

    plt.title(histogram)
    plt.xlabel(histogram)
    # NOTE(review): the bars are the plain sums computed above, which look
    # like raw totals rather than percentages -- confirm this label.
    plt.ylabel("Frequency %")
    plt.show()

    # A single parenthesised argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("The probability that the distributions for {} are differing by chance is {:.2f}.".format(histogram, pvalue))
def compare_histograms(pings, *histogram_names):
    """Compare e10s and non-e10s pings for each requested histogram.

    The requested properties plus ``"e10s"`` are extracted from ``pings``
    with ``get_pings_properties(..., with_processes=True)`` and collected
    into a DataFrame. Rows are split on the ``"e10s"`` column, and
    ``compare_histogram`` is called for every column that is neither
    ``"e10s"`` nor a ``*_parent`` / ``*_children`` column.

    Args:
        pings: Collection of pings handed to ``get_pings_properties``.
        *histogram_names: Property paths of the histograms to compare.
    """
    wanted = histogram_names + ("e10s",)
    properties = get_pings_properties(pings, wanted , with_processes=True)
    frame = pd.DataFrame(properties.collect())

    e10s = frame[frame["e10s"] == True]
    none10s = frame[frame["e10s"] == False]

    for histogram in none10s.columns:
        # Skip the flag column and the per-process columns; the latter are
        # looked up below through their base column name.
        if histogram == "e10s" or histogram.endswith(("_parent", "_children")):
            continue

        parent_column = histogram + "_parent"
        children_column = histogram + "_children"
        baseline = none10s[histogram]

        has_children = e10s[children_column].notnull().any()
        has_parent = e10s[parent_column].notnull().any()

        if has_children and has_parent:
            compare_histogram(histogram + " (parent + children)",
                              e10s[histogram].dropna(),
                              baseline.dropna())

        if has_parent:
            compare_histogram(histogram + " (parent)",
                              e10s[parent_column].dropna(),
                              baseline.dropna())

        if has_children:
            compare_histogram(histogram + " (children)",
                              e10s[children_column].dropna(),
                              baseline.dropna())
# Load the experiment data set, stored as Parquet, from S3.
dataset = sqlContext.read.load("s3://telemetry-parquet/e10s-experiment/e10s-enabled-beta-20151214@experiments.mozilla.org/generationDate=20160106", "parquet")
Only consider builds with bug 1234618 BHR fix:
# Keep only rows whose buildId compares greater than or equal to
# '20151228134903' (buildId is compared against a string literal here).
dataset = dataset.filter(dataset.buildId >= '20151228134903')
Sample by clientId:
# Keep only rows with sampleId <= 50, then display how many rows remain.
sampled = dataset.filter(dataset.sampleId <= 50)
sampled.count()
62058
Transform the DataFrame into an RDD of pings:
def row_2_ping(row):
    """Convert one dataset row into a ping-shaped dictionary.

    Args:
        row: Object exposing the JSON-encoded attributes
            ``simpleMeasurements``, ``histograms``, ``keyedHistograms``,
            ``childPayloads`` and ``threadHangStats``, plus an
            ``experimentBranch`` attribute.

    Returns:
        A dict with two keys: ``"payload"`` maps each of the five attribute
        names to its decoded JSON value, and ``"e10s"`` is ``True`` only when
        ``row.experimentBranch`` equals ``"experiment"``.
    """
    payload_fields = (
        "simpleMeasurements",
        "histograms",
        "keyedHistograms",
        "childPayloads",
        "threadHangStats",
    )
    payload = {field: json.loads(getattr(row, field)) for field in payload_fields}
    # The comparison already yields a boolean; no conditional expression needed.
    return {"payload": payload, "e10s": row.experimentBranch == "experiment"}
# Turn every sampled row into a ping dictionary, then compare the e10s and
# non-e10s groups for the EVENTLOOP_UI_ACTIVITY_EXP_MS histogram.
subset = sampled.rdd.map(row_2_ping)
compare_histograms(subset, "payload/histograms/EVENTLOOP_UI_ACTIVITY_EXP_MS")
The probability that the distributions for payload/histograms/EVENTLOOP_UI_ACTIVITY_EXP_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for payload/histograms/EVENTLOOP_UI_ACTIVITY_EXP_MS (parent) are differing by chance is 0.00.
The probability that the distributions for payload/histograms/EVENTLOOP_UI_ACTIVITY_EXP_MS (children) are differing by chance is 0.00.