import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython
from __future__ import division
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test
# IPython magic: populate the interactive namespace from numpy and matplotlib
# and render figures inline in the notebook.
%pylab inline
# Figure size (width, height) used for the plots drawn below.
IPython.core.pylabtools.figsize(16, 7)
Populating the interactive namespace from numpy and matplotlib
# Display the default parallelism of `sc`, which is supplied by the notebook
# environment (presumably the SparkContext -- it is not defined in this file).
sc.defaultParallelism
80
def chi2_distance(xs, ys, eps = 1e-10, normalize = True):
    """Return the chi-squared distance between two aggregated histograms.

    Each argument is summed along axis 0 to obtain one histogram per group;
    the distance is ``0.5 * sum((a - b)**2 / (a + b + eps))`` over paired bins.

    Args:
        xs: Histograms of the first group; must support ``sum(axis=0)``.
        ys: Histograms of the second group; must support ``sum(axis=0)``.
        eps: Small constant added to every denominator so that bins empty in
            both histograms do not divide by zero.
        normalize: When true, each aggregated histogram is divided by its own
            total before the comparison.

    Returns:
        The distance as a scalar.
    """
    total_x = xs.sum(axis=0)
    total_y = ys.sum(axis=0)

    if normalize:
        total_x = total_x / total_x.sum()
        total_y = total_y / total_y.sum()

    # Bins are paired positionally; zip stops at the shorter histogram.
    terms = []
    for p, q in zip(total_x, total_y):
        terms.append(((p - q) ** 2) / (p + q + eps))

    return 0.5 * np.sum(terms)
def median_diff(xs, ys):
    """Return the median of ``xs`` minus the median of ``ys``."""
    median_of_xs = np.median(xs)
    median_of_ys = np.median(ys)
    return median_of_xs - median_of_ys
def compare_histogram(histogram, e10s, none10s):
    """Plot one histogram for the e10s and non-e10s groups and test the difference.

    The per-ping histograms of each group are summed and rescaled to
    percentages, then drawn as two bar series on twin y-axes that share the
    same limits. Finally the p-value of a grouped permutation test, using
    ``chi2_distance`` as the statistic, is printed.

    Args:
        histogram: Label used for the plot title, the x-axis label and the
            printed message.
        e10s: Per-ping histograms of the e10s group. ``e10s.sum()`` must
            yield an object with ``plot``/``max`` (a pandas Series is
            assumed -- confirm against the caller).
        none10s: Per-ping histograms of the non-e10s group, same shape.
    """
    pvalue = grouped_permutation_test(chi2_distance, [e10s, none10s], num_samples=100)

    # Aggregate each group and express every bucket as a percentage of the
    # group's total. This relies on true division (the notebook imports
    # `division` from `__future__`).
    eTotal = e10s.sum()
    nTotal = none10s.sum()
    eTotal = 100*eTotal/eTotal.sum()
    nTotal = 100*nTotal/nTotal.sum()

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.3)

    ax = fig.add_subplot(1, 1, 1)
    ax2 = ax.twinx()
    width = 0.4
    ylim = max(eTotal.max(), nTotal.max())

    # position=0 / position=1 place the two bar series side by side; the
    # second axis reuses the first axis' y-limits so the bars are comparable.
    eTotal.plot(kind="bar", alpha=0.5, color="green", label="e10s", ax=ax, width=width, position=0, ylim=(0, ylim + 1))
    nTotal.plot(kind="bar", alpha=0.5, color="blue", label="non e10s", ax=ax2, width=width, position=1, grid=False, ylim=ax.get_ylim())

    # One legend for both axes. The e10s label previously lacked its closing
    # parenthesis.
    handles = ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0]
    ax.legend(handles,
              ["e10s ({} samples)".format(len(e10s)), "non e10s ({} samples)".format(len(none10s))])

    plt.title(histogram)
    plt.xlabel(histogram)
    plt.ylabel("Frequency %")
    plt.show()

    # Single-argument print(...) emits the same text whether print is the
    # Python 2 statement or the Python 3 function.
    print("The probability that the distributions for {} are differing by chance is {:.2f}.".format(histogram, pvalue))
def normalize_uptime_hour(frame):
    """Rescale every column of ``frame`` to a per-hour rate.

    Rows whose ``simpleMeasurements/uptime`` is not strictly positive are
    discarded. Every remaining column is divided by the row's uptime and
    multiplied by 60 (so uptime is presumably expressed in minutes --
    confirm). The uptime column itself is removed from the result.

    Args:
        frame: DataFrame containing a ``simpleMeasurements/uptime`` column.

    Returns:
        A new DataFrame without the uptime column; the input is not modified.
    """
    uptime_column = "simpleMeasurements/uptime"

    running = frame[frame[uptime_column] > 0]
    uptime = running[uptime_column]

    # Metric per hour
    hourly = 60 * running.apply(lambda column: column / uptime)
    return hourly.drop(uptime_column, axis=1)
def compare_count_histograms(pings, *histograms_names):
    """Compare per-hour rates of count measurements between e10s and non-e10s.

    The requested properties, together with the uptime and the ``e10s``
    flag, are collected into a DataFrame. Each group is normalised with
    ``normalize_uptime_hour`` and every remaining column is handed to
    ``compare_scalars`` with missing values dropped.

    Args:
        pings: Ping collection accepted by ``get_pings_properties``.
        *histograms_names: Property paths of the measurements to compare.
    """
    wanted = histograms_names + ("simpleMeasurements/uptime", "e10s")
    frame = pd.DataFrame(get_pings_properties(pings, wanted).collect())

    flag = frame["e10s"]
    e10s = normalize_uptime_hour(frame[flag == True])
    none10s = normalize_uptime_hour(frame[flag == False])

    for name in e10s.columns:
        # Skip the group flag and any per-process breakdown columns.
        is_helper_column = name == "e10s" or name.endswith(("_parent", "_children"))
        if not is_helper_column:
            compare_scalars(name + " per hour", e10s[name].dropna(), none10s[name].dropna())
def compare_histograms(pings, *histogram_names):
    """Compare histograms between e10s and non-e10s pings, per process type.

    Properties are fetched with ``with_processes=True``; the code expects a
    ``<name>_parent`` and a ``<name>_children`` column next to each ``<name>``
    column. Depending on which of those hold data in the e10s group, up to
    three comparisons against the non-e10s ``<name>`` column are made:
    parent + children, parent only, children only.

    Args:
        pings: Ping collection accepted by ``get_pings_properties``.
        *histogram_names: Property paths of the histograms to compare.
    """
    properties = histogram_names + ("e10s",)
    rows = get_pings_properties(pings, properties, with_processes=True).collect()
    frame = pd.DataFrame(rows)

    flag = frame["e10s"]
    e10s = frame[flag == True]
    none10s = frame[flag == False]

    for name in none10s.columns:
        if name == "e10s" or name.endswith(("_parent", "_children")):
            continue

        children_column = name + "_children"
        parent_column = name + "_parent"
        has_children = e10s[children_column].notnull().any()
        has_parent = e10s[parent_column].notnull().any()

        # (label suffix, e10s column) pairs, in the order they are reported.
        variants = []
        if has_children and has_parent:
            variants.append((" (parent + children)", name))
        if has_parent:
            variants.append((" (parent)", parent_column))
        if has_children:
            variants.append((" (children)", children_column))

        for suffix, column in variants:
            compare_histogram(name + suffix, e10s[column].dropna(), none10s[name].dropna())
def compare_scalars(metric, *groups):
    """Print the median difference between two groups and its p-value.

    Args:
        metric: Human-readable name of the measurement, used in the message.
        *groups: Exactly two collections of scalar samples. The reported
            difference is ``median(groups[0]) - median(groups[1])``.

    The p-value comes from ``grouped_permutation_test`` with ``median_diff``
    as the statistic and 10000 samples.
    """
    # Single-argument print(...) emits the same text whether print is the
    # Python 2 statement or the Python 3 function.
    print("Median difference in {} is {:.2f}, ({:.2f}, {:.2f}).".format(
        metric,
        median_diff(*groups),
        np.median(groups[0]),
        np.median(groups[1])))

    pvalue = grouped_permutation_test(median_diff, groups, num_samples=10000)
    # The message previously misspelled "probability".
    print("The probability of this effect being purely by chance is {:.2f}.".format(pvalue))
# Load Firefox nightly pings for the given build_id pair and submission date.
# NOTE(review): ("20150615") is a plain string, not a 1-tuple; the build_id
# pair is presumably an inclusive range and fraction=1 presumably disables
# sampling -- confirm against get_pings.
pings = get_pings(sc, app="Firefox", channel="nightly", build_id=("20150601000000", "20150615999999"), submission_date=("20150615"), fraction=1)
# Number of pings loaded.
pings.count()
149656
# Work on the result of get_one_ping_per_client (one ping per client, going by
# the helper's name) for all comparisons below.
subset = get_one_ping_per_client(pings)
def add_e10s_discriminator(ping):
    """Tag a ping with ``has_payloads`` and ``e10s`` flags and return it.

    ``has_payloads`` is true when the ping has a non-empty ``childPayloads``
    entry. ``e10s`` is true when the ``E10S_AUTOSTART`` histogram is present,
    non-empty and its element at index 1 is positive; otherwise it is false.

    Args:
        ping: Mutable mapping with at least a ``histograms`` mapping.

    Returns:
        The same ``ping`` object, modified in place.
    """
    ping["has_payloads"] = bool(ping.get("childPayloads", {}))

    autostart = ping["histograms"].get("E10S_AUTOSTART", [])
    ping["e10s"] = autostart[1] > 0 if autostart else False

    return ping
# Tag every ping with the "has_payloads" and "e10s" flags.
subset = subset.map(add_e10s_discriminator)
def add_gecko_activity(ping):
    """Attach ``gecko_hangs_per_minute`` to a ping when it can be computed.

    For every entry of ``threadHangStats`` named ``"Gecko"``, the counts of
    the activity buckets whose key is greater than 100 are summed and divided
    by ``simpleMeasurements/uptime``. If several such entries exist, the last
    one wins. Pings without thread stats or with a non-positive (or missing)
    uptime are returned untouched.

    Args:
        ping: Mutable mapping with a ``simpleMeasurements`` mapping and,
            optionally, a ``threadHangStats`` sequence.

    Returns:
        The same ``ping`` object, possibly modified in place.
    """
    threads = ping.get("threadHangStats", {})
    uptime = ping["simpleMeasurements"].get("uptime", -1)

    if not threads:
        return ping
    if uptime <= 0:
        return ping

    for thread in threads:
        if thread["name"] != "Gecko":
            continue

        activity = thread["activity"]["values"]
        # Bucket keys arrive as strings; convert them so they compare numerically.
        bucket_keys = map(int, activity.keys())
        buckets = pd.Series(activity.values(), index=bucket_keys).sort_index()
        slow = buckets[buckets.index > 100].sum()
        ping["gecko_hangs_per_minute"] = slow/uptime

    return ping
# Attach "gecko_hangs_per_minute" to the pings for which it can be computed.
subset = subset.map(add_gecko_activity)
# Collect the scalar measurements compared below, plus the e10s flag, into a
# local DataFrame.
simple = pd.DataFrame(get_pings_properties(subset,
["simpleMeasurements/firstPaint",
"simpleMeasurements/sessionRestored",
"simpleMeasurements/sessionRestoreRestoring",
"simpleMeasurements/shutdownDuration",
"gecko_hangs_per_minute",
"e10s"]).collect())
# Split the rows into the e10s and non-e10s groups.
eSimple = simple[simple["e10s"] == True]
nSimple = simple[simple["e10s"] == False]
# Startup time is taken from firstPaint; rows without a value are dropped.
compare_scalars("startup time", eSimple["simpleMeasurements/firstPaint"].dropna(), nSimple["simpleMeasurements/firstPaint"].dropna())
Median difference in startup time is 551.00, (3630.00, 3079.00). The probablity of this effect being purely by chance is 0.00.
# Restore time = sessionRestored - sessionRestoreRestoring, per group; rows
# where the difference is missing are dropped.
eRestoreTime = (eSimple["simpleMeasurements/sessionRestored"] - eSimple["simpleMeasurements/sessionRestoreRestoring"]).dropna()
nRestoreTime = (nSimple["simpleMeasurements/sessionRestored"] - nSimple["simpleMeasurements/sessionRestoreRestoring"]).dropna()
compare_scalars("restore time", eRestoreTime, nRestoreTime)
Median difference in restore time is -38.00, (152.00, 190.00). The probablity of this effect being purely by chance is 0.00.
# Shutdown duration. Missing values are dropped first, as in the startup and
# restore comparisons, so that NaNs never reach np.median or the permutation
# test.
compare_scalars("shutdown duration", eSimple["simpleMeasurements/shutdownDuration"].dropna(), nSimple["simpleMeasurements/shutdownDuration"].dropna())
Median difference in shutdown duration is -124.00, (1156.00, 1280.00). The probablity of this effect being purely by chance is 0.00.
# Compare the FX_TAB_ANIM_*_FRAME_INTERVAL_MS histograms between the groups.
compare_histograms(subset,
"histograms/FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS",
"histograms/FX_TAB_ANIM_OPEN_FRAME_INTERVAL_MS",
"histograms/FX_TAB_ANIM_OPEN_PREVIEW_FRAME_INTERVAL_MS")
The probability that the distributions for histograms/FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/FX_TAB_ANIM_OPEN_FRAME_INTERVAL_MS (parent) are differing by chance is 0.68.
The probability that the distributions for histograms/FX_TAB_ANIM_OPEN_PREVIEW_FRAME_INTERVAL_MS (parent) are differing by chance is 0.08.
# Compare the FX_REFRESH_DRIVER_FRAME_DELAY_MS and EVENTLOOP_UI_LAG_EXP_MS histograms.
compare_histograms(subset, "histograms/FX_REFRESH_DRIVER_FRAME_DELAY_MS", "histograms/EVENTLOOP_UI_LAG_EXP_MS")
The probability that the distributions for histograms/EVENTLOOP_UI_LAG_EXP_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/EVENTLOOP_UI_LAG_EXP_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/EVENTLOOP_UI_LAG_EXP_MS (children) are differing by chance is 0.01.
The probability that the distributions for histograms/FX_REFRESH_DRIVER_FRAME_DELAY_MS (parent + children) are differing by chance is 0.98.
The probability that the distributions for histograms/FX_REFRESH_DRIVER_FRAME_DELAY_MS (parent) are differing by chance is 1.00.
The probability that the distributions for histograms/FX_REFRESH_DRIVER_FRAME_DELAY_MS (children) are differing by chance is 0.99.
# Hangs over 100ms per minute. "gecko_hangs_per_minute" is only set on pings
# that carry Gecko thread hang stats and a positive uptime, so missing values
# are dropped before comparing (as in the startup and restore comparisons).
compare_scalars("hangs over 100ms per minute", eSimple["gecko_hangs_per_minute"].dropna(), nSimple["gecko_hangs_per_minute"].dropna())
Median difference in hangs over 100ms per minute is 2.20, (21.17, 18.96). The probablity of this effect being purely by chance is 0.03.
# Compare the per-hour rate of the keyed histogram SUBPROCESS_ABNORMAL_ABORT, key "plugin".
compare_count_histograms(subset, "keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/plugin")
Median difference in keyedHistograms/SUBPROCESS_ABNORMAL_ABORT/plugin per hour is 0.06, (0.22, 0.15). The probablity of this effect being purely by chance is 0.12.
# Compare the FX_PAGE_LOAD_MS histogram between the groups.
compare_histograms(subset, "histograms/FX_PAGE_LOAD_MS")
The probability that the distributions for histograms/FX_PAGE_LOAD_MS (parent) are differing by chance is 0.08.
# Compare the per-hour rate of SLOW_SCRIPT_NOTICE_COUNT between the groups.
compare_count_histograms(subset, "histograms/SLOW_SCRIPT_NOTICE_COUNT")
Median difference in histograms/SLOW_SCRIPT_NOTICE_COUNT per hour is 0.15, (0.26, 0.11). The probablity of this effect being purely by chance is 0.09.
# Compare the FX_NEW_WINDOW_MS histogram between the groups.
compare_histograms(subset, "histograms/FX_NEW_WINDOW_MS")
The probability that the distributions for histograms/FX_NEW_WINDOW_MS (parent) are differing by chance is 0.00.
# Compare the first batch of GC_* timing histograms between the groups.
compare_histograms(subset,
"histograms/GC_MS",
"histograms/GC_MAX_PAUSE_MS",
"histograms/GC_MARK_MS",
"histograms/GC_SWEEP_MS",
"histograms/GC_MARK_ROOTS_MS")
The probability that the distributions for histograms/GC_MARK_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MARK_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MARK_MS (children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MARK_ROOTS_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MARK_ROOTS_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MARK_ROOTS_MS (children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MAX_PAUSE_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MAX_PAUSE_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MAX_PAUSE_MS (children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MS (children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SWEEP_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SWEEP_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SWEEP_MS (children) are differing by chance is 0.00.
# Compare the second batch of GC_* timing histograms between the groups.
compare_histograms(subset,
"histograms/GC_MARK_GRAY_MS",
"histograms/GC_SLICE_MS",
"histograms/GC_SCC_SWEEP_TOTAL_MS",
"histograms/GC_SCC_SWEEP_MAX_PAUSE_MS")
The probability that the distributions for histograms/GC_MARK_GRAY_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MARK_GRAY_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_MARK_GRAY_MS (children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SCC_SWEEP_MAX_PAUSE_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SCC_SWEEP_MAX_PAUSE_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SCC_SWEEP_MAX_PAUSE_MS (children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SCC_SWEEP_TOTAL_MS (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SCC_SWEEP_TOTAL_MS (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SCC_SWEEP_TOTAL_MS (children) are differing by chance is 0.00.
The probability that the distributions for histograms/GC_SLICE_MS (parent + children) are differing by chance is 0.08.
The probability that the distributions for histograms/GC_SLICE_MS (parent) are differing by chance is 0.06.
The probability that the distributions for histograms/GC_SLICE_MS (children) are differing by chance is 0.08.
# Compare the CYCLE_COLLECTOR* histograms between the groups.
compare_histograms(subset,
"histograms/CYCLE_COLLECTOR",
"histograms/CYCLE_COLLECTOR_WORKER",
"histograms/CYCLE_COLLECTOR_FULL",
"histograms/CYCLE_COLLECTOR_MAX_PAUSE",
"histograms/CYCLE_COLLECTOR_TIME_BETWEEN")
The probability that the distributions for histograms/CYCLE_COLLECTOR (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR (children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_FULL (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_FULL (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_FULL (children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_MAX_PAUSE (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_MAX_PAUSE (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_MAX_PAUSE (children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_TIME_BETWEEN (parent + children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_TIME_BETWEEN (parent) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_TIME_BETWEEN (children) are differing by chance is 0.00.
The probability that the distributions for histograms/CYCLE_COLLECTOR_WORKER (parent + children) are differing by chance is 0.14.
The probability that the distributions for histograms/CYCLE_COLLECTOR_WORKER (parent) are differing by chance is 0.06.
The probability that the distributions for histograms/CYCLE_COLLECTOR_WORKER (children) are differing by chance is 0.00.