import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from moztelemetry.histogram import Histogram
from montecarlino import grouped_permutation_test
from __future__ import division
from moztelemetry import histogram_tools
# Notebook setup: enable inline plotting. The recorded output two lines below
# shows that this also pulls numpy and matplotlib names into the namespace.
%pylab inline
# Default size for the figures drawn later (18 x 7; units presumably inches -- TODO confirm).
IPython.core.pylabtools.figsize(18, 7)
Populating the interactive namespace from numpy and matplotlib
# `sc` is never defined in this file -- presumably the SparkContext provided by
# the notebook environment (TODO confirm). The line after this one is the value displayed.
sc.defaultParallelism
16
def chi2_distance(xs, ys, eps=1e-10, normalize=True):
    """Return the chi-squared distance between two groups of histograms.

    Each group is collapsed into one aggregate histogram by summing along
    axis 0, and the aggregates are compared bucket by bucket (paired by
    position) with ``0.5 * sum((a - b) ** 2 / (a + b + eps))``.

    Args:
        xs: First group; any object whose ``sum(axis=0)`` yields the
            per-bucket totals (for example a 2-D array).
        ys: Second group, laid out like ``xs``.
        eps: Small constant added to every denominator so that buckets empty
            in both aggregates do not divide by zero.
        normalize: When true, scale each aggregate to sum to 1 before
            comparing, so groups of different sizes are comparable.

    Returns:
        The distance as a float; 0.0 when the aggregates are identical.
    """
    # Coerce to float arrays: this keeps the arithmetic in true division even
    # for integer counts and lets the whole formula run vectorised.
    hist_a = np.asarray(xs.sum(axis=0), dtype=float)
    hist_b = np.asarray(ys.sum(axis=0), dtype=float)

    if normalize:
        total_a = hist_a.sum()
        total_b = hist_b.sum()
        # An all-zero aggregate has nothing to normalise; leave it as zeros
        # instead of turning it into NaN through 0/0.
        if total_a:
            hist_a = hist_a / total_a
        if total_b:
            hist_b = hist_b / total_b

    return 0.5 * np.sum((hist_a - hist_b) ** 2 / (hist_a + hist_b + eps))
def median_diff(xs, ys):
    """Return the median of ``xs`` minus the median of ``ys``."""
    median_of_xs = np.median(xs)
    median_of_ys = np.median(ys)
    return median_of_xs - median_of_ys
def compare_histogram(histogram, e10s, none10s):
    """Plot two groups' aggregated histograms together and print a p-value.

    Args:
        histogram: Label used for the x-axis and in the printed message.
        e10s: Histograms of the e10s group; must support ``.sum()`` yielding
            an object with ``.plot`` (presumably a pandas Series of
            histograms -- confirm against the caller).
        none10s: Histograms of the non-e10s group, same layout.
    """
    # Permutation test on the un-aggregated groups, using chi2_distance as
    # the test statistic.
    p_value = grouped_permutation_test(chi2_distance, [e10s, none10s], num_samples=100)

    # Aggregate each group, then express every bucket as a percentage of the
    # group's total.
    e10s_counts = e10s.sum()
    non_e10s_counts = none10s.sum()
    e10s_pct = 100 * e10s_counts / e10s_counts.sum()
    non_e10s_pct = 100 * non_e10s_counts / non_e10s_counts.sum()

    figure = plt.figure()
    figure.subplots_adjust(hspace=0.3)
    left_axis = figure.add_subplot(1, 1, 1)
    right_axis = left_axis.twinx()
    bar_width = 0.4

    # The two bar series go on twin axes; the second is given the first's
    # y-limits so both are drawn to the same scale.
    e10s_pct.plot(kind="bar", alpha=0.5, color="green", label="e10s",
                  ax=left_axis, width=bar_width, position=0)
    non_e10s_pct.plot(kind="bar", alpha=0.5, color="blue", label="non e10s",
                      ax=right_axis, width=bar_width, position=1, grid=False,
                      ylim=left_axis.get_ylim())

    # One legend built from the handles of both axes.
    left_handles = left_axis.get_legend_handles_labels()[0]
    right_handles = right_axis.get_legend_handles_labels()[0]
    left_axis.legend(left_handles + right_handles, ["e10s", "non e10s"])

    plt.xlabel(histogram)
    plt.ylabel("Frequency %")
    plt.show()

    report = "The probability that the distributions for {} are differing by chance is {:.2f}."
    print(report.format(histogram, p_value))
def normalize_uptime_hour(frame):
    """Return ``frame`` with every column rescaled to a per-hour rate.

    Rows whose ``uptime`` is not positive are discarded. Each remaining
    column is divided by the row's ``uptime`` and multiplied by 60, and the
    ``uptime`` column is removed from the result. The input frame is not
    modified. (Multiplying by 60 gives "per hour" only if ``uptime`` is in
    minutes -- TODO confirm.)
    """
    positive = frame[frame["uptime"] > 0]
    uptime = positive["uptime"]

    def per_uptime(column):
        return column / uptime

    hourly = 60 * positive.apply(per_uptime)  # Metric per hour
    return hourly.drop('uptime', axis=1)
def compare_count_histograms(pings, *histograms_names):
    """Compare uptime-normalised count metrics between e10s and non-e10s pings.

    The requested properties, plus ``simpleMeasurements.uptime`` and
    ``e10s``, are collected into a DataFrame. Each half of the e10s split is
    normalised with ``normalize_uptime_hour`` and every remaining column is
    reported through ``compare_scalars`` with " per hour" appended to its name.

    Args:
        pings: Collection of pings accepted by ``get_pings_properties``.
        *histograms_names: Property paths of the count metrics to compare.
    """
    properties = histograms_names + ("simpleMeasurements.uptime", "e10s")
    rows = get_pings_properties(pings, properties).collect()
    frame = pd.DataFrame(rows)

    # Explicit comparisons against True/False (not truthiness) so rows whose
    # flag is neither value fall into neither group.
    flag = frame["e10s"]
    e10s = normalize_uptime_hour(frame[flag == True])
    none10s = normalize_uptime_hour(frame[flag == False])

    for histogram in e10s.columns:
        # The flag column is the discriminator, not a metric.
        if histogram != "e10s":
            compare_scalars(histogram + " per hour",
                            e10s[histogram].dropna(),
                            none10s[histogram].dropna())
def compare_histograms(pings, *histogram_names):
    """Run ``compare_histogram`` for each requested histogram, split by e10s.

    Args:
        pings: Collection of pings accepted by ``get_pings_properties``.
        *histogram_names: Property paths of the histograms to compare.
    """
    wanted = histogram_names + ("e10s",)
    rows = get_pings_properties(pings, wanted).collect()
    frame = pd.DataFrame(rows)

    # Explicit comparisons against True/False (not truthiness) so rows whose
    # flag is neither value fall into neither group.
    flag = frame["e10s"]
    e10s = frame[flag == True]
    none10s = frame[flag == False]

    for histogram in e10s.columns:
        # The flag column is the discriminator, not a histogram.
        if histogram != "e10s":
            compare_histogram(histogram,
                              e10s[histogram].dropna(),
                              none10s[histogram].dropna())
def compare_scalars(metric, *groups):
    """Print the median difference between two samples and its p-value.

    Two lines are printed: the difference of medians (first group minus
    second) together with each group's median, then the p-value from a
    permutation test that uses ``median_diff`` as the statistic.

    Args:
        metric: Human-readable metric name inserted into the report.
        *groups: The samples to compare. Exactly two are expected, because
            ``median_diff`` takes two arguments.
    """
    print("Median difference in {} is {:.2f}, ({:.2f}, {:.2f}).".format(
        metric,
        median_diff(*groups),
        np.median(groups[0]),
        np.median(groups[1])))

    p_value = grouped_permutation_test(median_diff, groups, num_samples=10000)
    # The original message misspelled "probability".
    print("The probability of this effect being purely by chance is {:.2f}.".format(p_value))
# Fetch Firefox nightly pings. The remaining arguments are passed straight to
# moztelemetry's get_pings (presumably version, build-id range, submission
# date and sampling fraction -- TODO confirm against its documentation).
pings = get_pings(sc, "Firefox", "nightly", "*", ("20150309000000", "20150315999999"), "20150316", 1)
# Number of pings selected; the bare number below is the recorded output.
pings.count()
111102
# Keep a single ping per client, as the helper's name indicates, so that no
# client contributes more than one sample to the comparisons.
subset = get_one_ping_per_client(pings)
def add_e10s_discriminator(ping):
    """Set ``ping["e10s"]`` to whether the ping has non-empty ``childPayloads``.

    The ping is modified in place and also returned, so the function can be
    used directly in a ``map``.
    """
    child_payloads = ping.get("childPayloads", {})
    ping["e10s"] = bool(child_payloads)
    return ping
# Tag every ping in the subset with its boolean "e10s" flag.
subset = subset.map(add_e10s_discriminator)
def add_gecko_activity(ping):
    """Annotate ``ping`` with its rate of long Gecko-thread activity buckets.

    For the thread named "Gecko" in ``threadHangStats``, the counts of all
    activity buckets whose key is greater than 100 are summed and divided by
    ``simpleMeasurements.uptime``; the result is stored under
    ``"gecko_hangs_per_minute"``. (The key name implies uptime is in minutes
    and bucket keys are milliseconds -- TODO confirm.)

    The ping is returned unchanged when it has no thread stats, no
    ``simpleMeasurements`` section, or a missing or non-positive uptime.

    Args:
        ping: Ping dictionary; modified in place.

    Returns:
        The same ``ping`` object.
    """
    threads = ping.get("threadHangStats", {})
    # A ping without simpleMeasurements is treated like one without uptime
    # instead of raising KeyError.
    uptime = (ping.get("simpleMeasurements") or {}).get("uptime", -1)
    if not threads or uptime <= 0:
        return ping

    for thread in threads:
        if thread["name"] != "Gecko":
            continue
        activity = thread["activity"]["values"]
        # Bucket keys are converted with int() before comparing, so string
        # keys are handled.
        over100 = sum(count for bucket, count in activity.items()
                      if int(bucket) > 100)
        # float() keeps this a true division even without future-division.
        ping["gecko_hangs_per_minute"] = float(over100) / uptime
    return ping
# Attach "gecko_hangs_per_minute" to the pings that have Gecko thread stats
# and a positive uptime.
subset = subset.map(add_gecko_activity)
# Collect the scalar measurements into a local DataFrame. The columns are
# later addressed without the "simpleMeasurements." prefix (e.g. "firstPaint").
simple = pd.DataFrame(get_pings_properties(subset,
["simpleMeasurements.firstPaint",
"simpleMeasurements.sessionRestored",
"simpleMeasurements.sessionRestoreRestoring",
"simpleMeasurements.shutdownDuration",
"gecko_hangs_per_minute",
"e10s"]).collect())
# Split the rows by the e10s flag.
eSimple = simple[simple["e10s"] == True]
nSimple = simple[simple["e10s"] == False]
# firstPaint is reported as "startup time"; missing values are dropped first.
compare_scalars("startup time", eSimple["firstPaint"].dropna(), nSimple["firstPaint"].dropna())
Median difference in startup time is 74.00, (3570.00, 3496.00). The probablity of this effect being purely by chance is 0.24.
# Restore time = sessionRestored - sessionRestoreRestoring, per ping; rows
# where the difference is missing are dropped.
eRestoreTime = (eSimple["sessionRestored"] - eSimple["sessionRestoreRestoring"]).dropna()
nRestoreTime = (nSimple["sessionRestored"] - nSimple["sessionRestoreRestoring"]).dropna()
compare_scalars("restore time", eRestoreTime, nRestoreTime)
Median difference in restore time is -28.00, (118.00, 146.00). The probablity of this effect being purely by chance is 0.00.
# Drop missing values first, as the startup-time and restore-time comparisons
# do, so pings without a shutdownDuration cannot distort the medians or the
# permutation test. The recorded output that follows predates this change.
compare_scalars("shutdown duration", eSimple["shutdownDuration"].dropna(), nSimple["shutdownDuration"].dropna())
Median difference in shutdown duration is -106.00, (2240.00, 2346.00). The probablity of this effect being purely by chance is 0.00.
# Tab-animation frame-interval histograms, e10s vs non-e10s. The lines after
# each call are the recorded output of that run.
compare_histograms(subset,
"histograms.FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS",
"histograms.FX_TAB_ANIM_OPEN_FRAME_INTERVAL_MS",
"histograms.FX_TAB_ANIM_OPEN_PREVIEW_FRAME_INTERVAL_MS")
The probability that the distributions for FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS are differing by chance is 0.00.
The probability that the distributions for FX_TAB_ANIM_OPEN_FRAME_INTERVAL_MS are differing by chance is 0.02.
The probability that the distributions for FX_TAB_ANIM_OPEN_PREVIEW_FRAME_INTERVAL_MS are differing by chance is 0.00.
# Refresh-driver frame delay and event-loop UI lag histograms.
compare_histograms(subset, "histograms.FX_REFRESH_DRIVER_FRAME_DELAY_MS", "histograms.EVENTLOOP_UI_LAG_EXP_MS")
The probability that the distributions for EVENTLOOP_UI_LAG_EXP_MS are differing by chance is 0.00.
The probability that the distributions for FX_REFRESH_DRIVER_FRAME_DELAY_MS are differing by chance is 0.13.
# "gecko_hangs_per_minute" is only set on pings that have Gecko thread stats
# and a positive uptime, so drop the missing values before comparing, as the
# startup-time comparison does. The recorded output that follows predates this change.
compare_scalars("hangs over 100ms per minute", eSimple["gecko_hangs_per_minute"].dropna(), nSimple["gecko_hangs_per_minute"].dropna())
Median difference in hangs over 100ms per minute is -4.44, (2.71, 7.15). The probablity of this effect being purely by chance is 0.00.
# Count metric, normalised by uptime inside compare_count_histograms.
# NOTE(review): the recorded output below lacks the per-group medians that
# compare_scalars prints, so it appears to predate the current format string.
compare_count_histograms(subset, "keyedHistograms.SUBPROCESS_ABNORMAL_ABORT.plugin")
Median difference in SUBPROCESS_ABNORMAL_ABORT.plugin per hour is 0.01. The probablity of this effect being purely by chance is 0.67.
# Page-load time histogram.
compare_histograms(subset, "histograms.FX_PAGE_LOAD_MS")
The probability that the distributions for FX_PAGE_LOAD_MS are differing by chance is 0.00.
# Count metric, normalised by uptime; the same note about the recorded output
# format applies here.
compare_count_histograms(subset, "histograms.SLOW_SCRIPT_NOTICE_COUNT")
Median difference in SLOW_SCRIPT_NOTICE_COUNT per hour is -0.04. The probablity of this effect being purely by chance is 0.35.
# New-window time histogram.
compare_histograms(subset, "histograms.FX_NEW_WINDOW_MS")
The probability that the distributions for FX_NEW_WINDOW_MS are differing by chance is 0.00.
# Tab-switch time histogram.
compare_histograms(subset, "histograms.FX_TAB_SWITCH_TOTAL_MS")
The probability that the distributions for FX_TAB_SWITCH_TOTAL_MS are differing by chance is 0.96.
# GC_* histograms. The recorded output lists them in alphabetical order rather
# than in the order passed here.
compare_histograms(subset,
"histograms.GC_MS",
"histograms.GC_MAX_PAUSE_MS",
"histograms.GC_MARK_MS",
"histograms.GC_SWEEP_MS",
"histograms.GC_MARK_ROOTS_MS",
"histograms.GC_MARK_GRAY_MS",
"histograms.GC_SLICE_MS",
"histograms.GC_SCC_SWEEP_TOTAL_MS",
"histograms.GC_SCC_SWEEP_MAX_PAUSE_MS")
The probability that the distributions for GC_MARK_GRAY_MS are differing by chance is 0.00.
The probability that the distributions for GC_MARK_MS are differing by chance is 0.00.
The probability that the distributions for GC_MARK_ROOTS_MS are differing by chance is 0.00.
The probability that the distributions for GC_MAX_PAUSE_MS are differing by chance is 0.00.
The probability that the distributions for GC_MS are differing by chance is 0.00.
The probability that the distributions for GC_SCC_SWEEP_MAX_PAUSE_MS are differing by chance is 0.00.
The probability that the distributions for GC_SCC_SWEEP_TOTAL_MS are differing by chance is 0.00.
The probability that the distributions for GC_SLICE_MS are differing by chance is 0.00.
The probability that the distributions for GC_SWEEP_MS are differing by chance is 0.00.
# CYCLE_COLLECTOR* histograms.
compare_histograms(subset,
"histograms.CYCLE_COLLECTOR",
"histograms.CYCLE_COLLECTOR_WORKER",
"histograms.CYCLE_COLLECTOR_FULL",
"histograms.CYCLE_COLLECTOR_MAX_PAUSE",
"histograms.CYCLE_COLLECTOR_TIME_BETWEEN")
The probability that the distributions for CYCLE_COLLECTOR are differing by chance is 0.00.
The probability that the distributions for CYCLE_COLLECTOR_FULL are differing by chance is 0.00.
The probability that the distributions for CYCLE_COLLECTOR_MAX_PAUSE are differing by chance is 0.00.
The probability that the distributions for CYCLE_COLLECTOR_TIME_BETWEEN are differing by chance is 0.00.
The probability that the distributions for CYCLE_COLLECTOR_WORKER are differing by chance is 0.00.