%matplotlib inline
from __future__ import division  # must precede other statements in the cell

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython
import scipy.stats
import statsmodels.stats.proportion as proportion
import statsmodels.stats.weightstats as weightstats
from moztelemetry.spark import get_pings
IPython.core.pylabtools.figsize(18, 7)
Verify that all workers are good to go:
sc.defaultParallelism
144
Fetch pings:
pings = get_pings(sc, "Firefox", "beta", "35.0", "*", ("20141229", "20150104"))  # app, channel, version, any build ID, submission date range
The total number of submissions considered is:
%%capture
pings.count()
47584756
Get the subset of submissions that are part of the experiment:
experiment_name = "flash-protectedmode-beta35@experiments.mozilla.org"
def fast_filter(raw):
    # Cheap substring check to avoid JSON-parsing pings that can't possibly match
    return "activeExperiment" in raw

def parse(raw):
    return json.loads(raw)

def is_experiment(ping):
    # Keep only pings from the experiment, on Windows Vista (6.0) or Windows 7 (6.1)
    info = ping["info"]
    return info["activeExperiment"] == experiment_name and info["OS"] == "WINNT" \
           and info["version"] in ("6.1", "6.0")

def extract_count(container, histogram_name, kind):
    histograms = container.get(histogram_name, {})
    histogram = histograms.get(kind, None)
    return histogram["sum"] if histogram else 0

def is_control(ping):
    return ping["info"]["activeExperimentBranch"] == "control"

def extract(ping):
    keyed = ping.get("keyedHistograms", {})
    clientID = ping.get("clientID", None)
    uptime = ping["simpleMeasurements"]["uptime"]
    control = is_control(ping)
    aborts = extract_count(keyed, "SUBPROCESS_ABNORMAL_ABORT", "plugin")
    crashes = extract_count(keyed, "SUBPROCESS_CRASHES_WITH_DUMP", "plugin")
    hangs = extract_count(keyed, "SUBPROCESS_CRASHES_WITH_DUMP", "pluginhang")
    lag = ping["histograms"].get("EVENTLOOP_UI_LAG_EXP_MS", None)
    lag = lag[:-5] if lag else lag  # keep only the bucket counts; the trailing 5 entries are aggregate fields
    return (clientID, aborts, crashes, hangs, control, lag, uptime)

data = pings.filter(fast_filter).map(parse).filter(is_experiment).map(extract)
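For illustration only, here is a minimal synthetic ping (all values hypothetical) in the shape the helpers above assume, and the tuple extract produces for it:

# A minimal synthetic ping (hypothetical values) matching the assumed layout
sample = {"clientID": "some-client-id",
          "info": {"activeExperiment": experiment_name,
                   "activeExperimentBranch": "control",
                   "OS": "WINNT",
                   "version": "6.1"},
          "simpleMeasurements": {"uptime": 42},
          "keyedHistograms": {"SUBPROCESS_CRASHES_WITH_DUMP": {"plugin": {"sum": 1}}},
          "histograms": {"EVENTLOOP_UI_LAG_EXP_MS": [10] * 20 + [0] * 5}}  # 20 buckets + 5 aggregate fields
raw = json.dumps(sample)
assert fast_filter(raw) and is_experiment(parse(raw))
extract(parse(raw))  # -> ("some-client-id", 0, 1, 0, True, [10, ..., 10], 42)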
Collect back the data to the driver:
%%capture
collected_data = data.collect()
The number of submissions that are part of the experiment is:
len(collected_data)
2312411
frame = pd.DataFrame(collected_data, columns=["clientID", "aborts", "crashes", "hangs", "control", "lag", "uptime"])
frame = frame[frame["uptime"] > 0] # Ignore submissions with negative or zero uptime (see Bug 1106048)
frame = frame.groupby("clientID").last().reset_index() # To avoid bias, use only the last submission for each Client ID
frame["lag"] = frame["lag"].map(np.array)
frame["lag"] = frame["lag"] / frame["uptime"] # Normalize lag histograms by uptime
frame["ping"] = 1
The number of submissions that are part of the experiment, after removing all but one submission per client, is:
len(frame)
167470
Plugin specific metrics:

- aborts is a measure of all plugin aborts, no matter what kind
- crashes is a measure of plugin crashes that triggered breakpad crash reporting
- hangs is a measure of plugin hangs that triggered breakpad crash reporting

Since crashes and hangs are also reported as aborts, we subtract them from the abort count to consider only the aborts that did not produce a crash report.
frame["aborts"] = frame["aborts"] - frame["crashes"] - frame["hangs"]
binary_frame = frame.copy()
binary_frame["aborts"] = binary_frame["aborts"].map(bool)
binary_frame["crashes"] = binary_frame["crashes"].map(bool)
binary_frame["hangs"] = binary_frame["hangs"].map(bool)
frame.to_json("1111791.json")
def error(row):
    # Wilson confidence interval for the proportion of submissions with at least one failure
    count = row[True]
    nobs = row.sum()
    return map(lambda x: float("{:.3f}".format(x)), proportion.proportion_confint(count, nobs, method="wilson"))

def compare_proportions(metric):
    agg = pd.pivot_table(binary_frame, index=["control", metric], values="ping", aggfunc=np.sum)
    default = pd.DataFrame({False: {False: 0, True: 0}, True: {False: 0, True: 0}})
    contingency_table = agg.unstack().combine_first(default)
    control = contingency_table.ix[True]
    experiment = contingency_table.ix[False]
    p1 = control[True]/control.sum()
    p2 = experiment[True]/experiment.sum()
    pvalue = scipy.stats.fisher_exact(contingency_table)[1]
    print "Contingency table:\n"
    print contingency_table
    print "\nThe estimated proportion of {} in the experiment branch is {:.3f} and its CI is {}".format(metric, p2, error(experiment))
    print "The estimated proportion of {} in the control branch is {:.3f} and its CI is {}".format(metric, p1, error(control))
    print "The probability that the ratios are different purely by chance is {:.3f}".format(pvalue)
compare_proportions("aborts")
Contingency table:

aborts   False  True
control
False    81207   2586
True     81268   2409

The estimated proportion of aborts in the experiment branch is 0.031 and its CI is [0.03, 0.032]
The estimated proportion of aborts in the control branch is 0.029 and its CI is [0.028, 0.03]
The probability that the ratios are different purely by chance is 0.013
def compare_means(metric):
    control_mask = frame["control"]
    experiment = weightstats.DescrStatsW(frame[~control_mask][metric])
    control = weightstats.DescrStatsW(frame[control_mask][metric])
    mean_diff = experiment.mean - control.mean
    # Welch's t-test (unequal variances) for the difference of the means
    comparator = weightstats.CompareMeans(experiment, control)
    ci_diff = comparator.tconfint_diff(usevar="unequal")[1] - mean_diff
    ci_control = control.tconfint_mean()[1] - control.mean
    ci_experiment = experiment.tconfint_mean()[1] - experiment.mean
    print "The mean number of {} of the experiment branch is {:.3f} +- {:.3f}".format(metric, experiment.mean, ci_experiment)
    print "The mean number of {} of the control branch is {:.3f} +- {:.3f}".format(metric, control.mean, ci_control)
    print "The difference between the means is {:.3f} +- {:.3f}".format(mean_diff, ci_diff)
    print "The probability to see this difference purely by chance is {:.3f}".format(comparator.ttest_ind(usevar="unequal")[1])
compare_means("aborts")
The mean number of aborts of the experiment branch is 0.035 +- 0.002
The mean number of aborts of the control branch is 0.037 +- 0.008
The difference between the means is -0.002 +- 0.008
The probability to see this difference purely by chance is 0.656
def hist_failures(metric):
    counts = frame.groupby(["control", metric])["clientID"].count().unstack().fillna(0)
    counts = (100 * counts.T / counts.sum(axis=1)).ix[1:]  # drop the 0 bucket, whose high frequency would dwarf the other values
    counts.plot(kind="bar")
    plt.title("Histogram of submissions for {}".format(metric))
    plt.ylabel("submissions %")
hist_failures("aborts")
compare_proportions("crashes")
Contingency table:

crashes  False  True
control
False    83424    369
True     82959    718

The estimated proportion of crashes in the experiment branch is 0.004 and its CI is [0.004, 0.005]
The estimated proportion of crashes in the control branch is 0.009 and its CI is [0.008, 0.009]
The probability that the ratios are different purely by chance is 0.000
compare_means("crashes")
The mean number of crashes of the experiment branch is 0.045 +- 0.013
The mean number of crashes of the control branch is 0.179 +- 0.079
The difference between the means is -0.134 +- 0.080
The probability to see this difference purely by chance is 0.001
hist_failures("crashes")
compare_proportions("hangs")
Contingency table:

hangs    False  True
control
False    83055    738
True     82202   1475

The estimated proportion of hangs in the experiment branch is 0.009 and its CI is [0.008, 0.009]
The estimated proportion of hangs in the control branch is 0.018 and its CI is [0.017, 0.019]
The probability that the ratios are different purely by chance is 0.000
compare_means("hangs")
The mean number of hangs of the experiment branch is 0.012 +- 0.001
The mean number of hangs of the control branch is 0.022 +- 0.001
The difference between the means is -0.011 +- 0.002
The probability to see this difference purely by chance is 0.000
hist_failures("hangs")
control_mask = frame["control"]
control = frame[control_mask]["lag"]
experiment = frame[~control_mask]["lag"]
We can't use a simple Chi-Square test as some of its assumptions are not met. Furthermore, with sample sizes this large, a statistical test will nearly always report a significant-looking result. To account for the variability between the profiles that make up the aggregate, let's instead use a Monte Carlo permutation test combined with a histogram distance metric.
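Concretely, for two histograms $H_A = (a_1, \dots, a_n)$ and $H_B = (b_1, \dots, b_n)$, the distance implemented below is the Chi-Squared histogram distance

$$\chi^2(H_A, H_B) = \frac{1}{2} \sum_{i=1}^{n} \frac{(a_i - b_i)^2}{a_i + b_i},$$

and the permutation test estimates the p-value as the fraction of $N$ random relabelings of the profiles whose aggregate histograms end up farther apart than the observed ones:

$$\hat{p} = \frac{1}{N} \sum_{j=1}^{N} \mathbf{1}\!\left[\chi^2\big(H_A^{(j)}, H_B^{(j)}\big) > \chi^2(H_A, H_B)\right].$$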
# Chi-Squared histogram distance, see http://www.cs.huji.ac.il/~ofirpele/publications/ECCV2010.pdf
def chi2_distance(histA, histB, eps=1e-10):
    # eps avoids division by zero for empty buckets
    d = 0.5 * np.sum([((a - b) ** 2) / (a + b + eps)
                      for (a, b) in zip(histA, histB)])
    return d

def mc_permutation_test(xs, ys, num):
    n, k = len(xs), 0
    h1 = xs.sum()  # aggregate histogram of the first group
    h2 = ys.sum()  # aggregate histogram of the second group
    diff = chi2_distance(h1, h2)  # observed distance
    zs = pd.concat([xs, ys])
    zs.index = np.arange(0, len(zs))
    for j in range(num):
        # Shuffle the group labels and recompute the distance
        zs = zs.reindex(np.random.permutation(zs.index))
        h1 = zs[:n].sum()
        h2 = zs[n:].sum()
        k += diff < chi2_distance(h1, h2)
    return k / num  # fraction of permutations that beat the observed distance
The probability to observe the difference between the lag histograms of the control and experimental groups by chance is:
print "{:.3f}".format(mc_permutation_test(control, experiment, 1000))
0.000
None of the 1,000 permutations yielded a distance as large as the observed one, so the effect is statistically significant and not due to noise; let's plot the distributions of EVENTLOOP_UI_LAG_EXP_MS:
labels = ["0", "50", "74", "110", "163", "242", "359", "532", "789", "1.17k",
          "1.74k", "2.57k", "3.81k", "5.65k", "8.38k", "12.42k", "18.42k", "27.3k", "40.47k", "60k"]
experiment_total = pd.Series(experiment.sum(), index=labels)
control_total = pd.Series(control.sum(), index=labels)
normalizer = control_total.sum()
control_total = 100*control_total/normalizer
experiment_total = 100*experiment_total/normalizer # Normalize both histograms relative to control to keep the proportions comparable
experiment_total.plot(kind="bar", alpha=0.5, color="green", label="experiment")
control_total.plot(kind="bar", alpha=0.5, color="blue", label="control")
plt.xlabel("Event loop lag before a UI message is processed (ms)")
plt.ylabel("Lag frequency %")
plt.legend()