%matplotlib inline
from __future__ import division  # must precede other statements in the cell

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import IPython
import scipy.stats
import statsmodels.stats.proportion as proportion
import statsmodels.stats.weightstats as weightstats
from moztelemetry.spark import get_pings
IPython.core.pylabtools.figsize(18, 7)
Verify that all workers are good to go:
sc.defaultParallelism
144
Fetch pings:
pings = get_pings(sc, "Firefox", "beta", "35.0", "*", ("20141229", "20150104"))  # app, channel, version, any build ID, submission date range
The total number of submissions considered is:
%%capture
pings.count()
47584756
Get the subset of submissions that are part of the experiment:
experiment_name = "flash-protectedmode-beta35@experiments.mozilla.org"
def fast_filter(raw):
    # Cheap substring check to avoid JSON-parsing pings that can't possibly match
    return "activeExperiment" in raw

def parse(raw):
    return json.loads(raw)

def is_experiment(ping):
    # Keep only pings from the experiment, on Windows Vista (6.0) or Windows 7 (6.1)
    info = ping["info"]
    return info["activeExperiment"] == experiment_name and info["OS"] == "WINNT" \
           and info["version"] in ("6.1", "6.0")

def extract_count(container, histogram_name, kind):
    histograms = container.get(histogram_name, {})
    histogram = histograms.get(kind, None)
    return histogram["sum"] if histogram else 0

def is_control(ping):
    return ping["info"]["activeExperimentBranch"] == "control"

def extract(ping):
    keyed = ping.get("keyedHistograms", {})
    clientID = ping.get("clientID", None)
    uptime = ping["simpleMeasurements"]["uptime"]
    control = is_control(ping)
    aborts = extract_count(keyed, "SUBPROCESS_ABNORMAL_ABORT", "plugin")
    crashes = extract_count(keyed, "SUBPROCESS_CRASHES_WITH_DUMP", "plugin")
    hangs = extract_count(keyed, "SUBPROCESS_CRASHES_WITH_DUMP", "pluginhang")
    lag = ping["histograms"].get("EVENTLOOP_UI_LAG_EXP_MS", None)
    lag = lag[:-5] if lag else lag  # keep only the bucket counts; the trailing 5 entries are aggregate fields
    return (clientID, aborts, crashes, hangs, control, lag, uptime)

data = pings.filter(fast_filter).map(parse).filter(is_experiment).map(extract)
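For illustration only, here is a minimal synthetic ping (all values hypothetical) in the shape the helpers above assume, and the tuple extract produces for it:

# A minimal synthetic ping (hypothetical values) matching the assumed layout
sample = {"clientID": "some-client-id",
          "info": {"activeExperiment": experiment_name,
                   "activeExperimentBranch": "control",
                   "OS": "WINNT",
                   "version": "6.1"},
          "simpleMeasurements": {"uptime": 42},
          "keyedHistograms": {"SUBPROCESS_CRASHES_WITH_DUMP": {"plugin": {"sum": 1}}},
          "histograms": {"EVENTLOOP_UI_LAG_EXP_MS": [10] * 20 + [0] * 5}}  # 20 buckets + 5 aggregate fields
raw = json.dumps(sample)
assert fast_filter(raw) and is_experiment(parse(raw))
extract(parse(raw))  # -> ("some-client-id", 0, 1, 0, True, [10, ..., 10], 42)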
Collect back the data to the driver:
%%capture
collected_data = data.collect()
The number of submissions that are part of the experiment is:
len(collected_data)
2312411
frame = pd.DataFrame(collected_data, columns=["clientID", "aborts", "crashes", "hangs", "control", "lag", "uptime"])
frame = frame[frame["uptime"] > 0] # Ignore submissions with negative or zero uptime (see Bug 1106048)
frame = frame.groupby("clientID").last().reset_index() # To avoid bias, use only the last submission for each Client ID
frame["lag"] = frame["lag"].map(np.array)
frame["lag"] = frame["lag"] / frame["uptime"] # Normalize lag histograms by uptime
frame["ping"] = 1
The number of submissions that are part of the experiment, after removing all but one submission per client, is:
len(frame)
167470
Plugin specific metrics:

- aborts is a measure of all plugin aborts, no matter what kind
- crashes is a measure of plugin crashes that triggered breakpad crash reporting
- hangs is a measure of plugin hangs that triggered breakpad crash reporting

Since crashes and hangs are also reported as aborts, we subtract them from the abort count to consider only the aborts that did not produce a crash report.
frame["aborts"] = frame["aborts"] - frame["crashes"] - frame["hangs"]
binary_frame = frame.copy()
binary_frame["aborts"] = binary_frame["aborts"].map(bool)
binary_frame["crashes"] = binary_frame["crashes"].map(bool)
binary_frame["hangs"] = binary_frame["hangs"].map(bool)
frame.to_json("1111791.json")
def error(row):
    # Wilson confidence interval for the proportion of submissions with at least one failure
    count = row[True]
    nobs = row.sum()
    return map(lambda x: float("{:.3f}".format(x)), proportion.proportion_confint(count, nobs, method="wilson"))

def compare_proportions(metric):
    agg = pd.pivot_table(binary_frame, index=["control", metric], values="ping", aggfunc=np.sum)
    default = pd.DataFrame({False: {False: 0, True: 0}, True: {False: 0, True: 0}})
    contingency_table = agg.unstack().combine_first(default)
    control = contingency_table.ix[True]
    experiment = contingency_table.ix[False]
    p1 = control[True]/control.sum()
    p2 = experiment[True]/experiment.sum()
    pvalue = scipy.stats.fisher_exact(contingency_table)[1]
    print "Contingency table:\n"
    print contingency_table
    print "\nThe estimated proportion of {} in the experiment branch is {:.3f} and its CI is {}".format(metric, p2, error(experiment))
    print "The estimated proportion of {} in the control branch is {:.3f} and its CI is {}".format(metric, p1, error(control))
    print "The probability that the ratios are different purely by chance is {:.3f}".format(pvalue)
compare_proportions("aborts")
Contingency table:

aborts   False  True
control
False    81207   2586
True     81268   2409

The estimated proportion of aborts in the experiment branch is 0.031 and its CI is [0.03, 0.032]
The estimated proportion of aborts in the control branch is 0.029 and its CI is [0.028, 0.03]
The probability that the ratios are different purely by chance is 0.013
def compare_means(metric):
    control_mask = frame["control"]
    experiment = weightstats.DescrStatsW(frame[~control_mask][metric])
    control = weightstats.DescrStatsW(frame[control_mask][metric])
    mean_diff = experiment.mean - control.mean
    # Welch's t-test (unequal variances) for the difference of the means
    comparator = weightstats.CompareMeans(experiment, control)
    ci_diff = comparator.tconfint_diff(usevar="unequal")[1] - mean_diff
    ci_control = control.tconfint_mean()[1] - control.mean
    ci_experiment = experiment.tconfint_mean()[1] - experiment.mean
    print "The mean number of {} of the experiment branch is {:.3f} +- {:.3f}".format(metric, experiment.mean, ci_experiment)
    print "The mean number of {} of the control branch is {:.3f} +- {:.3f}".format(metric, control.mean, ci_control)
    print "The difference between the means is {:.3f} +- {:.3f}".format(mean_diff, ci_diff)
    print "The probability to see this difference purely by chance is {:.3f}".format(comparator.ttest_ind(usevar="unequal")[1])
compare_means("aborts")
The mean number of aborts of the experiment branch is 0.035 +- 0.002
The mean number of aborts of the control branch is 0.037 +- 0.008
The difference between the means is -0.002 +- 0.008
The probability to see this difference purely by chance is 0.656
def hist_failures(metric):
    counts = frame.groupby(["control", metric])["clientID"].count().unstack().fillna(0)
    counts = (100 * counts.T / counts.sum(axis=1)).ix[1:]  # drop the 0 bucket, whose high frequency would dwarf the other values
    counts.plot(kind="bar")
    plt.title("Histogram of submissions for {}".format(metric))
    plt.ylabel("submissions %")
hist_failures("aborts")
compare_proportions("crashes")
Contingency table:

crashes  False  True
control
False    83424    369
True     82959    718

The estimated proportion of crashes in the experiment branch is 0.004 and its CI is [0.004, 0.005]
The estimated proportion of crashes in the control branch is 0.009 and its CI is [0.008, 0.009]
The probability that the ratios are different purely by chance is 0.000
compare_means("crashes")
The mean number of crashes of the experiment branch is 0.045 +- 0.013
The mean number of crashes of the control branch is 0.179 +- 0.079
The difference between the means is -0.134 +- 0.080
The probability to see this difference purely by chance is 0.001
hist_failures("crashes")
compare_proportions("hangs")
Contingency table:

hangs    False  True
control
False    83055    738
True     82202   1475

The estimated proportion of hangs in the experiment branch is 0.009 and its CI is [0.008, 0.009]
The estimated proportion of hangs in the control branch is 0.018 and its CI is [0.017, 0.019]
The probability that the ratios are different purely by chance is 0.000
compare_means("hangs")
The mean number of hangs of the experiment branch is 0.012 +- 0.001
The mean number of hangs of the control branch is 0.022 +- 0.001
The difference between the means is -0.011 +- 0.002
The probability to see this difference purely by chance is 0.000
hist_failures("hangs")
control_mask = frame["control"]
control = frame[control_mask]["lag"]
experiment = frame[~control_mask]["lag"]
We can't use a simple Chi-Square test as some of its assumptions are not met. Furthermore, with sample sizes this large, a statistical test will nearly always report a significant-looking result. To account for the variability between the profiles that make up the aggregate, let's instead use a Monte Carlo permutation test combined with a histogram distance metric.
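Concretely, for two histograms $H_A = (a_1, \dots, a_n)$ and $H_B = (b_1, \dots, b_n)$, the distance implemented below is the Chi-Squared histogram distance

$$\chi^2(H_A, H_B) = \frac{1}{2} \sum_{i=1}^{n} \frac{(a_i - b_i)^2}{a_i + b_i},$$

and the permutation test estimates the p-value as the fraction of $N$ random relabelings of the profiles whose aggregate histograms end up farther apart than the observed ones:

$$\hat{p} = \frac{1}{N} \sum_{j=1}^{N} \mathbf{1}\!\left[\chi^2\big(H_A^{(j)}, H_B^{(j)}\big) > \chi^2(H_A, H_B)\right].$$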
# Chi-Squared histogram distance, see http://www.cs.huji.ac.il/~ofirpele/publications/ECCV2010.pdf
def chi2_distance(histA, histB, eps=1e-10):
    # eps avoids division by zero for empty buckets
    d = 0.5 * np.sum([((a - b) ** 2) / (a + b + eps)
                      for (a, b) in zip(histA, histB)])
    return d

def mc_permutation_test(xs, ys, num):
    n, k = len(xs), 0
    h1 = xs.sum()  # aggregate histogram of the first group
    h2 = ys.sum()  # aggregate histogram of the second group
    diff = chi2_distance(h1, h2)  # observed distance
    zs = pd.concat([xs, ys])
    zs.index = np.arange(0, len(zs))
    for j in range(num):
        # Shuffle the group labels and recompute the distance
        zs = zs.reindex(np.random.permutation(zs.index))
        h1 = zs[:n].sum()
        h2 = zs[n:].sum()
        k += diff < chi2_distance(h1, h2)
    return k / num  # fraction of permutations that beat the observed distance
The probability to observe the difference between the lag histograms of the control and experimental groups by chance is:
print "{:.3f}".format(mc_permutation_test(control, experiment, 1000))
0.000
None of the 1,000 permutations yielded a distance as large as the observed one, so the effect is statistically significant and not due to noise; let's plot the distributions of EVENTLOOP_UI_LAG_EXP_MS:
labels = ["0", "50", "74", "110", "163", "242", "359", "532", "789", "1.17k",
          "1.74k", "2.57k", "3.81k", "5.65k", "8.38k", "12.42k", "18.42k", "27.3k", "40.47k", "60k"]
experiment_total = pd.Series(experiment.sum(), index=labels)
control_total = pd.Series(control.sum(), index=labels)
normalizer = control_total.sum()
control_total = 100*control_total/normalizer
experiment_total = 100*experiment_total/normalizer # Normalize both histograms relative to control to keep the proportions comparable
experiment_total.plot(kind="bar", alpha=0.5, color="green", label="experiment")
control_total.plot(kind="bar", alpha=0.5, color="blue", label="control")
plt.xlabel("Event loop lag before a UI message is processed (ms)")
plt.ylabel("Lag frequency %")
plt.legend()