#!/usr/bin/env python
# coding: utf-8

# ### E10S Experiment Beta, EVENTLOOP_UI_ACTIVITY_EXP_MS >= 50ms

# In[1]:


import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import IPython

from __future__ import division
from moztelemetry.spark import get_pings, get_one_ping_per_client, get_pings_properties
from montecarlino import grouped_permutation_test

# Notebook setup: run the `%pylab inline` line magic through the IPython
# shell object, then call IPython's figsize helper with (16, 7) so the
# figures drawn later in this notebook are wide.
get_ipython().run_line_magic('pylab', 'inline')
IPython.core.pylabtools.figsize(16, 7)


# In[2]:


# Bare expression: in a notebook its value is shown as the cell output.
# `sc` is not defined anywhere in this file -- presumably the SparkContext
# injected by the notebook environment (TODO confirm).
sc.defaultParallelism


# In[18]:


def chi2_distance(xs, ys, eps=1e-10, normalize=True):
    """Return the chi-squared distance between two groups of histograms.

    Each group is first collapsed into one histogram by summing along
    axis 0.  The distance is then
    ``0.5 * sum((a - b) ** 2 / (a + b + eps))`` over buckets paired by
    position.

    Args:
        xs: First group; any object supporting ``.sum(axis=0)`` whose result
            is a sequence of bucket counts (e.g. a 2-D array with one
            histogram per row).
        ys: Second group, same requirements as ``xs``.
        eps: Small constant added to every denominator so that a bucket
            that is empty in both groups does not divide by zero.
        normalize: When true, scale each summed histogram so that its
            buckets add up to 1 before comparing.

    Returns:
        The distance as a float; 0 when both summed histograms are equal.
    """
    # Work on float arrays so the division below is always true division,
    # regardless of the input dtype or of ``from __future__ import division``.
    hist_a = np.asarray(xs.sum(axis=0), dtype=float)
    hist_b = np.asarray(ys.sum(axis=0), dtype=float)

    if normalize:
        total_a = hist_a.sum()
        total_b = hist_b.sum()
        # A histogram with a zero total has nothing to scale; dividing
        # would turn every bucket (and hence the distance) into NaN.
        if total_a != 0:
            hist_a = hist_a / total_a
        if total_b != 0:
            hist_b = hist_b / total_b

    # Pair buckets by position and stop at the shorter histogram, the same
    # pairing zip() would produce.
    common = min(len(hist_a), len(hist_b))
    a = hist_a[:common]
    b = hist_b[:common]

    return 0.5 * np.sum((a - b) ** 2 / (a + b + eps))


def compare_histogram(histogram, e10s, none10s, from_bin=50, num_samples=100):
    """Plot one histogram for e10s vs non-e10s and print a test summary.

    Every per-ping histogram is restricted to the buckets whose label is
    at least ``from_bin``.  A grouped permutation test on ``chi2_distance``
    is then run over the two groups, the per-bucket sums of each group are
    drawn as side-by-side bars, and a one-line summary is printed.

    Args:
        histogram: Name of the measure; used as the plot title, the x-axis
            label and in the printed summary.
        e10s: pandas Series holding one histogram per e10s ping; each
            element must be indexable by a boolean mask on its ``.index``.
        none10s: Same as ``e10s`` for the non-e10s pings.
        from_bin: Lowest bucket label to keep.
        num_samples: Number of samples passed to
            ``grouped_permutation_test``.

    Returns:
        None.  The function shows a figure and prints to stdout.
    """
    # Keep only the tail of each histogram (bucket labels >= from_bin).
    e10s = e10s.map(lambda hist: hist[hist.index >= from_bin])
    none10s = none10s.map(lambda hist: hist[hist.index >= from_bin])

    # The result is reported below as the probability that the observed
    # difference arose by chance (presumably a p-value -- see montecarlino).
    pvalue = grouped_permutation_test(chi2_distance, [e10s, none10s],
                                      num_samples=num_samples)

    # NOTE(review): these are raw per-bucket sums, while the y label below
    # says "Frequency %" -- confirm whether normalisation was intended.
    e_total = e10s.sum()
    n_total = none10s.sum()

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.3)

    ax = fig.add_subplot(1, 1, 1)
    ax2 = ax.twinx()
    width = 0.4
    y_max = max(e_total.max(), n_total.max())

    # The two bar series live on twin axes that share the same y limits;
    # position=0 / position=1 place the bars next to each other.
    e_total.plot(kind="bar", alpha=0.5, color="green", label="e10s", ax=ax,
                 width=width, position=0, ylim=(0, y_max + 1))
    n_total.plot(kind="bar", alpha=0.5, color="blue", label="non e10s", ax=ax2,
                 width=width, position=1, grid=False, ylim=ax.get_ylim())

    # Merge the handles of both axes into a single legend.
    handles = ax.get_legend_handles_labels()[0] + ax2.get_legend_handles_labels()[0]
    labels = ["e10s ({} samples)".format(len(e10s)),
              "non e10s ({} samples)".format(len(none10s))]
    ax.legend(handles, labels)

    plt.title(histogram)
    plt.xlabel(histogram)
    plt.ylabel("Frequency %")
    plt.show()

    # Parenthesised single-argument form behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print("The probability that the distributions for {} are differing by chance is {:.2f}.".format(histogram, pvalue))
    
        
def compare_histograms(pings, *histogram_names):
    """Compare every requested histogram between e10s and non-e10s pings.

    The requested properties (plus the ``e10s`` flag) are extracted from
    ``pings`` with per-process breakdown enabled and collected into a
    DataFrame.  For each base histogram column, the e10s data is compared
    against the non-e10s data up to three times: aggregated, parent only
    and children only, depending on which per-process columns contain data.

    Args:
        pings: Collection of pings accepted by ``get_pings_properties``.
        *histogram_names: Property paths of the histograms to compare.
    """
    wanted = histogram_names + ("e10s",)
    rows = get_pings_properties(pings, wanted, with_processes=True).collect()
    frame = pd.DataFrame(rows)

    e10s_flag = frame["e10s"]
    e10s = frame[e10s_flag == True]
    none10s = frame[e10s_flag == False]

    process_suffixes = ("_parent", "_children")

    for histogram in none10s.columns:
        # Skip the flag column and the per-process columns; the latter are
        # reached through their base histogram name below.
        if histogram == "e10s" or histogram.endswith(process_suffixes):
            continue

        children_column = histogram + "_children"
        parent_column = histogram + "_parent"

        has_children = e10s[children_column].notnull().any()
        has_parent = e10s[parent_column].notnull().any()

        # (title suffix, e10s column) for each comparison, in display order.
        variants = []
        if has_children and has_parent:
            variants.append((" (parent + children)", histogram))
        if has_parent:
            variants.append((" (parent)", parent_column))
        if has_children:
            variants.append((" (children)", children_column))

        # The non-e10s side is always the base histogram column.
        for suffix, e10s_column in variants:
            compare_histogram(histogram + suffix,
                              e10s[e10s_column].dropna(),
                              none10s[histogram].dropna())


# #### Get e10s and non-e10s partitions

# In[4]:


# Parquet export of the e10s beta experiment read by this notebook.
EXPERIMENT_DATASET_PATH = "s3://telemetry-parquet/e10s-experiment/e10s-enabled-beta-20151214@experiments.mozilla.org/generationDate=20160106"

dataset = sqlContext.read.load(EXPERIMENT_DATASET_PATH, "parquet")


# Only consider builds with [bug 1234618](https://bugzilla.mozilla.org/show_bug.cgi?id=1234618) BHR fix:

# In[5]:


# Build ids are compared against this value as strings.
MIN_BUILD_ID = "20151228134903"

dataset = dataset.filter(dataset.buildId >= MIN_BUILD_ID)


# Sample by clientId:

# In[6]:


# Rows whose sampleId is at most this value are kept.
MAX_SAMPLE_ID = 50

sampled = dataset.filter(dataset.sampleId <= MAX_SAMPLE_ID)


# In[7]:


# Bare expression: shows the number of sampled rows as the cell output.
sampled.count()


# Transform the DataFrame into an RDD of pings:

# In[8]:


def row_2_ping(row):
    """Rebuild a ping-shaped dict from one row of the experiment dataset.

    The five JSON-encoded columns are decoded and placed under
    ``"payload"``; ``"e10s"`` is True exactly when the row's
    ``experimentBranch`` equals ``"experiment"``.

    Args:
        row: Object exposing ``simpleMeasurements``, ``histograms``,
            ``keyedHistograms``, ``childPayloads`` and ``threadHangStats``
            as JSON strings, plus ``experimentBranch``.

    Returns:
        A dict with the keys ``"payload"`` and ``"e10s"``.
    """
    json_columns = (
        "simpleMeasurements",
        "histograms",
        "keyedHistograms",
        "childPayloads",
        "threadHangStats",
    )
    payload = {column: json.loads(getattr(row, column)) for column in json_columns}
    in_experiment = row.experimentBranch == "experiment"
    return {"payload": payload, "e10s": in_experiment}


# In[9]:


# Turn every row of the sampled DataFrame into a ping dict via row_2_ping.
subset = sampled.rdd.map(row_2_ping)


# #### Event processing

# In[19]:


# Compare e10s against non-e10s pings for the EVENTLOOP_UI_ACTIVITY_EXP_MS
# histogram; plots and printed summaries are produced by compare_histograms.
compare_histograms(subset, "payload/histograms/EVENTLOOP_UI_ACTIVITY_EXP_MS")