from __future__ import division
import numpy
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import random
import re
import IPython
import time
from moztelemetry.spark import get_pings
from moztelemetry.histogram import Histogram
# Notebook cell expression: evaluates the `defaultParallelism` attribute of `sc`
# so that its value is shown as the cell output.
# NOTE(review): `sc` is not defined in this file -- presumably the SparkContext
# provided by the notebook environment; confirm.
sc.defaultParallelism
64
Before bug 1101487 landed:
%%capture
# Fetch the Firefox nightly pings for this window and cache the RDD, which is
# used by both counts below.
# NOTE(review): the two date arguments are presumably a build-id range and a
# submission date -- confirm against the get_pings signature.
pings = get_pings(
    sc,
    "Firefox",
    "nightly",
    "*",
    ("20141201", "20141208"),
    "20141208",
)
pings.cache()

total = pings.count()
with_client_id = pings.filter(lambda ping: "clientID" in ping).count()

# Percentage of pings for which the "clientID" membership test fails; the
# formatted string is the value of the cell.
# NOTE(review): if each ping is a raw JSON string, `in` is a substring match
# rather than a key lookup -- confirm.
"Missing clientIDs: {:.0f}% of {}".format(100 - 100 * with_client_id / total, total)
'Missing clientIDs: 20% of 120439'
After bug 1101487 landed:
%%capture
# Fetch the Firefox nightly pings for this window and cache the RDD, which is
# used by both counts below.
# NOTE(review): the two date arguments are presumably a build-id range and a
# submission date -- confirm against the get_pings signature.
pings = get_pings(
    sc,
    "Firefox",
    "nightly",
    "*",
    ("20141220", "20141227"),
    "20141227",
)
pings.cache()

total = pings.count()
with_client_id = pings.filter(lambda ping: "clientID" in ping).count()

# Percentage of pings for which the "clientID" membership test fails; the
# formatted string is the value of the cell.
# NOTE(review): if each ping is a raw JSON string, `in` is a substring match
# rather than a key lookup -- confirm.
"Missing clientIDs: {:.0f}% of {}".format(100 - 100 * with_client_id / total, total)
'Missing clientIDs: 100% of 99353'