# Notebook export: reports what share of Firefox nightly telemetry pings were
# submitted without a "clientID" key, for two one-week windows in December 2014.
from __future__ import division

import random
import re
import time

import IPython
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import ujson as json

from moztelemetry.histogram import Histogram
from moztelemetry.spark import get_pings

# NOTE(review): `sc` is not defined in this file; it is presumably the
# SparkContext injected by the notebook environment -- confirm before running
# this as a standalone script.
sc.defaultParallelism


def _missing_client_id_report(ping_rdd, ping_count):
    """Return a one-line summary of how many pings lack a "clientID" key.

    Args:
        ping_rdd: Ping collection exposing ``filter(predicate).count()``.
        ping_count: Total number of pings in ``ping_rdd``.

    Returns:
        A string of the form ``"Missing clientIDs: <pct>% of <ping_count>"``,
        or a "n/a" variant when ``ping_count`` is zero.
    """
    if not ping_count:
        # An empty window would otherwise raise ZeroDivisionError below.
        return "Missing clientIDs: n/a of {}".format(ping_count)
    with_client_id = ping_rdd.filter(lambda p: "clientID" in p).count()
    missing_pct = 100 - 100 * with_client_id / ping_count
    return "Missing clientIDs: {:.0f}% of {}".format(missing_pct, ping_count)


# --- Window 1 ---------------------------------------------------------------
# The notebook ran this under the IPython cell magic `%%capture`; it is kept
# as a comment so the file parses as plain Python.
# %%capture
# NOTE(review): the last two arguments are presumably the build-ID range and
# the submission date -- confirm against the get_pings signature.
pings = get_pings(sc, "Firefox", "nightly", "*", ("20141201", "20141208"), "20141208")
pings.cache()  # the collection is counted twice below (total, then filtered)
total = pings.count()
print(_missing_client_id_report(pings, total))

# --- Window 2 ---------------------------------------------------------------
# %%capture
pings = get_pings(sc, "Firefox", "nightly", "*", ("20141220", "20141227"), "20141227")
pings.cache()
total = pings.count()
print(_missing_client_id_report(pings, total))