import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import networkx as nx
import collections
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Display the default parallelism reported by `sc` (presumably the SparkContext
# injected by the notebook environment -- confirm against the cluster setup).
sc.defaultParallelism
48
# Fetch nightly-channel Firefox pings in the v4 schema whose submission date
# falls in the 20150507..20150514 range. fraction=1 presumably requests the
# full, unsampled set -- confirm against the moztelemetry documentation.
pings = get_pings(
    sc,
    app="Firefox",
    channel="nightly",
    submission_date=("20150507", "20150514"),
    fraction=1,
    schema="v4",
)
# Pull a single ping so its structure can be inspected interactively.
p = pings.first()
# p["payload"]["info"]
# Show the top-level keys of that ping.
p.keys()
# p.get("payload",{}).get("info",{}).get("subsessionId",False)
# {k:p[k] for k in p.keys() if k!="main"}
[u'clientId', u'id', u'environment', u'application', u'version', 'meta', u'creationDate', u'type', u'payload']
To distinguish it from the other ids running around, we'll call the top-level 'id' field the "pingId".
Note that we have to filter out the 'meta' entry, because it can contain things like intake timestamps, which should be expected to change if the same ping is sent twice.
# Group pings by their top-level "id" (falling back to "MISSING" when absent).
# Each ping becomes a one-element list holding a copy of itself without the
# "meta" entry; lists that share an id are then concatenated.
pingsByPingId = (
    pings
    .map(lambda ping: (
        ping.get("id", "MISSING"),
        [{key: ping[key] for key in ping.keys() if key != "meta"}],
    ))
    .reduceByKey(lambda left, right: left + right)
)
pingsByPingId.cache()
# Keep a handful of groups around locally for inspection.
pById = pingsByPingId.take(10)
Let's call the number of pings that share a pingId the "multiplicity" of that pingId. The tuples below are of the form:
(multiplicity of a pingId, number of pingIds with that multiplicity)
# Histogram of group sizes: emit (number of pings sharing the id, 1) for every
# pingId and sum the ones per group size.
pingIdMultiplicities = (
    pingsByPingId
    .map(lambda entry: (len(entry[1]), 1))
    .reduceByKey(lambda total, increment: total + increment)
)
pingIdMultiplicities.cache()
# Number of distinct multiplicities observed.
pingIdMultiplicities.count()
29
# Pull every (multiplicity, number of pingIds) tuple into a local list for display.
pingIdMultiplicities.collect()
[(1, 1578748), (2, 5146), (3, 419), (4, 116), (5, 59), (6, 37), (7, 27), (8, 33), (9, 18), (10, 29), (11, 4), (12, 9), (13, 13), (14, 1), (15, 12), (18, 9), (20, 1), (22, 2), (23, 1), (24, 1), (25, 3), (26, 1), (28, 3), (29, 1), (30, 3), (31, 2), (32, 8), (40, 1), (41, 12)]
Next let's set aside all the sets of pings that have any non-identical members
def allEqual(l):
    """Return True when every element of ``l`` compares equal to the first.

    Args:
        l: any iterable (list, tuple, generator, ...). Empty and
            single-element inputs are considered "all equal".

    Returns:
        bool: False as soon as one element differs from the first element
        (using ``!=``), True otherwise.
    """
    elements = iter(l)
    try:
        first = next(elements)
    except StopIteration:
        # Nothing to compare: vacuously all equal.
        return True
    # any() short-circuits on the first mismatch and avoids copying the input.
    return not any(element != first for element in elements)
# Keep only the pingIds that have more than one ping AND whose pings are not
# all identical to each other.
pingsByPingId_nonexactDupes = pingsByPingId.filter(
    lambda entry: len(entry[1]) > 1 and not allEqual(entry[1])
)
pingsByPingId_nonexactDupes.cache()
pingsByPingId_nonexactDupes.count()
329
# There are only 329 such groups, so bring them all local for further work.
nonexactDupesSample = pingsByPingId_nonexactDupes.collect()
# Count how many groups exist for each group size.
collections.Counter(len(pingList) for _pingId, pingList in nonexactDupesSample)
Counter({2: 320, 3: 9})
So, from the above we see that there are 320 pingIds that show up in sets of two pings that are not all identical, and 9 pingIds that show up in sets of 3 pings that are not all identical. Note that in the case of the 3 ping sets, this does not mean that all 3 are necessarily different, just that at least one is not the same as the others.
So we know that not all ping submissions are atomic. What can change within a ping between submissions? In the 9 cases of 3 pings under the same pingId, we'll simplify by just looking at the first two.
# cribbed from http://stackoverflow.com/questions/5903720/recursive-diff-of-two-python-dictionaries-keys-and-values
def list_to_dict(l):
    """Convert a sequence into a dict keyed by each element's index as a string.

    Args:
        l: any iterable of values.

    Returns:
        dict: ``{"0": first, "1": second, ...}``; empty input gives ``{}``.
    """
    return {str(index): item for index, item in enumerate(l)}
def dictDiff2(d1, d2, path=""):
    """Recursively diff two JSON-like dicts and report the paths that differ.

    Args:
        d1: first mapping.
        d2: second mapping.
        path: slash-separated prefix prepended to every reported path; the
            recursion uses it to carry the location of nested values.

    Returns:
        list: ``(change_type, path)`` tuples where ``change_type`` is either
        ``"path not present in both"`` (key exists on one side only) or
        ``"value changed"`` (key exists on both sides with unequal values that
        are not both dicts or both lists). Keys missing from ``d2`` are
        reported first, followed by findings in ``d2``'s iteration order.
    """
    changes = []

    # Keys that only d1 has.
    for k in d1:
        if k not in d2:
            # %-formatting (rather than "+") also copes with non-string keys.
            changes.append(("path not present in both", "%s/%s" % (path, k)))

    for k in d2:
        if k not in d1:
            changes.append(("path not present in both", "%s/%s" % (path, k)))
            continue

        old, new = d1[k], d2[k]
        if new != old:
            child_path = "%s/%s" % (path, k)
            if isinstance(old, dict) and isinstance(new, dict):
                # Both sides are mappings: descend to find the exact leaf.
                changes += dictDiff2(old, new, child_path)
            elif isinstance(old, list) and isinstance(new, list):
                # Lists are diffed position by position by viewing them as
                # dicts keyed by stringified index (see list_to_dict).
                changes += dictDiff2(list_to_dict(old), list_to_dict(new), child_path)
            else:
                # Scalars, or containers of mismatched kinds.
                changes.append(("value changed", child_path))
    return changes
# Diff the first two pings of every non-identical group. A list comprehension
# (rather than map) guarantees a real list, because pingChanges is traversed
# more than once below.
# NOTE(review): in a group of three pings the first two may be identical, in
# which case this yields an empty change list -- presumably the source of any
# zero-difference entry in the tally; confirm.
pingChanges = [dictDiff2(pingList[0], pingList[1])
               for _pingId, pingList in nonexactDupesSample]
# How many differences are there between the two pings for each pingId?
collections.Counter(len(pc) for pc in pingChanges)
Counter({1: 119, 2: 59, 3: 32, 8: 26, 5: 19, 4: 18, 6: 14, 11: 11, 9: 8, 14: 7, 10: 5, 12: 5, 7: 4, 0: 1, 18: 1})
From the above we see that these mismatched pings most commonly differ in only one place, but they can sometimes differ in several places (up to 18 places in the data considered). However, we don't see massive changes across e.g. hundreds or thousands of paths. So the next sub-question is: which paths differ between the two pings, and how often?
# Flatten the per-pingId change lists into one list of (changeType, changePath)
# tuples. A comprehension avoids the builtin reduce (absent on Python 3) and
# the quadratic cost of repeated list concatenation.
pingChangesFlat = [change for changeList in pingChanges for change in changeList]

# changes[changeType][changePath] -> number of times that path showed that
# kind of change.
changes = {}
for changeType, changePath in pingChangesFlat:
    pathCounts = changes.setdefault(changeType, {})
    pathCounts[changePath] = pathCounts.get(changePath, 0) + 1
The following shows json paths that are only present in one of the two pings as
(number of times the path was missing, path)
# (count, path) tuples for paths present in only one of the two pings, most
# frequent first. The result stays a sorted list: converting the pairs to a
# dict keyed by count would keep just one path per distinct count and discard
# the ordering, so any output produced that way is incomplete.
sorted(
    ((count, path) for path, count in changes["path not present in both"].items()),
    reverse=True,
)
{1: u'/payload/addonDetails/XPI/1tinUyqW@cx.com/shutdown_MS', 2: u'/payload/addonDetails/XPI/CanvasBlocker@kkapsner.de/shutdown_MS', 3: u'/payload/addonDetails/XPI/browsec@browsec.com/shutdown_MS', 4: u'/payload/addonDetails/XPI/firebug@software.joehewitt.com/shutdown_MS', 5: u'/payload/addonDetails/XPI/compatibility@addons.mozilla.org/shutdown_MS', 6: u'/payload/addonDetails/XPI/aboutsessionstore@dt/shutdown_MS', 7: u'/payload/addonDetails/XPI/arpit2@techraga.in/shutdown_MS', 8: u'/payload/addonDetails/XPI/CSTBB@NArisT2_Noia4dev/shutdown_MS', 9: u'/payload/addonDetails/XPI/en-US@dictionaries.addons.mozilla.org/shutdown_MS', 10: u'/payload/addonDetails/XPI/firefox@mega.co.nz/shutdown_MS', 11: u'/payload/addonDetails/XPI/the-addon-bar@GeekInTraining-GiT/shutdown_MS', 12: u'/payload/addonDetails/XPI/adbhelper@mozilla.org/shutdown_MS', 13: u'/payload/addonDetails/XPI/{b9db16a4-6edc-47ec-a1f4-b86292ed211d}/shutdown_MS', 14: u'/payload/addonDetails/XPI/jid1-xUfzOsOFlzSOXg@jetpack/shutdown_MS', 16: u'/payload/addonDetails/XPI/uriloader@pdf.js/shutdown_MS', 18: u'/payload/addonDetails/XPI/jid1-cwbvBTE216jjpg@jetpack/shutdown_MS', 20: u'/payload/addonDetails/XPI/{2b10c1c8-a11f-4bad-fe9c-1c11e82cac42}/shutdown_MS', 22: u'/payload/addonDetails/XPI/skip_compatibility_check@sdrocking.com/shutdown_MS', 24: u'/payload/addonDetails/XPI/firefox@ghostery.com/shutdown_MS', 25: u'/payload/addonDetails/XPI/check-compatibility@dactyl.googlecode.com/shutdown_MS', 30: u'/payload/addonDetails/XPI/elemhidehelper@adblockplus.org/shutdown_MS', 58: u'/payload/addonDetails/XPI/mediahint@jetpack/shutdown_MS', 81: u'/payload/addonDetails/XPI/{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}/shutdown_MS'}
The following shows json paths that had different values between the two pings as
(number of times the path differed, path)
# (count, path) tuples for paths whose values differed between the two pings,
# most frequent first. The result stays a sorted list: converting the pairs to
# a dict keyed by count would keep just one path per distinct count and discard
# the ordering, so any output produced that way is incomplete.
sorted(
    ((count, path) for path, count in changes["value changed"].items()),
    reverse=True,
)
{1: u'/payload/addonDetails/XPI/dta@downthemall.net/shutdown_MS', 2: u'/payload/simpleMeasurements/UITelemetry/contextmenu/__DEFAULT__/["link","image"]/withoutcustom/openlinkintab', 3: u'/payload/addonDetails/XPI/firebug@software.joehewitt.com/creator', 4: u'/payload/addonDetails/XPI/firebug@software.joehewitt.com/startup_MS', 5: u'/payload/simpleMeasurements/UITelemetry/contextmenu/__DEFAULT__/["link"]/withoutcustom/openlinkintab', 6: u'/payload/simpleMeasurements/UITelemetry/toolbars/countableEvents/__DEFAULT__/click-builtin-item/back-button/left', 11: u'/payload/addonDetails/XPI/firebug@software.joehewitt.com/shutdown_MS', 47: u'/payload/info/reason', 64: u'/payload/simpleMeasurements/UITelemetry/toolbars/countableEvents/__DEFAULT__/click-builtin-item/tabbrowser-tabs/left'}