import binascii
from operator import attrgetter, itemgetter
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
from collections import defaultdict
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Display the default parallelism reported by `sc`.
# NOTE(review): `sc` is not defined in this file; presumably the SparkContext
# supplied by the notebook environment - confirm.
sc.defaultParallelism
80
Get all main pings for a set of recent build-ids:
# Request the "main" pings (schema "v4") of Firefox on the nightly channel for
# the build-id range given below.
# NOTE(review): `sc` is not defined in this file; presumably the SparkContext
# supplied by the notebook environment - confirm.
pings = get_pings(sc,
app="Firefox",
channel="nightly",
build_id=("20150710000000", "20150717999999"),
doc_type="main",
schema="v4")
Take only 10% of nightly clients:
def sample(ping):
    """Decide whether a ping belongs to the 10% client sample.

    The client id is hashed with CRC32 and the ping is accepted when the
    checksum modulo 100 is below 10, so the decision depends only on the
    client id and not on any other field of the ping.

    Args:
        ping: Mapping that may hold the client identifier under "clientId".

    Returns:
        bool: True when the client id hashes into the sample; False when the
        id is missing, None or empty, or hashes outside the sample.
    """
    client_id = ping.get("clientId")
    if not client_id:
        # No usable id: always answer with a real boolean, never None or "".
        return False
    # crc32 operates on bytes; encode text ids so this also runs on Python 3.
    if not isinstance(client_id, bytes):
        client_id = client_id.encode("utf-8")
    return binascii.crc32(client_id) % 100 < 10
# Keep only the pings accepted by the sample() predicate.
sampled = pings.filter(sample)
Get a subset of fields:
# Reduce every sampled ping to the fields listed below. Later code indexes the
# resulting records by these same path strings (e.g. "meta/documentId").
subset = get_pings_properties(sampled, ["clientId",
"meta/documentId",
"environment/system/os/name",
"payload/info/reason",
"payload/info/sessionId",
"payload/info/subsessionId",
"payload/info/previousSessionId",
"payload/info/previousSubsessionId",
"payload/info/subsessionCounter",
"payload/info/profileSubsessionCounter"])
Group fragments by client and dedupe by documentId:
def dedupe_and_sort(group):
    """Drop duplicated fragments of one client and order the remainder.

    Args:
        group: A ``(key, fragments)`` pair; only the iterable of fragment
            dicts is used, the key is ignored.

    Returns:
        list: The fragments with only the first occurrence of each
        "meta/documentId" kept, sorted by ascending
        "payload/info/profileSubsessionCounter".
    """
    _, history = group
    seen_ids = set()
    unique = []
    for fragment in history:
        doc_id = fragment["meta/documentId"]
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        unique.append(fragment)
    unique.sort(key=itemgetter("payload/info/profileSubsessionCounter"))
    return unique
# Group the records by "clientId", clean each group with dedupe_and_sort() and
# collect the resulting per-client histories; later code takes len() of the
# result and iterates over it several times.
grouped = subset.groupBy(lambda x: x["clientId"]).map(dedupe_and_sort).collect()
< Digression> What's the percentage of clients that have at least one pair of fragments with different documentIds but the same profileSubsessionCounter?
def duplicate_pssc(grouped):
    """Find the clients whose history repeats a profileSubsessionCounter.

    Prints the percentage of histories in which at least two fragments share
    the same "payload/info/profileSubsessionCounter" value; 0.0 is printed
    when ``grouped`` is empty instead of dividing by zero.

    Args:
        grouped: Sized collection of per-client histories, each a sequence of
            fragment dicts.

    Returns:
        set: The "clientId" of the first fragment of every history that
        contains a repeated counter value.
    """
    dupes = 0
    dupe_clients = set()
    for history in grouped:
        counters = [
            fragment["payload/info/profileSubsessionCounter"]
            for fragment in history
        ]
        # Fewer distinct values than fragments means some counter repeats.
        if len(set(counters)) < len(counters):
            dupes += 1
            dupe_clients.add(history[0]["clientId"])
    total = len(grouped)
    percentage = 100.0 * dupes / total if total else 0.0
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(percentage)
    return dupe_clients
# Print the share of affected clients and remember their ids for the filtering
# step that follows.
dupe_clients = duplicate_pssc(grouped)
3.10894480306
< /Digression> Let's remove those clients to be safe.
# Drop every history whose client was flagged in dupe_clients. A list is built
# explicitly because dd_grouped is traversed more than once below, and on
# Python 3 filter() would return an iterator that is exhausted after one pass.
dd_grouped = [
    history for history in grouped
    if history[0]["clientId"] not in dupe_clients
]
Given the set of chain breaks in consecutive sessions, how many of them are due to missing starting/ending fragments?
def missing(grouped, cmp, reason="", debug=False):
    """Measure which share of cross-session chain breaks satisfies ``cmp``.

    Consecutive fragments of each history are compared. A chain break is a
    pair whose "payload/info/profileSubsessionCounter" values are not
    consecutive while the later fragment's "payload/info/previousSessionId"
    equals the earlier fragment's "payload/info/sessionId". Pairs where the
    earlier fragment ended its session ("aborted-session" or "shutdown") and
    the later one has "payload/info/subsessionCounter" equal to 1 are not
    counted. Scanning of a history stops at the first fragment that belongs
    to the session of the history's last fragment.

    Args:
        grouped: Iterable of per-client histories; each history is a sequence
            of fragment dicts that supports slicing.
        cmp: Predicate called as ``cmp(prev_fragment, curr_fragment)`` for
            every counted chain break.
        reason: Label inserted into the printed summary line.
        debug: When true, the raw counts are printed as well.

    Returns:
        float: Percentage of counted chain breaks for which ``cmp`` returned
        a true value; 0.0 when no chain break was counted.
    """
    session_end_reasons = ("aborted-session", "shutdown")
    cmp_missing = 0
    other_missing = 0

    for history in grouped:
        if not history:
            # Nothing to compare; also avoids indexing an empty history.
            continue
        last_session_id = history[-1]["payload/info/sessionId"]

        for prev_fragment, curr_fragment in zip(history, history[1:]):
            # Ignore fragments from the last session as it might not have
            # yet completed.
            if curr_fragment["payload/info/sessionId"] == last_session_id:
                break

            prev_counter = prev_fragment["payload/info/profileSubsessionCounter"]
            curr_counter = curr_fragment["payload/info/profileSubsessionCounter"]
            if prev_counter + 1 == curr_counter:
                continue  # the chain is intact

            # Only breaks between two consecutive sessions are considered.
            if (prev_fragment["payload/info/sessionId"]
                    != curr_fragment["payload/info/previousSessionId"]):
                continue

            # Ignore fake missing fragments: the earlier session ended and
            # the later fragment is the first of its session.
            if (prev_fragment["payload/info/reason"] in session_end_reasons
                    and curr_fragment["payload/info/subsessionCounter"] == 1):
                continue

            if cmp(prev_fragment, curr_fragment):
                cmp_missing += 1
            else:
                other_missing += 1

    total_missing = cmp_missing + other_missing
    # Without any counted chain break the share is reported as 0.0 rather
    # than raising ZeroDivisionError.
    frac = 100.0 * cmp_missing / total_missing if total_missing else 0.0

    if debug:
        print("CMP {}, Other {}, Total {}".format(
            cmp_missing, other_missing, total_missing))
    # NOTE(review): indentation was lost in the exported source; the summary
    # line is assumed to be printed regardless of `debug` - confirm.
    print("{:.2f}% of chain breaks are due to {} fragments".format(frac, reason))
    return frac
def ending_cmp(prev, curr):
    """Tell whether one or more ending fragments are missing.

    Args:
        prev: Fragment located before the chain break.
        curr: Fragment located after the chain break; unused, accepted so the
            function fits the two-argument predicate that missing() calls.

    Returns:
        bool: True when the "payload/info/reason" of ``prev`` is neither
        "aborted-session" nor "shutdown".
    """
    return prev["payload/info/reason"] not in ("aborted-session", "shutdown")
def starting_cmp(prev, curr):
    """Tell whether one or more starting fragments are missing.

    Args:
        prev: Fragment located before the chain break; unused, accepted so
            the function fits the two-argument predicate that missing() calls.
        curr: Fragment located after the chain break.

    Returns:
        bool: True when the "payload/info/subsessionCounter" of ``curr``
        differs from 1.
    """
    return curr["payload/info/subsessionCounter"] != 1
# Share of chain breaks attributed to missing ending fragments and to missing
# starting fragments; debug=True additionally prints the raw counts.
ending = missing(dd_grouped, ending_cmp, "ending", debug=True)
starting = missing(dd_grouped, starting_cmp, "starting", debug=True)
CMP 29, Other 78, Total 107 27.10% of chain breaks are due to ending fragments CMP 78, Other 29, Total 107 72.90% of chain breaks are due to starting fragments