#!/usr/bin/env python
# coding: utf-8

# Notebook: deduplicate v4 "main" ping fragments per client and order each
# client's history by profileSubsessionCounter. Runs on a Spark cluster
# (`sc` is the SparkContext provided by the notebook environment).

# In[2]:

import binascii
from operator import attrgetter, itemgetter

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
from collections import defaultdict

get_ipython().run_line_magic('pylab', 'inline')


# In[3]:

sc.defaultParallelism


# Get all main pings for a set of recent build-ids:

# In[4]:

pings = get_pings(sc,
                  app="Firefox",
                  channel="nightly",
                  build_id=("20150710000000", "20150717999999"),
                  doc_type="main",
                  schema="v4")


# Take only 10% of nightly clients:

# In[5]:

def sample(ping):
    """Keep a ping iff its clientId hashes into the first 10 of 100 buckets.

    Hashing (instead of random sampling) keeps the sampled client subset
    stable across runs. Pings without a clientId are dropped.
    """
    client_id = ping.get("clientId", None)
    # bool(...) so the predicate always returns True/False; the original
    # returned None/"" for missing ids, which filter() also treats as False.
    return bool(client_id) and binascii.crc32(ping["clientId"]) % 100 < 10

sampled = pings.filter(sample)


# Get a subset of fields:

# In[6]:

subset = get_pings_properties(sampled, ["clientId",
                                        "meta/documentId",
                                        "environment/system/os/name",
                                        "payload/info/reason",
                                        "payload/info/sessionId",
                                        "payload/info/subsessionId",
                                        "payload/info/previousSessionId",
                                        "payload/info/previousSubsessionId",
                                        "payload/info/subsessionCounter",
                                        "payload/info/profileSubsessionCounter"])


# Group fragments by client and dedupe by documentId:

# In[7]:

def dedupe_and_sort(group):
    """Dedupe one client's fragments by documentId and sort them.

    `group` is a (clientId, fragments) pair as produced by RDD.groupBy.
    The first fragment seen for each documentId wins; survivors are sorted
    by profileSubsessionCounter so the history is in chronological order.
    """
    _key, history = group
    seen = set()
    result = []
    for fragment in history:
        doc_id = fragment["meta/documentId"]  # renamed: `id` shadowed the builtin
        if doc_id in seen:
            continue
        seen.add(doc_id)
        result.append(fragment)
    result.sort(key=itemgetter("payload/info/profileSubsessionCounter"))
    return result

grouped = subset.groupBy(lambda x: x["clientId"]).map(dedupe_and_sort).collect()


# **&lt;Digression&gt;** What's the percentage of clients that have at least one pair of fragments with different documentIds but the same profileSubsessionCounter?
# In[8]: def duplicate_pssc(grouped): dupes = 0 dupe_clients = set() for history in grouped: counts = defaultdict(int) for fragment in history: key = fragment["payload/info/profileSubsessionCounter"] counts[key] += 1 for _, v in counts.iteritems(): if v > 1: dupes += 1 dupe_clients.add(history[0]["clientId"]) break print 100.0*dupes/len(grouped) return dupe_clients dupe_clients = duplicate_pssc(grouped) # **< /Digression\>** Let's remove those clients to be safe. # In[9]: dd_grouped = filter(lambda h: h[0]["clientId"] not in dupe_clients, grouped) # Given the set of chain breaks in consecutive sessions, how many of them are due to missing starting/ending fragments? # In[41]: def missing(grouped, cmp, reason="", debug=False): cmp_missing = 0 other_missing = 0 for history in grouped: last_session_id = history[-1]["payload/info/sessionId"] for i in range(1, len(history)): curr_fragment = history[i] current_pss_counter = curr_fragment["payload/info/profileSubsessionCounter"] prev_fragment = history[i - 1] prev_pss_counter = prev_fragment["payload/info/profileSubsessionCounter"] # Ignore fragments from the last session as it might not have yet completed if curr_fragment["payload/info/sessionId"] == last_session_id: break # Is a fragment missing? 
Here we are considering only chain breaks between two consecutive sessions if prev_pss_counter + 1 != current_pss_counter and \ prev_fragment["payload/info/sessionId"] == curr_fragment["payload/info/previousSessionId"]: # Ignore fake missing fragments if prev_fragment["payload/info/reason"] in ("aborted-session", "shutdown") and \ curr_fragment["payload/info/subsessionCounter"] == 1: continue if cmp(prev_fragment, curr_fragment): cmp_missing += 1 else: other_missing += 1 total_missing = cmp_missing + other_missing frac = 100.0*cmp_missing/total_missing if debug: print "CMP {}, Other {}, Total {}".format(cmp_missing, other_missing, total_missing) print "{:.2f}% of chain breaks are due to {} fragments".format(frac, reason) return frac def ending_cmp(prev, curr): # Are one or more of the ending fragments missing? return prev["payload/info/reason"] not in ("aborted-session", "shutdown") def starting_cmp(prev, curr): # Are one or more starting fragments missing? return curr["payload/info/subsessionCounter"] != 1 ending = missing(dd_grouped, ending_cmp, "ending", debug=True) starting = missing(dd_grouped, starting_cmp, "starting", debug=True) # In[ ]: