#!/usr/bin/env python
# coding: utf-8

# In[1]:

from __future__ import division

import binascii
import pandas as pd

from operator import attrgetter, itemgetter
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
from collections import defaultdict

get_ipython().run_line_magic('pylab', 'inline')


# In[2]:

sc.defaultParallelism


# Get all main pings for a set of recent build-ids:

# In[3]:

build_ids = ("20150722000000", "20150729999999")

pings = get_pings(sc, app="Firefox", channel="nightly", build_id=build_ids, doc_type="main", schema="v4")
crashes = get_pings(sc, app="Firefox", channel="nightly", build_id=build_ids, doc_type="crash", schema="v4")


# Take a subset of nightly clients:

# In[4]:

def sample(ping):
    client_id = ping.get("clientId", None)
    return client_id and binascii.crc32(ping["clientId"]) % 100 < 10

sampled_pings = pings.filter(sample)
sampled_crashes = crashes.filter(sample)


# In[5]:

crashes_by_client = sampled_crashes.map(lambda c: (c["clientId"], c["meta"])).groupByKey().collectAsMap()


# Get a subset of fields:

# In[6]:

subset = get_pings_properties(sampled_pings, ["clientId",
                                              "meta/documentId",
                                              "meta/submissionDate",
                                              "meta/creationTimestamp",
                                              "environment/system/os/name",
                                              "payload/info/reason",
                                              "payload/info/sessionId",
                                              "payload/info/subsessionId",
                                              "payload/info/previousSessionId",
                                              "payload/info/previousSubsessionId",
                                              "payload/info/subsessionCounter",
                                              "payload/info/profileSubsessionCounter",
                                              "payload/simpleMeasurements/firstPaint",
                                              "payload/simpleMeasurements/savedPings",
                                              "payload/simpleMeasurements/uptime",
                                              "payload/histograms/STARTUP_CRASH_DETECTED"])


# Group fragments by client and dedupe by documentId:

# In[7]:

def dedupe_and_sort(group):
    key, history = group

    seen = set()
    result = []
    for fragment in history:
        id = fragment["meta/documentId"]
        if id in seen:
            continue
        seen.add(id)
        result.append(fragment)

    result.sort(key=itemgetter("payload/info/profileSubsessionCounter"))
    return result

grouped = subset.groupBy(lambda x: x["clientId"]).map(dedupe_and_sort).collect()


# **<Digression>** What's the percentage of clients that have at least one pair of fragments with different documentIds but the same profileSubsessionCounter?

# In[8]:

def duplicate_pssc(grouped):
    dupes = 0
    dupe_clients = set()

    for history in grouped:
        counts = defaultdict(int)

        for fragment in history:
            key = fragment["payload/info/profileSubsessionCounter"]
            counts[key] += 1

        for _, v in counts.iteritems():
            if v > 1:
                dupes += 1
                dupe_clients.add(history[0]["clientId"])
                break

    print 100.0*dupes/len(grouped)
    return dupe_clients

dupe_clients = duplicate_pssc(grouped)


# **</Digression>** Let's remove those clients to be safe.

# In[9]:

dd_grouped = filter(lambda h: h[0]["clientId"] not in dupe_clients, grouped)
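
# A session chain is broken when two consecutive fragments of a client's deduped, sorted history have a gap in `profileSubsessionCounter`. A minimal sketch of that check on two hypothetical fragments (made-up values, not real telemetry):

# In[ ]:

demo_prev = {"payload/info/profileSubsessionCounter": 7}
demo_curr = {"payload/info/profileSubsessionCounter": 9}

# The counter jumps from 7 to 9, so at least one fragment in between never
# arrived; this is exactly the `prev + 1 != curr` test applied per edge below.
print demo_prev["payload/info/profileSubsessionCounter"] + 1 != \
      demo_curr["payload/info/profileSubsessionCounter"]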

# Given the set of chain breaks, how many of them are due to missing starting/ending fragments?

# In[10]:

class AdjacentBreaks:
    def __init__(self):
        self.missing_total = 0
        self.missing_start = 0
        self.missing_end = 0
        self.missing_both = 0
        self.crashed_prev = 0
        self.reason = defaultdict(int)

    def process(self, prev, curr):
        if prev["payload/info/sessionId"] == curr["payload/info/previousSessionId"]:
            # Ignore fake missing fragments? See IncrementError class
            if prev["payload/info/reason"] in ("aborted-session", "shutdown") and \
               curr["payload/info/subsessionCounter"] == 1:
                return

            self.missing_total += 1
            self.reason["{} -> {}".format(prev["payload/info/reason"], curr["payload/info/reason"])] += 1

            # Are there missing starting fragments?
            missing_start = curr["payload/info/subsessionCounter"] != 1

            # Are there missing ending fragments?
            missing_end = prev["payload/info/reason"] not in ("aborted-session", "shutdown")

            if missing_start and missing_end:
                self.missing_both += 1
            elif missing_start:
                self.missing_start += 1
            elif missing_end:
                self.missing_end += 1

            self.crashed_prev += curr["payload/histograms/STARTUP_CRASH_DETECTED"] or has_crash_ping(prev, curr)

    def stats(self, total):
        print "ADJACENT SESSIONS STATS"
        print "{:5.2f}% of edges have fragments missing".format(100*self.missing_total/total)
        print "{:5.2f}% of edges are missing one or more starting fragments".format(100*self.missing_start/total)
        print "{:5.2f}% of edges are missing one or more ending fragments".format(100*self.missing_end/total)
        print "{:5.2f}% of edges are missing both starting and ending fragments".format(100*self.missing_both/total)
        print "{:5.2f}% of edges have a crash in-between".format(100*self.crashed_prev/self.missing_total)
        print ""
        print "Reason distribution:"
        print dict(self.reason)
        print ""


class WithinBreaks:
    def __init__(self):
        self.missing_total = 0
        self.crashed_prev = 0
        self.reason = defaultdict(int)

    def process(self, prev, curr):
        if prev["payload/info/sessionId"] == curr["payload/info/sessionId"]:
            self.missing_total += 1
            self.reason["{} -> {}".format(prev["payload/info/reason"], curr["payload/info/reason"])] += 1
            self.crashed_prev += curr["payload/histograms/STARTUP_CRASH_DETECTED"] or has_crash_ping(prev, curr)

    def stats(self, total):
        print "WITHIN SESSIONS STATS"
        print "{:5.2f}% of edges have fragments missing".format(100*self.missing_total/total)
        print "{:5.2f}% of edges have a crash in-between".format(100*self.crashed_prev/self.missing_total)
        print ""
        print "Reason distribution:"
        print dict(self.reason)
        print ""


class NonAdjacentBreaks:
    def __init__(self):
        self.missing_total = 0
        self.reason = defaultdict(int)
        self.difference = defaultdict(int)
        self.crashed_prev = 0

    def process(self, prev, curr):
        if prev["payload/info/sessionId"] != curr["payload/info/sessionId"] and \
           prev["payload/info/sessionId"] != curr["payload/info/previousSessionId"]:
            self.missing_total += 1
            self.reason["{} -> {}".format(prev["payload/info/reason"], curr["payload/info/reason"])] += 1
            self.difference[curr["payload/info/profileSubsessionCounter"] - prev["payload/info/profileSubsessionCounter"]] += 1
            self.crashed_prev += curr["payload/histograms/STARTUP_CRASH_DETECTED"] or has_crash_ping(prev, curr)

    def stats(self, total):
        print "NON-ADJACENT SESSIONS STATS"
        print "{:5.2f}% of edges have fragments missing".format(100*self.missing_total/total)
        print "{:5.2f}% of edges have a crash in-between".format(100*self.crashed_prev/self.missing_total)
        print ""
        print "Reason distribution:"
        print dict(self.reason)
        print ""
        print "Difference distribution:"
        # sort_index() returns a new Series, so keep the sorted result
        dist = pd.Series(self.difference).sort_index()
        print dist
        print ""


class IncrementError:
    def __init__(self):
        self.errors_total = 0
        self.reason = defaultdict(int)

    def process(self, prev, curr):
        if prev["payload/info/sessionId"] == curr["payload/info/previousSessionId"] and \
           prev["payload/info/reason"] in ("aborted-session", "shutdown") and \
           curr["payload/info/subsessionCounter"] == 1:
            self.errors_total += 1
            self.reason["{} -> {}".format(prev["payload/info/reason"], curr["payload/info/reason"])] += 1

    def stats(self, total):
        print "PROFILESUBSESSIONCOUNTER INCREMENT ERRORS"
        print "{:5.2f}% of edges have a mismatching profileSubsessionCounter".format(100*self.errors_total/total)
        print ""
        print "Reason distribution:"
        print dict(self.reason)
        print ""
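
# Illustrative sketch (hypothetical fragments, not real telemetry): a broken
# edge is routed to the classifiers above by comparing session ids:
#   AdjacentBreaks    - prev sessionId == curr previousSessionId
#   WithinBreaks      - prev sessionId == curr sessionId
#   NonAdjacentBreaks - neither of the above
example_prev = {"payload/info/sessionId": "s1", "payload/info/previousSessionId": "s0"}
example_curr = {"payload/info/sessionId": "s2", "payload/info/previousSessionId": "s1"}
# This pair would count as an adjacent break: the sessions follow each other,
# but one or more subsession fragments in between are missing.
assert example_prev["payload/info/sessionId"] == example_curr["payload/info/previousSessionId"]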

def has_crash_ping(prev, curr):
    client_id = prev["clientId"]
    client_crashes = crashes_by_client.get(client_id, None)

    if client_crashes:
        for crash in list(client_crashes):
            if crash["creationTimestamp"] >= prev["meta/creationTimestamp"] and \
               crash["creationTimestamp"] <= curr["meta/creationTimestamp"]:
                return True

    return False


def missing(grouped):
    broken_clients = set()
    correct_clients = set()
    num_broken_chains = 0
    num_crashed = 0
    total_edges = 0

    adjacent_breaks = AdjacentBreaks()
    within_breaks = WithinBreaks()
    non_adjacent_breaks = NonAdjacentBreaks()
    increment_errors = IncrementError()

    for history in grouped:
        correct_clients.add(history[0]["clientId"])

        for i in range(1, len(history)):
            prev_fragment = history[i - 1]
            prev_pss_counter = prev_fragment["payload/info/profileSubsessionCounter"]

            curr_fragment = history[i]
            current_pss_counter = curr_fragment["payload/info/profileSubsessionCounter"]

            num_crashed += curr_fragment["payload/histograms/STARTUP_CRASH_DETECTED"] or has_crash_ping(prev_fragment, curr_fragment)
            total_edges += 1

            # Is a fragment missing?
            if prev_pss_counter + 1 != current_pss_counter:
                broken_clients.add(curr_fragment["clientId"])
                num_broken_chains += 1

                adjacent_breaks.process(prev_fragment, curr_fragment)
                within_breaks.process(prev_fragment, curr_fragment)
                non_adjacent_breaks.process(prev_fragment, curr_fragment)
                increment_errors.process(prev_fragment, curr_fragment)

    correct_clients = correct_clients.difference(broken_clients)

    print "GENERAL STATS"
    print "{:5.2f}% clients have a broken session chain".format(100*len(broken_clients)/len(grouped))
    print "{:5.2f}% of clients with a missing fragment experienced at least one crash".format(100*len(broken_clients.intersection(crashes_by_client.keys()))/len(broken_clients))
    print "{:5.2f}% of clients without a missing fragment experienced at least one crash".format(100*len(correct_clients.intersection(crashes_by_client.keys()))/len(correct_clients))
    print "{:5.2f}% of edges have a crash in-between\n".format(100*num_crashed/total_edges)

    increment_errors.stats(num_broken_chains)
    adjacent_breaks.stats(num_broken_chains)
    within_breaks.stats(num_broken_chains)
    non_adjacent_breaks.stats(num_broken_chains)

missing(dd_grouped)


# In[ ]:
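
# Optional follow-up (a sketch, not part of the original analysis; the helper name `show_first_gap` is ours): print the history of the first client with a broken chain to eyeball where the counter gap falls. It only uses `dd_grouped` and fields already extracted above.

# In[ ]:

def show_first_gap(histories):
    for history in histories:
        for i in range(1, len(history)):
            prev, curr = history[i - 1], history[i]
            if prev["payload/info/profileSubsessionCounter"] + 1 != curr["payload/info/profileSubsessionCounter"]:
                print "client:", curr["clientId"]
                for fragment in history:
                    print fragment["payload/info/profileSubsessionCounter"], \
                          fragment["payload/info/sessionId"], \
                          fragment["payload/info/subsessionCounter"], \
                          fragment["payload/info/reason"]
                return

show_first_gap(dd_grouped)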