#!/usr/bin/env python
# coding: utf-8
"""Look for Telemetry pings that share a subsession ID but differ in content.

Notebook export. It expects to run inside an IPython/Spark session that
provides the ``sc`` SparkContext and ``get_ipython``.
"""

# In[1]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import networkx as nx
import collections

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client

get_ipython().run_line_magic('pylab', 'inline')


# In[3]:

pings = get_pings(sc, app="Firefox", channel="nightly",
                  submission_date=("20150507", "20150514"),
                  fraction=1, schema="v4")


# In[38]:

# Placeholder returned by extract_sub() for pings without a subsession ID.
NO_SUBSESSION_ID = 'NO_SUBSESSION_ID'


def extract_sub(p):
    """Return ``p['payload']['info']['subsessionId']`` or the placeholder.

    Missing levels fall back to ``NO_SUBSESSION_ID``. ``payload`` / ``info``
    entries that are present but empty (e.g. ``None``) are treated as missing
    instead of raising ``AttributeError``.
    """
    payload = p.get('payload') or {}
    info = payload.get('info') or {}
    return info.get('subsessionId', NO_SUBSESSION_ID)


# In[39]:

by_sub_map = pings.map(extract_sub)


# In[41]:

sub_ids = by_sub_map.collect()


# In[46]:

def dupes(l):
    """Count repeated values in the iterable *l*.

    Returns a ``defaultdict(int)`` mapping each value that occurs more than
    once to the number of *extra* occurrences (total occurrences minus one).
    Values that occur exactly once are not present in the result.
    """
    seen = set()
    repeats = collections.defaultdict(int)
    for v in l:
        if v in seen:
            repeats[v] += 1
        else:
            seen.add(v)
    return repeats


# In[49]:

dd = dupes(sub_ids)


# In[50]:

len(dd)


# In[54]:

len(sub_ids)


# In[58]:

# The placeholder is shared by every ping lacking a subsession ID, so it is
# not evidence of a duplicated subsession and is left out of the lookup set.
dk = set(dd.keys()) - {NO_SUBSESSION_ID}


# In[66]:

def save_multi_subsession(p):
    """Return ``(subsession_id, p)`` if the ping's subsession ID is in ``dk``.

    Returns ``None`` for every other ping.

    ``dk`` is built from ``extract_sub`` results, so the same extractor is
    used for the membership test. The previous test used ``p['id']``, a
    different field from the one ``dk`` was built from.
    NOTE(review): this assumes the top-level ``id`` is not the subsession ID;
    confirm against the ping format.
    """
    sub_id = extract_sub(p)
    if sub_id not in dk:
        return None
    return (sub_id, p)


# In[67]:

dupe_map = pings.map(save_multi_subsession)


# In[69]:

def _strip_meta(ping):
    """Return a shallow copy of *ping* without its 'meta' entry."""
    return dict((k, v) for k, v in ping.items() if k != 'meta')


def _as_ping_list(item):
    """Normalise a reducer operand into a list of 'meta'-free pings.

    Accepts a falsy value (-> ``[]``), an already-merged list (returned
    as is), an ``(id, ping)`` tuple, or a bare ping dict.
    """
    if not item:
        return []
    if isinstance(item, list):
        return item
    if isinstance(item, tuple):
        item = item[1]
    return [_strip_meta(item)]


def reduce_multi_subsession(ping1, ping2):
    """Merge two operands into a list of distinct pings, ignoring 'meta'.

    Each operand may be a ping dict, an ``(id, ping)`` tuple, a list
    produced by an earlier call, or a falsy value. The result is always a
    new list and the inputs are never mutated, so the function can be used
    with ``reduce`` / ``reduceByKey``.
    """
    merged = list(_as_ping_list(ping1))
    for ping in _as_ping_list(ping2):
        # Pings are dicts (unhashable), hence the linear membership test.
        if ping not in merged:
            merged.append(ping)
    return merged


# In[70]:

# Group the flagged pings by subsession ID and keep only the IDs whose pings
# are not all identical once 'meta' is ignored. Each element of ``diffs`` is
# ``(subsession_id, [distinct pings])``.
diffs = (dupe_map
         .filter(lambda kv: kv is not None)
         .mapValues(_as_ping_list)
         .reduceByKey(reduce_multi_subsession)
         .filter(lambda kv: len(kv[1]) > 1)
         .collect())


# In[71]:

len(diffs)


# In[ ]: