import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import networkx as nx
import collections
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
%pylab inline
# Output: Populating the interactive namespace from numpy and matplotlib
# Load the pings to analyse: app "Firefox", channel "nightly", submission_date
# range ("20150507", "20150514"), schema "v4".
# NOTE(review): `sc` is not defined in this file -- presumably the SparkContext
# provided by the notebook environment; confirm.
# NOTE(review): fraction=1 presumably requests the full, unsampled data set;
# confirm against the moztelemetry documentation.
pings = get_pings(sc, app="Firefox",
channel="nightly",
submission_date=("20150507","20150514"),
fraction=1,
schema="v4")
def extract_sub(p):
    """Return the subsession id stored in ping *p*.

    The id is read from ``p['payload']['info']['subsessionId']``. A missing
    'payload' or 'info' level is treated as an empty mapping, and a missing
    'subsessionId' yields the placeholder string 'NO_SUBSESSION_ID'.
    """
    payload = p.get('payload', {})
    info = payload.get('info', {})
    return info.get('subsessionId', 'NO_SUBSESSION_ID')
# Map every ping to its subsession id (or the 'NO_SUBSESSION_ID' placeholder
# that extract_sub returns when the field is missing).
by_sub_map = pings.map(extract_sub)
# Materialise all ids as one local sequence so dupes() can scan them.
# NOTE(review): presumably `pings` is a Spark RDD and collect() pulls every id
# to the driver; confirm this fits in driver memory for larger date ranges.
sub_ids = by_sub_map.collect()
def dupes(l):
    """Count how many times each value in *l* re-appears after its first sighting.

    Returns a ``defaultdict(int)`` that maps every value occurring more than
    once to its number of *extra* occurrences (total occurrences minus one).
    Values that occur exactly once are not present as keys.
    """
    from collections import defaultdict

    first_sightings = set()
    repeat_counts = defaultdict(int)
    for item in l:
        if item not in first_sightings:
            # First time we meet this value: remember it, nothing to count yet.
            first_sightings.add(item)
            continue
        repeat_counts[item] += 1
    return repeat_counts
# Count repeated subsession ids among the ids collected above.
dd = dupes(sub_ids)
# Number of distinct ids that occur more than once.
len(dd)
17246  # presumably the recorded notebook output of len(dd)
# Total number of ids collected (pings.map yields one id per ping).
len(sub_ids)
1594174  # presumably the recorded notebook output of len(sub_ids)
# Set of the duplicated ids; save_multi_subsession reads this global for its
# membership test.
dk = set(dd.keys())
def save_multi_subsession(p):
    """Key ping *p* by its subsession id if that id was seen more than once.

    Returns ``(subsession_id, p)`` when the ping's subsession id is in the
    module-level set ``dk``, and ``None`` otherwise.

    ``dk`` is built from the values produced by ``extract_sub``, so the same
    extractor is used here. The previous version tested ``p['id']`` -- a
    different field from the one ``dk`` was built from -- so the filter and
    the duplicate set did not refer to the same identifier, and a ping
    without an 'id' key raised KeyError.
    """
    sub_id = extract_sub(p)
    if sub_id not in dk:
        return None
    return (sub_id, p)
# Tag every ping: a (key, ping) tuple for pings selected by
# save_multi_subsession, and None for all the others.
dupe_map = pings.map(save_multi_subsession)
def _without_meta(ping):
    """Return *ping* without its 'meta' entry, leaving the original untouched."""
    if 'meta' not in ping:
        return ping
    return {key: value for key, value in ping.items() if key != 'meta'}


def reduce_multi_subsession(ping1, ping2):
    """Return the two pings as a list if they differ once 'meta' is ignored.

    Parameters:
        ping1, ping2: the pings to compare. If either one is falsy (None,
            empty) there is nothing to compare and ``[]`` is returned.

    Returns:
        ``[ping1, ping2]`` (each without its 'meta' entry) when the pings
        differ, otherwise ``[]``. The result is always a list, so callers can
        safely take ``len()`` of it or extend another list with it.

    The inputs are not modified: the comparison runs on copies with 'meta'
    removed instead of deleting the key from the caller's dicts.
    """
    if not (ping1 and ping2):
        return []
    stripped1 = _without_meta(ping1)
    stripped2 = _without_meta(ping2)
    if stripped1 != stripped2:
        return [stripped1, stripped2]
    # Identical pings: return an empty list rather than falling through to None.
    return []
def _diff_group(group):
    """Compare every ping in *group* with the first one; return differing pairs."""
    members = list(group)
    baseline = members[0]
    found = []
    for other in members[1:]:
        # `or []` tolerates a comparator that returns None for identical pings.
        found.extend(reduce_multi_subsession(baseline, other) or [])
    return found


# RDD.reduce folds *all* elements pairwise regardless of key, and dupe_map
# contains None placeholders for which reduce_multi_subsession returns [], so a
# plain reduce collapses to [] without ever comparing two pings that share a
# key. Instead: drop the placeholders, bring pings with the same key together,
# and diff within each group.
diffs = (
    dupe_map
    .filter(lambda keyed: keyed is not None)
    .groupByKey()
    .flatMap(lambda keyed: _diff_group(keyed[1]))
    .collect()
)
len(diffs)
# Output recorded from the original reduce-based run: 0