import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import networkx as nx
import collections
import datetime
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Display Spark's default level of parallelism for this cluster.
# NOTE(review): `sc` is presumably the SparkContext injected by the notebook
# environment -- it is not created anywhere in this file.
sc.defaultParallelism
64
see http://nbviewer.ipython.org/gist/bcolloran/757e35f7990d62a49f83
import boto
conn = boto.connect_s3()
inBucketName = "net-mozaws-prod-us-west-2-pipeline-analysis"
pathToOutput = "bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients"
mozBucket = conn.get_bucket(inBucketName) # Substitute in your bucket name
bl = mozBucket.list(prefix=pathToOutput)
print "data size:", sum(key.size for key in bl)
list(bl)[-5:]
data size: 3320365309
[<Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00195>, <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00196>, <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00197>, <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00198>, <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00199>]
# Read the merged per-client records, JSON-decode each value, and keep only
# the "payload/info" entry of every ping under the record's "v4" key.
# Result: an RDD of (key, [info, info, ...]) pairs.
cleanedClients = (
    sc.sequenceFile("s3n://" + inBucketName + "/" + pathToOutput + "/part-*")
    .mapValues(json.loads)
    .mapValues(lambda record: [p.get("payload/info") for p in record["v4"]])
)
# cc = cleanedClients.first()
# len(cc),cc[0]#,cc[0]["reason"]
Once we've grouped by clientId, we can use each subsession's subsessionId and its previousSubsessionId pointer to construct a directed graph of the client's history (each subsession points at the subsession that preceded it), and then look at the degree of each node to see whether the graph looks the way we expect. Ideally, we should have a graph that has zero nodes with in-degree greater than 1 (no subsession should be pointed at by two or more subsessions) and only one node with out-degree zero (the first subsession in the chain won't point to any prior subsession). Additionally, any given value of 'profileSubsessionCounter' should appear only once.
def sessGraphFromList(sessInfoList,byDate=True):
    """
    Build a directed graph of one client's subsessions.

    Each element of sessInfoList is the "info" section (a dict) of one
    subsession submission. Every record that carries a 'subsessionId' becomes
    a node; records without one are skipped. An edge is added from a
    subsession to the subsession named by its 'previousSubsessionId', but only
    when that id also appears as a 'subsessionId' in sessInfoList.

    Node attributes:
        subSessNum -- 'profileSubsessionCounter' (-1 if absent)
        sessionId  -- 'sessionId' ("NA" if absent)
        x, y       -- plotting position. x is the ordinal of the day in
                      'sessionStartDate' (default "2015-01-01") when byDate is
                      true, otherwise 'profileSubsessionCounter'. y starts at
                      'subsessionCounter' (-1 if absent) and is incremented
                      until the (x, y) slot is unused.

    Returns the networkx DiGraph.
    """
    g = nx.DiGraph()
    # Only collect ids from records that actually have one, so a record
    # lacking 'subsessionId' is skipped by the loop below instead of raising
    # KeyError here. A set keeps the membership test below O(1).
    subSessIds = set(s['subsessionId'] for s in sessInfoList
                     if 'subsessionId' in s)
    usedPositions = set()
    for s in sessInfoList:
        # each s is a separate subsession submission "info" section
        if 'subsessionId' not in s:
            continue
        subsessId = s['subsessionId']
        N = s.get("profileSubsessionCounter",-1)
        subSessCounter = s.get("subsessionCounter",-1)
        sessionId = s.get("sessionId","NA")
        if byDate:
            dateStr = s.get("sessionStartDate","2015-01-01")[0:10]
            x = datetime.datetime.strptime(dateStr,"%Y-%m-%d").toordinal()
        else:
            x = N
        # if a node has already been placed at this (x,y), bump up y
        y = subSessCounter
        while (x, y) in usedPositions:
            y += 1
        usedPositions.add((x, y))
        g.add_node(subsessId,subSessNum=N,x=x, y=y, sessionId=sessionId)
        prevSubSess = s.get('previousSubsessionId',None)
        if prevSubSess in subSessIds:
            # The target may not have been visited yet; it is created bare
            # here and receives its attributes when its own record is
            # processed.
            g.add_node(prevSubSess)
            g.add_edge(subsessId,prevSubSess)
    # add placement details for nodes that ended up without attributes
    # NOTE(review): because edges are only added towards ids present in
    # subSessIds, every node appears to receive attributes in the loop above,
    # so this loop looks like a no-op -- confirm before relying on
    # "inferred" (never-submitted) nodes showing up in the graph.
    # `random` is not imported in this file; it presumably comes from
    # `%pylab inline` (numpy.random).
    for nodeId,nodeData in g.nodes(data=True):
        if nodeData:
            continue
        pred = g.predecessors(nodeId)[0]
        nodeData["x"] = g.node[pred]["x"]-random.random()/3 +.1/.6
        nodeData["y"] = g.node[pred]["y"]-1-random.random()
    return g
# def degreeDistrib(g):
# return collections.Counter(g.in_degree().values())
def subsessSummary(sessInfoList):
    """
    Summarise one client's subsession "info" records.

    Returns a dict with:
        "info"               -- the records sorted by profileSubsessionCounter
        "graph"              -- the graph from sessGraphFromList (by date)
        "inDegDist"          -- Counter: in-degree -> number of nodes
                                (how many nodes point at this one?)
        "outDegDist"         -- Counter: out-degree -> number of nodes
                                (how many does this node point at?)
        "subSessCounterMult" -- Counter: profileSubsessionCounter -> occurrences

    Every record must contain 'profileSubsessionCounter' (KeyError otherwise).
    """
    def pscOf(info):
        return info["profileSubsessionCounter"]

    graph = sessGraphFromList(sessInfoList)
    counterMultiplicity = collections.Counter(pscOf(info) for info in sessInfoList)

    summary = {}
    summary["info"] = sorted(sessInfoList, key=pscOf)
    summary["graph"] = graph
    summary["inDegDist"] = collections.Counter(graph.in_degree().values())
    summary["outDegDist"] = collections.Counter(graph.out_degree().values())
    summary["subSessCounterMult"] = counterMultiplicity
    return summary
# Replace each client's list of info records with its summary dict, keeping
# the pair's key untouched.
subsessSummaries = cleanedClients.mapValues(subsessSummary)
# subsessSummaries.cache()

# Pull a small sample to the driver and inspect the summary's keys.
sss = subsessSummaries.take(10)
sss[0][1].keys()
['info', 'graph', 'outDegDist', 'subSessCounterMult', 'inDegDist']
# badGraphSummaries = subsessSummaries.filter(
# lambda id_summary: any([k>1 for k in id_summary[1]["degDist"].keys()]) \
# or any([v>1 for k,v in id_summary[1]["degDist"].iteritems() if k!=1]) \
# or any([v>1 for v in id_summary[1]["subSessCounterMult"].values()]) )
# badGraphSummaries.cache()
# numBadGraphs = badGraphSummaries.count()
# numBadGraphs
def badGraphCounter(summary):
    """
    Flag the ways one client's subsession graph departs from a single,
    unbranched chain.

    Edges point from a subsession to its previous subsession, so a healthy
    history has exactly ONE node with out-degree 0 (the root node), and no
    node with in-degree greater than 1.

    summary must provide three mappings:
        "inDegDist"          -- in-degree -> number of nodes with that in-degree
        "outDegDist"         -- out-degree -> number of nodes with that out-degree
        "subSessCounterMult" -- profileSubsessionCounter value -> occurrences

    Returns a collections.Counter whose keys are each 0 or 1:
    "nodeWithInDegGtOne":
        There is a subsess that is pointed at by more than one following
        subsess. This indicates a branching history.
    "moreThanOneNodeWithInDegGtOne":
        More than one node has in-degree greater than 1.
        This indicates multiple branching.
    "moreThanOneNodeWithOutDegZero":
        There is more than one node that has out-degree 0, i.e. more than one
        root node, which indicates a broken chain.
    "repeatedProfileSubsessionCounter":
        one or more values of profileSubsessionCounter have been repeated,
        which can indicate a reset or a failure to increment.
    """
    inDegDist = summary["inDegDist"]
    outDegDist = summary["outDegDist"]
    counterMult = summary["subSessCounterMult"]

    # Total number of branching nodes, summed across *all* in-degrees > 1, so
    # that e.g. one node of in-degree 2 plus one of in-degree 3 counts as two.
    numBranchNodes = sum(v for k, v in inDegDist.items() if k > 1)

    outDict = {
        "nodeWithInDegGtOne":
            1 if any(k > 1 for k in inDegDist) else 0,
        "moreThanOneNodeWithInDegGtOne":
            1 if numBranchNodes > 1 else 0,
        "moreThanOneNodeWithOutDegZero":
            1 if any(v > 1 for k, v in outDegDist.items() if k == 0) else 0,
        "repeatedProfileSubsessionCounter":
            1 if any(v > 1 for v in counterMult.values()) else 0,
    }
    return collections.Counter(outDict)
# Size of the sample, plus a per-reason tally of clients whose graph is "bad".
# The per-client Counters are summed with "+", which drops zero entries.
numClients = cleanedClients.count()
badGraphReasons = (
    subsessSummaries
    .values()
    .map(badGraphCounter)
    .reduce(lambda acc, nxt: acc + nxt)
)
badGraphReasons
Counter({'moreThanOneNodeWithOutDegZero': 1881, 'repeatedProfileSubsessionCounter': 710, 'nodeWithInDegGtOne': 35, 'moreThanOneNodeWithInDegGtOne': 8})
print "number of clients in this sample:", numClients, "\n"
for k in badGraphReasons:
print badGraphReasons[k]/float(numClients), " --",k
number of clients in this sample: 8937 0.00391630300996 -- nodeWithInDegGtOne 0.210473313192 -- moreThanOneNodeWithOutDegZero 0.0794450039163 -- repeatedProfileSubsessionCounter 0.000895154973705 -- moreThanOneNodeWithInDegGtOne
reminder, these mean:
nodeWithInDegGtOne -- profile branching
moreThanOneNodeWithOutDegZero -- multiple chains/trees (missing sessions)
repeatedProfileSubsessionCounter
moreThanOneNodeWithInDegGtOne -- multiple profile branching
So a little over 20% of clients appear to be missing sessions. Let's grab a few of these clients and plot them just to see what we're dealing with, then let's try to characterize what the gaps look like.
# Keep only clients whose graph has more than one root (a broken chain), and
# bring ten of them to the driver for plotting.
gapClients = subsessSummaries.filter(
    lambda kv: badGraphCounter(kv[1])["moreThanOneNodeWithOutDegZero"] == 1)
gapClients_10 = gapClients.take(10)
def plotSessGraph(g):
    """
    Draw a subsession graph using each node's stored 'x'/'y' attributes.

    Every node that has attributes is annotated with its 'subSessNum' and,
    rotated 90 degrees, the first four characters of its 'sessionId'
    ("NA" is used when either attribute is missing).
    """
    layout = dict((nodeId, [attrs['x'], attrs["y"]])
                  for nodeId, attrs in g.nodes(data=True))
    fig, ax = plt.subplots(1,figsize=(18,3), dpi=100)
    nx.draw(g,pos=layout,ax=ax,node_size=300, node_color="w")

    for nodeId, attrs in g.nodes(data=True):
        if not attrs:
            # nothing to annotate on attribute-less nodes
            continue
        x = attrs["x"]
        y = attrs["y"]
        plt.text(x, y-.1, s=attrs.get("subSessNum","NA"),
                 horizontalalignment='center')
        sessionPrefix = attrs.get("sessionId","NA")[0:4]
        plt.text(x-.2, y-.3, s=sessionPrefix,
                 rotation=90,
                 horizontalalignment='center')
    plt.show()
def plotSessGraphAndInfo(i_x):
i,x = i_x
printSessGraphAndInfo(i_x)
sessInfoList = x[1]["info"]
g = sessGraphFromList(sessInfoList)
print "subsessions by date"
plotSessGraph(g)
print "subsessions by profileSubsessionCounter (x-axis)"
g = sessGraphFromList(sessInfoList,byDate=False)
plotSessGraph(g)
def printSessGraphAndInfo(i_x):
i,x = i_x
sessInfoList = x[1]["info"]
g = sessGraphFromList(sessInfoList)
print "################################ sample index:", i
profileSubsessionCounterVals = sorted(map(lambda x:x["profileSubsessionCounter"], sessInfoList))
print "profileSubsessionCounter values:"
minPsc = min(profileSubsessionCounterVals)
maxPsc = max(profileSubsessionCounterVals)
print "min:", minPsc , "max:", maxPsc
print "expected number profileSubsessionCounter values:", maxPsc,"-",minPsc," + 1 =",maxPsc-minPsc+1
print "observed number of profileSubsessionCounter values:", len(profileSubsessionCounterVals)
subSessCounterMult = collections.Counter(map(lambda x:x["profileSubsessionCounter"], sessInfoList))
if any([v>1 for v in subSessCounterMult.values()]):
print "repeated profileSubsessionCounter(s): ", {c:m for c,m in subSessCounterMult.iteritems() if m>1}
else:
print "NUMBER OF SESSION MISSING:", maxPsc-minPsc+1 - len(profileSubsessionCounterVals)
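In what follows, two plots are drawn to depict the subsession history of each client in a sample. For each client, we draw (1) the subsessions laid out by session start date, and (2) the subsessions laid out with profileSubsessionCounter on the x-axis.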
# Print the counter summary and draw both graph layouts for each of the ten
# sampled clients.
for i,clientData in enumerate(gapClients_10):
    # NOTE(review): clientDates is only consumed by the commented-out filter
    # below; with that filter disabled it is computed but never read.
    clientDates = [ss.get('sessionStartDate',0) for ss in clientData[1]["info"]]
    # if any([ (ss.get('profileSubsessionCounter',0)==1
    #     and any(ss.get('sessionStartDate',None)>d for d in clientDates))
    #     for ss in clientData[1]["info"]]):
    plotSessGraphAndInfo((i,clientData))
################################ sample index: 0 profileSubsessionCounter values: min: 51 max: 173 expected number profileSubsessionCounter values: 173 - 51 + 1 = 123 observed number of profileSubsessionCounter values: 114 NUMBER OF SESSION MISSING: 9 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 1 profileSubsessionCounter values: min: 153 max: 397 expected number profileSubsessionCounter values: 397 - 153 + 1 = 245 observed number of profileSubsessionCounter values: 226 NUMBER OF SESSION MISSING: 19 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 2 profileSubsessionCounter values: min: 4 max: 23 expected number profileSubsessionCounter values: 23 - 4 + 1 = 20 observed number of profileSubsessionCounter values: 20 repeated profileSubsessionCounter(s): {14: 2} subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 3 profileSubsessionCounter values: min: 30 max: 40 expected number profileSubsessionCounter values: 40 - 30 + 1 = 11 observed number of profileSubsessionCounter values: 4 NUMBER OF SESSION MISSING: 7 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 4 profileSubsessionCounter values: min: 28 max: 136 expected number profileSubsessionCounter values: 136 - 28 + 1 = 109 observed number of profileSubsessionCounter values: 108 NUMBER OF SESSION MISSING: 1 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 5 profileSubsessionCounter values: min: 120 max: 416 expected number profileSubsessionCounter values: 416 - 120 + 1 = 297 observed number of profileSubsessionCounter values: 278 NUMBER OF SESSION MISSING: 19 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 6 profileSubsessionCounter values: min: 1 max: 25 expected number profileSubsessionCounter values: 25 - 1 + 1 = 25 observed number of profileSubsessionCounter values: 24 NUMBER OF SESSION MISSING: 1 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 7 profileSubsessionCounter values: min: 1 max: 9 expected number profileSubsessionCounter values: 9 - 1 + 1 = 9 observed number of profileSubsessionCounter values: 15 repeated profileSubsessionCounter(s): {1: 4, 2: 2, 3: 2, 4: 2} subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 8 profileSubsessionCounter values: min: 27 max: 69 expected number profileSubsessionCounter values: 69 - 27 + 1 = 43 observed number of profileSubsessionCounter values: 42 NUMBER OF SESSION MISSING: 1 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
################################ sample index: 9 profileSubsessionCounter values: min: 1 max: 8 expected number profileSubsessionCounter values: 8 - 1 + 1 = 8 observed number of profileSubsessionCounter values: 7 NUMBER OF SESSION MISSING: 1 subsessions by date
subsessions by profileSubsessionCounter (x-axis)
For clients that have a few nice flat subsession chains (no branching or anything weird), we can figure out how many sessions are missing with a pretty simple technique:
The number of subsessions we expect to see is given by:
max(profileSubsessionCounterVals) - min(profileSubsessionCounterVals) + 1
The number we actually do see is by definition just the number of subsession records. We can use e.g.:
len(profileSubsessionCounterVals)
(For clients that do have either repeated values of profileSubsessionCounter or branching histories, figuring out how many sessions are missing is not so straightforward.)
'''
nodeWithInDegGtOne -- profile branching
moreThanOneNodeWithOutDegZero -- multiple chains/trees (missing sessions)
repeatedProfileSubsessionCounter
moreThanOneNodeWithInDegGtOne -- multiple profile branching
'''
def getNumMissing(clientData):
    """
    Return how many subsessions are missing for one client summary.

    The count is (max - min + 1) of the profileSubsessionCounter values minus
    the number of records in clientData["info"]. For clients whose graph
    branches or whose profileSubsessionCounter repeats, the count cannot be
    determined this way and NaN is returned.
    """
    flags = badGraphCounter(clientData)
    branching = flags["nodeWithInDegGtOne"]==1
    repeatedCounter = flags["repeatedProfileSubsessionCounter"]==1
    if branching or repeatedCounter:
        return np.nan

    counters = [info["profileSubsessionCounter"] for info in clientData["info"]]
    lowest = min(counters)
    highest = max(counters)
    return highest-lowest+1 - len(counters)
numMissingList = subsessSummaries \
.map(lambda id_data: getNumMissing(id_data[1])).collect()
plt.hist([x for x in numMissingList if not isnan(x)], bins=100, alpha=0.5)
plt.title('distributions of subsession missingness')
plt.xlabel('subsessions missing')
plt.ylabel('count')
print "number of clients", numClients
v1 = sum([1 for x in numMissingList if x==0])
print "clients with no missingness and OK session chains:"
print "%d clients, %f pct" % (v1, 100.0*v1/numClients)
v2 = sum([1 for x in numMissingList if isnan(x)])
print "clients for which missingness cannot be determined:"
print "%d clients, %f pct" % (v2, 100.0*v2/numClients)
v3 = sum([1 for x in numMissingList if (not isnan(x) and x!=0)])
print "clients with missing sessions:"
print "%d clients, %f pct" % (v3, 100.0*v3/numClients)
print "\nextrapolated % of clients missing subsessions:", 100.0*v3/numClients + (1.0*v3/(v1+v3))*100.0*v2/numClients
number of clients 8937 clients with no missingness and OK session chains: 6937 clients, 77.621126 pct clients for which missingness cannot be determined: 710 clients, 7.944500 pct clients with missing sessions: 1290 clients, 14.434374 pct extrapolated % of clients missing subsessions: 15.6800777926
From the above, we see that about 14.4% of clients can be known to be missing subsessions, and another ~8% have profile branching or resets, and so we cannot rule out the possibility that they are missing some subsessions (I think it is likely that they are missing sessions at an above-average rate).
If we assume that the 8% of clients about which we cannot state anything conclusive have missingness at the same rate as the rest of the population, we can extrapolate that about 15.7% of the population has some missingness.
# Same histogram, restricted to clients known to be missing >= 1 subsession.
plt.hist([m for m in numMissingList if not (isnan(m) or m==0)],
         bins=100, alpha=0.5)
plt.title('distributions of subsession missingness\n(among clients missing at least one subsession)')
plt.xlabel('subsessions missing')
plt.ylabel('count')
<matplotlib.text.Text at 0x7fc706e62150>
For clients that don't have repeated profileSubsessionCounter values or branching, we can figure out how many sessions are missing. (For clients that do have either of those anomalies, figuring out how many sessions are missing is not so straightforward.)
def getPctMissing(clientData):
    """
    Return the percentage of expected subsessions missing for one client.

    Expected is (max - min + 1) of the profileSubsessionCounter values in
    clientData["info"]; missing is expected minus the number of records.
    NaN is returned for clients whose graph branches or whose
    profileSubsessionCounter repeats.
    """
    flags = badGraphCounter(clientData)
    branching = flags["nodeWithInDegGtOne"]==1
    repeatedCounter = flags["repeatedProfileSubsessionCounter"]==1
    if branching or repeatedCounter:
        return np.nan

    counters = [info["profileSubsessionCounter"] for info in clientData["info"]]
    lowest = min(counters)
    highest = max(counters)
    expected = highest-lowest+1
    return 100.0*(expected - len(counters))/expected
# Per-client percentage of expected subsessions that are missing (NaN where
# it cannot be determined), and its distribution.
pctMissingList = subsessSummaries \
    .map(lambda id_data: getPctMissing(id_data[1])).collect()
plt.hist([x for x in pctMissingList if not isnan(x)], bins=100, alpha=0.5)
plt.title('distributions of subsession missingness')
# The plotted values are percentages, not counts, so label the axis as such.
plt.xlabel('pct of subsessions missing')
plt.ylabel('count in bin')
<matplotlib.text.Text at 0x7fc706181910>
# Percentage-missing histogram restricted to clients known to be missing at
# least one subsession.
plt.hist([x for x in pctMissingList if (not isnan(x) and x!=0)], bins=100, alpha=0.5)
plt.title('percentage of subsessions missing\n(among clients missing at least one subsession)')
plt.xlabel('pct of subsessions missing')
plt.ylabel('count in bin')
<matplotlib.text.Text at 0x7fc705f57310>
def getNumMissingAndExpected(clientData):
bgc = badGraphCounter(clientData)
if (bgc["nodeWithInDegGtOne"]==1 or #branching
bgc["repeatedProfileSubsessionCounter"]==1): #repeated profileSubsessionCounter
numMissing = np.nan
numExpected = np.nan
else:
profileSubsessionCounterVals = sorted(map(lambda x:x["profileSubsessionCounter"], clientData["info"]))
minPsc = min(profileSubsessionCounterVals)
maxPsc = max(profileSubsessionCounterVals)
numExpected = (maxPsc-minPsc+1)
numMissing = 100.0*(maxPsc-minPsc+1 - len(profileSubsessionCounterVals))/numExpected
return (numMissing,numExpected)
numMissingAndExpected = subsessSummaries \
.map(lambda id_data: getNumMissingAndExpected(id_data[1])).collect()
totalMissing = sum([x[0] for x in numMissingAndExpected if not isnan(x[0])])
totalExpected = sum([x[1] for x in numMissingAndExpected if not isnan(x[1])])
print "total subsessions expected in this sample:", totalExpected
print "total subsessions missing:", totalMissing
print "pct missing:", 1.0*totalMissing/totalExpected
total subsessions expected in this sample: 326318 total subsessions missing: 18508.5003925 pct missing: 0.0567192137501
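So it appears that overall we're missing at least 5.7% of the subsessions we should be expecting to see -- and this number is quite possibly a low estimate, because we cannot estimate for clients with branching histories or profileSubsessionCounter resets. (NOTE(review): the "total subsessions missing" value printed above is not a whole number (18508.5...), which suggests that per-client percentages rather than per-client counts were summed; the 5.7% figure should be re-verified.)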
As is typically the case in this kind of data, this appears to be a long-tail phenomenon: most clients are not missing any subsessions; of those clients that are missing some subsessions, most are missing only a few; but there are enough clients missing a few subsessions and enough clients missing a lot of subsessions that all together we're missing nearly 6% of the subsessions we'd expect to see.