In [1]:

import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
import networkx as nx
import collections
import datetime

from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client

%pylab inline

Populating the interactive namespace from numpy and matplotlib

In [3]:

sc.defaultParallelism

Out[3]:

Load the cleaned per-client info sections¶

see http://nbviewer.ipython.org/gist/bcolloran/757e35f7990d62a49f83

In [2]:

import boto
conn = boto.connect_s3()

In [4]:

inBucketName = "net-mozaws-prod-us-west-2-pipeline-analysis"
pathToOutput = "bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients"

In [5]:

mozBucket = conn.get_bucket(inBucketName) # Substitute in your bucket name
bl = mozBucket.list(prefix=pathToOutput)
print "data size:", sum(key.size for key in bl)
list(bl)[-5:]

data size: 3320365309

Out[5]:

[<Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00195>,
 <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00196>,
 <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00197>,
 <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00198>,
 <Key: net-mozaws-prod-us-west-2-pipeline-analysis,bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/part-00199>]

In [6]:

#pull just the info sections for each v4 ping

cleanedClients = sc.sequenceFile( "s3n://"+inBucketName+"/"+pathToOutput +"/part-*" ) \
    .mapValues(json.loads) \
    .mapValues(lambda data: [ping.get("payload/info") for ping in data["v4"]] )

In [7]:

# cc = cleanedClients.first()

In [8]:

# len(cc),cc[0]#,cc[0]["reason"]

Construct session graphs and compute degree distributions over graphs¶

Once we've grouped by clientId, we can use the subSessionIds and previousSubsessionId pointer to construct a graph of the session, and then look at the degree of each node in the graph to see whether the graph looks the way we expect. Ideally, we should have a graph that has zero nodes with degree greater than 1 (no subsession should be pointed at by two or more sessions) and only one node with degree zero (the first subsession in the chain won't point to any prior subsession). Additionally, any given value of 'profileSubsessionCounter' should appear only once.

In [9]:

def sessGraphFromList(sessInfoList,byDate=True):
    g = nx.DiGraph()
    subSessIds = [s['subsessionId'] for s in sessInfoList]
    nodePositions = []
    for s in sessInfoList:
        # each s is a separate subsession submission "info" section
        
        # have to clear these vars to prevent spillover from previous iter
        subsessId = None 
        prevSubSess = None
        
        try:
            subsessId = s['subsessionId']
        except KeyError:
            continue

        N = s.get("profileSubsessionCounter",-1)
        subSessCounter = s.get("subsessionCounter",-1)
        sessionId = s.get("sessionId","NA")

        #if a node has already been added with this (x,y) position, bump up the y position
        if byDate:
            dateStr = s.get("sessionStartDate","2015-01-01")[0:10]
            dateNum = datetime.datetime.strptime(dateStr,"%Y-%m-%d").toordinal()
            thisPosition = [dateNum,subSessCounter]
        else:
            thisPosition = [N,subSessCounter]
        while thisPosition in nodePositions:
            thisPosition[1]+=1
            
        g.add_node(subsessId,subSessNum=N,x=thisPosition[0], y=thisPosition[1], sessionId=sessionId)
        nodePositions.append(thisPosition)
        
        prevSubSess = s.get('previousSubsessionId',None)
        if prevSubSess in subSessIds:
            #NB: adding these nodes allows us to infer the existence
            #of some sessions that have not actually been submitted.
            # Nodes added this way will not have any data attached to them.
            g.add_node(prevSubSess)
            g.add_edge(subsessId,prevSubSess)   
    #add placement details for inferred nodes
    for nodeId,nodeData in (tup for tup in g.nodes(data=True) if not tup[1]):
        pred = g.predecessors(nodeId)[0]
        nodeData["x"] = g.node[pred]["x"]-random.random()/3 +.1/.6
        nodeData["y"] = g.node[pred]["y"]-1-random.random()
          
    return g


# def degreeDistrib(g):
#     return collections.Counter(g.in_degree().values())

def subsessSummary(sessInfoList):
    g = sessGraphFromList(sessInfoList)
#     d = degreeDistrib(g)
    subSessCounterMult = collections.Counter(map(lambda x:x["profileSubsessionCounter"], sessInfoList))
    return {"info":sorted(sessInfoList, key=lambda x:x["profileSubsessionCounter"]),
            "graph":g,
            "inDegDist": collections.Counter(g.in_degree().values()), #how many nodes point at this one?
            "outDegDist": collections.Counter(g.out_degree().values()), #how many does this node point at?
            "subSessCounterMult":subSessCounterMult}

In [10]:

subsessSummaries = cleanedClients.map(lambda c: (c[0],subsessSummary(c[1])))
# subsessSummaries.cache()

In [11]:

sss = subsessSummaries.take(10)

In [65]:

sss[0][1].keys()

Out[65]:

['info', 'graph', 'outDegDist', 'subSessCounterMult', 'inDegDist']

In [12]:

# badGraphSummaries = subsessSummaries.filter(
#     lambda id_summary: any([k>1 for k in id_summary[1]["degDist"].keys()]) \
#             or any([v>1 for k,v in id_summary[1]["degDist"].iteritems() if k!=1]) \
#             or any([v>1 for v in id_summary[1]["subSessCounterMult"].values()]) )
# badGraphSummaries.cache()
# numBadGraphs = badGraphSummaries.count()
# numBadGraphs

In [13]:

def badGraphCounter(summary):
    """
    explanations of the following bad graph reasons:
    every graph should have exactly ONE node with degree 0 (the root node),
    and all the rest should have degree 1
        
    "nodeWithInDegGtOne":
        There is a subsess that is pointed at by more than one following subsess.
        This indicates a branching history.
    "moreThanOneNodeWithDegGtOne":
        This indicates multiple branching
    "moreThanOneNodeWithOutDegZero":
        There is more than one node that has out-degree 0, i.e. more than one root node,
        which indicates a broken chain.
    "repeatedProfileSubsessionCounter":
        one or more values of profileSubsessionCounter have been repeated,
        which can indicate a reset or a failure to increment.
    """
    outDict = {"nodeWithInDegGtOne":
                   1 if any([k>1 for k in summary["inDegDist"].keys()]) else 0,
               "moreThanOneNodeWithInDegGtOne":
                   1 if any([v>1 for k,v in summary["inDegDist"].iteritems() if k>1]) else 0,
               "moreThanOneNodeWithOutDegZero":
                   1 if any([v>1 for k,v in summary["outDegDist"].iteritems() if k==0]) else 0,
               "repeatedProfileSubsessionCounter":
                   1 if any([v>1 for v in summary["subSessCounterMult"].values()]) else 0}
    return collections.Counter(outDict)

Percentage of clients with bad session graphs, and kinds of bad graphs¶

In [16]:

numClients = cleanedClients.count()

In [14]:

badGraphReasons = subsessSummaries \
    .map(lambda id_data: badGraphCounter(id_data[1])) \
    .reduce(lambda l1,l2: l1+l2)

In [15]:

badGraphReasons

Out[15]:

Counter({'moreThanOneNodeWithOutDegZero': 1881, 'repeatedProfileSubsessionCounter': 710, 'nodeWithInDegGtOne': 35, 'moreThanOneNodeWithInDegGtOne': 8})

In [19]:

print "number of clients in this sample:", numClients, "\n"
for k in badGraphReasons:
    print badGraphReasons[k]/float(numClients), " --",k

number of clients in this sample: 8937 

0.00391630300996  -- nodeWithInDegGtOne
0.210473313192  -- moreThanOneNodeWithOutDegZero
0.0794450039163  -- repeatedProfileSubsessionCounter
0.000895154973705  -- moreThanOneNodeWithInDegGtOne

reminder, these mean:

nodeWithInDegGtOne               -- profile branching
moreThanOneNodeWithOutDegZero    -- multiple chains/trees (missing sessions)
repeatedProfileSubsessionCounter
moreThanOneNodeWithInDegGtOne    -- multiple profile branching

Clients with multiple session chains (i.e., clients missing sessions)¶

So a little over 20% of clients appear to be missing sessions. Let's grab a few of these clients and plot them just to see what we're dealing with, then let's try to characterize what the gaps looks like.

In [20]:

gapClients = subsessSummaries \
    .filter(lambda id_data: badGraphCounter(id_data[1])["moreThanOneNodeWithOutDegZero"] )

In [21]:

gapClients_10 = gapClients.take(10)

In [25]:

def plotSessGraph(g):
    pos = {n:[d['x'],d["y"]] for n,d in g.nodes(data=True)}
    fig, ax = plt.subplots(1,figsize=(18,3), dpi=100)
    G = nx.draw(g,pos=pos,ax=ax,node_size=300, node_color="w")
    for sessId,nodeData in [(n,d) for n,d in g.nodes(data=True) if d]:
        x,y = nodeData["x"],nodeData["y"]
        subSessNum = nodeData.get("subSessNum","NA")
        plt.text(x,y-.1,s=subSessNum
                 ,horizontalalignment='center')
        
        plt.text(x-.2,y-.3,s=nodeData.get("sessionId","NA")[0:4]
                 ,rotation=90
                 ,horizontalalignment='center')
    plt.show()

In [59]:

def plotSessGraphAndInfo(i_x):
    i,x = i_x
    printSessGraphAndInfo(i_x)
    sessInfoList = x[1]["info"]
    g = sessGraphFromList(sessInfoList)
    
    print "subsessions by date"
    plotSessGraph(g)
    print "subsessions by profileSubsessionCounter (x-axis)"
    g = sessGraphFromList(sessInfoList,byDate=False)
    plotSessGraph(g)

In [61]:

def printSessGraphAndInfo(i_x):
    i,x = i_x
    sessInfoList = x[1]["info"]
    g = sessGraphFromList(sessInfoList)
    print "################################ sample index:", i
    profileSubsessionCounterVals = sorted(map(lambda x:x["profileSubsessionCounter"], sessInfoList))
    print "profileSubsessionCounter values:"
    minPsc = min(profileSubsessionCounterVals)
    maxPsc = max(profileSubsessionCounterVals)
    print "min:", minPsc , "max:", maxPsc
    print "expected number profileSubsessionCounter values:", maxPsc,"-",minPsc," + 1 =",maxPsc-minPsc+1
    print "observed number of profileSubsessionCounter values:", len(profileSubsessionCounterVals)    

    subSessCounterMult = collections.Counter(map(lambda x:x["profileSubsessionCounter"], sessInfoList))
    if any([v>1 for v in subSessCounterMult.values()]):
        print "repeated profileSubsessionCounter(s): ", {c:m for c,m in subSessCounterMult.iteritems() if m>1}
    else:
        print "NUMBER OF SESSION MISSING:", maxPsc-minPsc+1 - len(profileSubsessionCounterVals) 
    

Explanation of the session graph plots in this notebook.¶

In what follows, two plots are drawn for depicting the subsession histories of a sample of clients. For each client, we draw:

The subsession graph organized by date the x-axis. In these plots, all the subsession that occured on one day are stacked in vertical column coresponding to the integer day number. These plots help to show cases in which "profileSubsessionCounter" is reset to 1.
The subsession graph organized by subsessionId. In these plots, the x-axis position of each subsession is given by its subsession number. These plots help to show cases in which there are big gaps between recorded subsession.
The "profileSubsessionCounter" is drawn on the nodes, and the first 4 chars of the sessionId is drawn under the node. (in the case of graphs where all the node have the same y values the labels are not positioned correctly. Apologies for the weird node labeling. It is a deficiency in the networkx drawing functions that I don't want to mess with.)

In [62]:

for i,clientData in enumerate(gapClients_10):
    clientDates = [ss.get('sessionStartDate',0) for ss in clientData[1]["info"]]
#     if any([ (ss.get('profileSubsessionCounter',0)==1
#              and any(ss.get('sessionStartDate',None)>d for d in clientDates))
#             for ss in clientData[1]["info"]]):
    plotSessGraphAndInfo((i,clientData))

################################ sample index: 0
profileSubsessionCounter values:
min: 51 max: 173
expected number profileSubsessionCounter values: 173 - 51  + 1 = 123
observed number of profileSubsessionCounter values: 114
NUMBER OF SESSION MISSING: 9
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 1
profileSubsessionCounter values:
min: 153 max: 397
expected number profileSubsessionCounter values: 397 - 153  + 1 = 245
observed number of profileSubsessionCounter values: 226
NUMBER OF SESSION MISSING: 19
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 2
profileSubsessionCounter values:
min: 4 max: 23
expected number profileSubsessionCounter values: 23 - 4  + 1 = 20
observed number of profileSubsessionCounter values: 20
repeated profileSubsessionCounter(s):  {14: 2}
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 3
profileSubsessionCounter values:
min: 30 max: 40
expected number profileSubsessionCounter values: 40 - 30  + 1 = 11
observed number of profileSubsessionCounter values: 4
NUMBER OF SESSION MISSING: 7
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 4
profileSubsessionCounter values:
min: 28 max: 136
expected number profileSubsessionCounter values: 136 - 28  + 1 = 109
observed number of profileSubsessionCounter values: 108
NUMBER OF SESSION MISSING: 1
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 5
profileSubsessionCounter values:
min: 120 max: 416
expected number profileSubsessionCounter values: 416 - 120  + 1 = 297
observed number of profileSubsessionCounter values: 278
NUMBER OF SESSION MISSING: 19
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 6
profileSubsessionCounter values:
min: 1 max: 25
expected number profileSubsessionCounter values: 25 - 1  + 1 = 25
observed number of profileSubsessionCounter values: 24
NUMBER OF SESSION MISSING: 1
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 7
profileSubsessionCounter values:
min: 1 max: 9
expected number profileSubsessionCounter values: 9 - 1  + 1 = 9
observed number of profileSubsessionCounter values: 15
repeated profileSubsessionCounter(s):  {1: 4, 2: 2, 3: 2, 4: 2}
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 8
profileSubsessionCounter values:
min: 27 max: 69
expected number profileSubsessionCounter values: 69 - 27  + 1 = 43
observed number of profileSubsessionCounter values: 42
NUMBER OF SESSION MISSING: 1
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

################################ sample index: 9
profileSubsessionCounter values:
min: 1 max: 8
expected number profileSubsessionCounter values: 8 - 1  + 1 = 8
observed number of profileSubsessionCounter values: 7
NUMBER OF SESSION MISSING: 1
subsessions by date

subsessions by profileSubsessionCounter (x-axis)

Check the distribution of session missingness (when available)¶

For clients that have a few nice flat subsession chains (no branching or anything weird), we can get figure out how many sessions are missing with a pretty simple technique:

The number of subsessions we expect to see is given by:

max(profileSubsessionCounterVals) - min(profileSubsessionCounterVals) + 1
The number we actually do see is bydefinition just the number of subsessions records. We can use e.g.:

len(profileSubsessionCounterVals)

(For clients that co have either repeated values of profileSubsessionCounter or that have branching histories, figuring out how many sessions are missing not so straighforward.

In [145]:

'''
nodeWithInDegGtOne               -- profile branching
moreThanOneNodeWithOutDegZero    -- multiple chains/trees (missing sessions)
repeatedProfileSubsessionCounter
moreThanOneNodeWithInDegGtOne    -- multiple profile branching
'''

def getNumMissing(clientData):
    bgc = badGraphCounter(clientData)
    if (bgc["nodeWithInDegGtOne"]==1 or #branching
            bgc["repeatedProfileSubsessionCounter"]==1):  #repeated profileSubsessionCounter
        numMissing = np.nan
    else:
        profileSubsessionCounterVals = sorted(map(lambda x:x["profileSubsessionCounter"], clientData["info"]))
        minPsc = min(profileSubsessionCounterVals)
        maxPsc = max(profileSubsessionCounterVals)
        numMissing = maxPsc-minPsc+1 - len(profileSubsessionCounterVals)
#     elif all([v==0 for v in bgc.values()]):
#         numMissing = 0
#     else:
        
    return numMissing

In [90]:

numMissingList = subsessSummaries \
    .map(lambda id_data: getNumMissing(id_data[1])).collect()

In [133]:

plt.hist([x for x in numMissingList if not isnan(x)], bins=100, alpha=0.5)
plt.title('distributions of subsession missingness')
plt.xlabel('subsessions missing')
plt.ylabel('count')

print "number of clients", numClients

v1 = sum([1 for x in numMissingList if x==0])
print "clients with no missingness and OK session chains:"
print "%d clients, %f pct" % (v1, 100.0*v1/numClients)

v2 = sum([1 for x in numMissingList if isnan(x)])
print "clients for which missingness cannot be determined:"
print "%d clients, %f pct" % (v2, 100.0*v2/numClients)

v3 = sum([1 for x in numMissingList if (not isnan(x) and x!=0)])
print "clients with missing sessions:"
print "%d clients, %f pct" % (v3, 100.0*v3/numClients)


print "\nextrapolated % of clients missing subsessions:", 100.0*v3/numClients + (1.0*v3/(v1+v3))*100.0*v2/numClients

number of clients 8937
clients with no missingness and OK session chains:
6937 clients, 77.621126 pct
clients for which missingness cannot be determined:
710 clients, 7.944500 pct
clients with missing sessions:
1290 clients, 14.434374 pct

extrapolated % of clients missing subsessions: 15.6800777926

From the above, we see that about 14.5% of clients can be known to be missing subsessions, and another ~8% have profile branching or resets, and so cannot rule out the possiblity that they are missing some subsessions (I think it is likely that they are missing sessions at an above-average rate).

If we assume that the 8% of client about which we cannot state anything conlcusive does have missingness at the same rate as the rest of the population, we can extrapolate that about 15.7% of the population has some missingness.

In [113]:

plt.hist([x for x in numMissingList if (not isnan(x) and x!=0)], bins=100, alpha=0.5)
plt.title('distributions of subsession missingness\n(among clients missing at least one subsession)')
plt.xlabel('subsessions missing')
plt.ylabel('count')

Out[113]:

<matplotlib.text.Text at 0x7fc706e62150>

Check the distribution of session missingness by percentage missing (when available)¶

For clients that don't have repeated profileSubsessionCounter values or branching, we can get figure out how many sessions are missing. (For clients that co have either of those anomlies, figuring out how many sessions are missing is not so straighforward).

In [122]:

def getPctMissing(clientData):
    bgc = badGraphCounter(clientData)
    if (bgc["nodeWithInDegGtOne"]==1 or #branching
            bgc["repeatedProfileSubsessionCounter"]==1):  #repeated profileSubsessionCounter
        numMissing = np.nan
    else:
        profileSubsessionCounterVals = sorted(map(lambda x:x["profileSubsessionCounter"], clientData["info"]))
        minPsc = min(profileSubsessionCounterVals)
        maxPsc = max(profileSubsessionCounterVals)
        numMissing = 100.0*(maxPsc-minPsc+1 - len(profileSubsessionCounterVals))/(maxPsc-minPsc+1)
#     elif all([v==0 for v in bgc.values()]):
#         numMissing = 0
#     else:
        
    return numMissing

In [123]:

pctMissingList = subsessSummaries \
    .map(lambda id_data: getPctMissing(id_data[1])).collect()

In [127]:

plt.hist([x for x in pctMissingList if not isnan(x)], bins=100, alpha=0.5)
plt.title('distributions of subsession missingness')
plt.xlabel('subsessions missing')
plt.ylabel('count in bin')

Out[127]:

<matplotlib.text.Text at 0x7fc706181910>

In [128]:

plt.hist([x for x in pctMissingList if (not isnan(x) and x!=0)], bins=100, alpha=0.5)
plt.title('precentage of subsessions missing\n(among clients missing at least one subsession)')
plt.xlabel('pct of subsessions missing')
plt.ylabel('count in bin')

Out[128]:

<matplotlib.text.Text at 0x7fc705f57310>

Overall, what precentage of subsessions are missing?¶

In [137]:

def getNumMissingAndExpected(clientData):
    bgc = badGraphCounter(clientData)
    if (bgc["nodeWithInDegGtOne"]==1 or #branching
            bgc["repeatedProfileSubsessionCounter"]==1):  #repeated profileSubsessionCounter
        numMissing = np.nan
        numExpected = np.nan
    else:
        profileSubsessionCounterVals = sorted(map(lambda x:x["profileSubsessionCounter"], clientData["info"]))
        minPsc = min(profileSubsessionCounterVals)
        maxPsc = max(profileSubsessionCounterVals)
        numExpected = (maxPsc-minPsc+1)
        numMissing = 100.0*(maxPsc-minPsc+1 - len(profileSubsessionCounterVals))/numExpected       
    return (numMissing,numExpected)

In [141]:

numMissingAndExpected = subsessSummaries \
    .map(lambda id_data: getNumMissingAndExpected(id_data[1])).collect()

In [144]:

totalMissing = sum([x[0] for x in numMissingAndExpected if not isnan(x[0])])
totalExpected = sum([x[1] for x in numMissingAndExpected if not isnan(x[1])])
print "total subsessions expected in this sample:", totalExpected
print "total subsessions missing:", totalMissing
print "pct missing:", 1.0*totalMissing/totalExpected

total subsessions expected in this sample: 326318
total subsessions missing: 18508.5003925
pct missing: 0.0567192137501

So it appears that overall we're missing at least 5.7% of the subsessions we should be expecting to see-- and this number is quite possibly a low estimate, because we cannot estimate for clients with branching histories or profileSubsessionCounter resets.

As is typically the case in this kind of data, this appears to be a long-tail phenomenon: most clients are not missing any subsession; of those clients that are missing some subsessions, most clients are missing only a few; but there are a enough clients missing a few subsessions and enough clients missing a lot of subsessions that all together we're missing nearly 6% of the subsessions we'd expect to see.

In [ ]: