#!/usr/bin/env python # coding: utf-8 # #Matching totalTime and activeTicks per session signature # # https://bugzilla.mozilla.org/show_bug.cgi?id=1186713 # In[90]: import ujson as json import matplotlib.pyplot as plt import pandas as pd import numpy as np import plotly.plotly as py import networkx as nx import collections from IPython.display import display,HTML from scipy import stats from statsmodels.stats import proportion from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client get_ipython().run_line_magic('pylab', 'inline') # In[2]: import boto conn = boto.connect_s3() # In[3]: numCores = sc.defaultParallelism numCores # #Load saved data # In[4]: outBucketName = "net-mozaws-prod-us-west-2-pipeline-analysis" pathToOutput = "bcolloran/mergedDataPerClient/nightly/2015-07-09/8937clients/" # In[5]: #list the files in the bucket outBucket = conn.get_bucket(outBucketName) # Substitute in your bucket name bl = outBucket.list(prefix=pathToOutput) mergedDataSize = sum(key.size for key in bl) print "mergedDataPerClient_repart data size:", mergedDataSize/(1.0*(10**9)),"GB" list(bl)[-5:] # In[38]: #load the data-- # for a tiny sample, you can load one part: "part-00000" # or you can do more-- # ten parts: part-0000* # or 10% of parts: part-*0 # or all parts: part-* pathToMergedData = "s3n://"+outBucketName+"/"+pathToOutput+"part-*" print pathToMergedData mergedData = sc.sequenceFile(pathToMergedData) # In[39]: mergedData.count() # In[40]: v4PerClient = mergedData.mapValues(json.loads)\ .mapValues(lambda x:x["v4"]) # #Reason codes for subsessions with negative subsessionLength # # First, we need to establish the background rates of different reason codes # In[41]: numV4Subsess = v4PerClient.values().map(len).reduce(lambda x,y:x+y) numV4Subsess # In[42]: def countReasonsPerClient(pingList): reasons = collections.Counter() for p in pingList: reasons[p.get("payload/info",{}).get("reason","MISSING")] +=1 return reasons # In[43]: v4Reasons = v4PerClient.mapValues(countReasonsPerClient)\ .values()\ .reduce(lambda x,y:x+y) # In[44]: v4Reasons # Now that we have the baseline reasons, let's look into the the reason codes for subsession with negative subsessionLength # In[45]: def numWithNegDuration(pingList): num = 0 for p in pingList: if p.get("payload/info",{}).get("subsessionLength",0) < 0: num += 1 return num # In[46]: numV4SubsessNegDuration = v4PerClient.values().map(numWithNegDuration).reduce(lambda x,y:x+y) numV4SubsessNegDuration # In[47]: def countReasonsWithNegDurationPerClient(pingList): reasons = collections.Counter() for p in pingList: if p.get("payload/info",{}).get("subsessionLength",0) < 0: reasons[p.get("payload/info",{}).get("reason","MISSING")] += 1 return reasons # In[48]: v4ReasonsNegDuration = v4PerClient.mapValues(countReasonsWithNegDurationPerClient)\ .values()\ .reduce(lambda x,y:x+y) # In[49]: v4ReasonsNegDuration # ###Rates of negative subsessionLength by reason code # In[99]: for k in v4Reasons: print k print "\t",v4ReasonsNegDuration.get(k,0)/(1.0*v4Reasons[k]) # In[115]: testObs = np.array(v4Reasons.values()) testCount = np.array([v4ReasonsNegDuration.get(k,0) for k in v4Reasons.keys()]) chiSqAll = proportion.proportions_chisquare_allpairs(testCount,testObs) # In[116]: print v4Reasons.keys() print testCount print testObs,"\n" print chiSqAll # FWIW, based on a simple chi squared test, the pairwise difference in the rates of negative subsessionLength values may be significantly different between 'daily' pings and 'environment-change' pings-- but because of the samples sizes and event counts, we're violating the assumptions of the chi-squared simplification, so that result is suspect. None of the others even appear to be significant. In my judgement, we don't have enough evidence to say that any particular ping type has more negative subsessionLengths than any other. # #Get session signatures and supporting data for each client # # "Session signature" are tuples of (main,firstPaint,sessionRestored). Supporting data includes the date and a few other items, including the values of activeTicks and totalTime. # # Important note: in v2, we distinguish between "clean" and "aborted" sessions; on any day that has both a clean and an aborted session, it is *not possible* to know which values of activeticks and totalTime should be matched with which session. Therefore, we drop all v2 session signatures that are recorded on a day with both clean and aborted sessions. # # Also, the version of getV4SessionSignatures in this notebook is different than the one used in e.g. # - https://bugzilla.mozilla.org/show_bug.cgi?id=1154113 # - https://bugzilla.mozilla.org/show_bug.cgi?id=1150105 # # because in this case we don't want just the first subsession of each session # In[50]: mf10 = mergedData.mapValues(json.loads).take(10) # In[51]: mf10[0][1]["v4"][0].keys() # from bug https://bugzilla.mozilla.org/show_bug.cgi?id=1187054 , we need to look at payload/info/subsessionLength field # In[52]: def getV4SessionSignatures(v4Subsessions): # for subsessions with subsessionCounter==1, get the session signature sessStarts = [{"date": s['payload/info']['sessionStartDate'][0:10], "ssCounter": s['payload/info']['subsessionCounter'], "profSsCounter": s['payload/info']['profileSubsessionCounter'], "buildId": s['environment']['build']['buildId'], "sig": (s['payload/simpleMeasurements/main'], s['payload/simpleMeasurements/firstPaint'], s['payload/simpleMeasurements/sessionRestored']), "activeTicks":s['payload/simpleMeasurements/activeTicks'], "totalTime":s['payload/simpleMeasurements/totalTime'], "subsessionLength":s['payload/info']['subsessionLength'] } for s in v4Subsessions] return sessStarts def getV2SessionSignatures(v2): # get the data.days entries with session starts sessStartDays = [(date, data.get('org.mozilla.appSessions.previous')) for date,data in v2['data']['days'].items() if data.get('org.mozilla.appSessions.previous',None) ] # for each date with a session start, get the session signatures from that date sessStarts = [] try: for date,data in sessStartDays: if ('cleanActiveTicks' in data.keys()) and ('abortedActiveTicks' in data.keys()): pass else: if 'cleanActiveTicks' in data.keys(): activeTickKey = 'cleanActiveTicks' totalTimeKey = 'cleanTotalTime' else: activeTickKey = 'abortedActiveTicks' totalTimeKey = 'abortedTotalTime' for i in range(len(data['main'])): sessStarts.append( {"date": date, "sig": (data['main'][i], data['firstPaint'][i], data['sessionRestored'][i]), "activeTicks":data[activeTickKey][i], "totalTime":data[totalTimeKey][i]} ) except: sessStarts = "ERROR" return sessStarts # In[53]: sessionSigs.count() # In[54]: ssf = sessionSigs.take(10) # In[55]: def isIsoDateStr(x): nums = set("0123456789") try: if ( (set(x[0:4]).issubset(nums)) and (x[4]=="-") and (set(x[5:7]).issubset(nums)) and (x[7]=="-") and (set(x[8:10]).issubset(nums)) ): return True else: return False except: return False print isIsoDateStr("0000-00-00"),isIsoDateStr("01000-00-00"),isIsoDateStr("w000-00-00") def sessSortKey(dfRow): if isIsoDateStr(dfRow["date_v2"]): return dfRow["date_v2"],dfRow["profSsCounter"] else: return dfRow["date_v4"],dfRow["profSsCounter"] # In[56]: def sumTimeAndTicksPerSessSig(v4DataFrame): dfGrouped = v4DataFrame.groupby("sig") dfAgg = dfGrouped.agg({"activeTicks":np.sum, "totalTime":max, "date":min, "profSsCounter":min, "ssCounter":min, "buildId":min}) dfAgg["numSubsessions"] = dfGrouped["ssCounter"].agg(len) dfAgg["summedTime"] = dfGrouped["subsessionLength"].agg(sum) dfAgg["totalTimeMax"] = dfGrouped["totalTime"].agg(max) dfAgg.reset_index(inplace=True) return dfAgg # In[57]: # """display info about v4 subsessions""" # for i in range(len(ssf)): # ssgs = ssf[i] # dfv2 = pd.DataFrame(ssgs[1]["v2sigs"]) # dfv4 = pd.DataFrame(ssgs[1]["v4sigs"]) # # print dfv4.sort("profSsCounter") # print dfv4.keys() # dfv4Grouped = dfv4.groupby("sig") # print dfv4Grouped # for name, group in dfv4Grouped: # if len(group)>1: # group = group.sort("profSsCounter") # print "sessionSig:", name # display(group[['activeTicks','totalTime', 'profSsCounter', 'ssCounter','buildId']]) # In[58]: """ In this case, since we are only interested in session signatures that appear in both v2 and v2, we do an inner join. Ultimately, we'll be most interested in session with numSubsessions > 1, but we'll leave all subsessions on for now, b/c we should check that even simple sessions composed of just one session have the matching values for activeTicks and totalTime """ def mergedFrameFromSigLists(v2List,v4List): outCols = ["sig","date_v2","date_v4", "totalTime_v2","totalTime_v4","totalTimeMax","summedTime", "activeTicks_v2","activeTicks_v4", "ssCounter","profSsCounter","buildId","numSubsessions"] if v2List and v4List: try: dfv2 = pd.DataFrame(v2List) dfv4 = sumTimeAndTicksPerSessSig(pd.DataFrame(v4List)) df = pd.merge(dfv2,dfv4,how="inner",on="sig", suffixes=("_v2","_v4")) if len(df)>0: df["sortKey"] = df.apply(lambda x: sessSortKey(x),axis=1) df.sort("sortKey",inplace=True) # df.drop("sortKey", axis=1, inplace=True) df.index = range(1,len(df)+1) return df[outCols] except: return pd.DataFrame(columns=outCols) return pd.DataFrame(columns=outCols) # In[59]: colsToShow = [#"sig", "numSubsessions", "totalTime_v2","totalTimeMax","summedTime", "deltaV2V4","deltaV2V4Sum", "deltaV2V4Pct","deltaV2V4SumPct" # "activeTicks_v2","activeTicks_v4", # "totalTimeDiff","activeTicksDiff", ] for i in range(len(ssf)): mffsl = mergedFrameFromSigLists(ssf[i][1]["v2sigs"],ssf[i][1]["v4sigs"]) mffsl["deltaV2V4"] = mffsl["totalTime_v2"] - mffsl["totalTimeMax"] mffsl["deltaV2V4Sum"] = mffsl["totalTime_v2"] - mffsl["summedTime"] mffsl["deltaV2V4Pct"] = 1.0*(mffsl["totalTime_v2"] - mffsl["totalTimeMax"])/mffsl["totalTime_v2"] mffsl["deltaV2V4SumPct"] = 1.0*(mffsl["totalTime_v2"] - mffsl["summedTime"])/mffsl["totalTime_v2"] # mffsl["activeTicksDiff"] = mffsl["activeTicks_v2"] - mffsl["activeTicks_v4"] mffsl.sort("numSubsessions",inplace=True) if len(mffsl)>0: display(mffsl[colsToShow]) print list(mffsl["deltaV2V4Pct"]) # #How do the maximum of totalTime in v4 compare to the value recorded in v2? # # Since totalTime is a running total of the time recorded so far during a client session, the maximum value across v4 subsession pings belonging to a client session should match the value recorded in v2. # # Also, scale up to a the rest of the sample. # In[60]: #load the data-- # for a tiny sample, you can load one part: "part-00000" # or you can do more-- # ten parts: part-0000* # or 10% of parts: part-*0 # or all parts: part-* pathToMergedData_all = "s3n://"+outBucketName+"/"+pathToOutput+"part-*" print pathToMergedData_all mergedData_all = sc.sequenceFile(pathToMergedData_all) # In[61]: sessionSigs_all = mergedData_all.mapValues(json.loads)\ .mapValues(lambda x:{"v4sigs":getV4SessionSignatures(x["v4"]), "v2sigs":getV2SessionSignatures(x["v2"])} ) # In[62]: def getPctDiffsV2VsMaxTotalTime(df): if len(df)>0: return list(1.0*(df["totalTime_v2"] - df["totalTimeMax"]) /df["totalTime_v2"] ) else: return [] # In[63]: mergedFrames_all = sessionSigs_all.mapValues(lambda x: mergedFrameFromSigLists(x["v2sigs"], x["v4sigs"]) ) # In[64]: pctDiffsV2VsMaxTotalTime = mergedFrames_all \ .flatMapValues(getPctDiffsV2VsMaxTotalTime)\ .values() # In[65]: sinhBins = [np.sinh((i-100)/40.0) for i in range(201)] # In[66]: histOfDeltaV2V4Pct = pctDiffsV2VsMaxTotalTime\ .histogram( sinhBins ) # In[67]: # histOfDeltaV2V4Pct # In[68]: binEdges = histOfDeltaV2V4Pct[0] binVals = histOfDeltaV2V4Pct[1] plotX = reduce(lambda x,y:x+y,[binEdges[i:i+2] for i in range(len(binEdges)-1)] ) plotY = reduce(lambda x,y:x+y, map(lambda x: [x,x],binVals)) ax = plt.subplot(1,1,1) # ax.plot(binEdges[:-1],binVals) ax.plot(plotX,plotY) plt.yscale('log') plt.xscale('symlog') # In[69]: binEdges = histOfDeltaV2V4Pct[0] binVals = histOfDeltaV2V4Pct[1] plotX = binEdges[:-1] plotY = (1.0*np.cumsum(binVals))/sum(binVals) ax = plt.subplot(1,1,1) # ax.plot(binEdges[:-1],binVals) ax.plot(plotX,plotY) # plt.yscale('log') plt.xscale('symlog') plt.xlabel("pct difference of v4 from v2") plt.ylabel("cumulative proportion") # So the above plot shows the ECDF across the population of sessions [1] of the percentage difference between the value for totalTime recorded in v2 vs v4, more specifically: # # (totalTime_v2 - totalTime_v4)/totalTime_v2 # # Where totalTime_v4 is the maximum value of totalTime recorded in any of the sessions with this session signature. # # This looks mostly OK. In the vast majority of cases the values are off by only a few ms (maybe about 90%), and the percentage difference is miniscule in most cases. Maybe about 1% of sessions have recorded values that differ by by more than 25% between v2 and v4. # # The notably weird thing is that to the extent the number are off, we have them skewing negative (hardly any of the differences are positive). That means that totalTime_v2 < totalTime_v4. This strikes me as kind of strange -- if anything, I'd have expected to see a lot of cases of totalTime_v2 > totalTime_v4 (because if the last session is missing, then the v4 data sent back to our servers will not have any records of the last bits of the session. This may indicate problems with totalTime values in v4 that are too big. # # Things to look into next: # - tail behavior on the negative side in v4 # - overall sums between v2 and v4. Do these discrepancies matter when aggregated? # # # [1] Remember that we have dropped sessions from some days. See the comment above. Also, we are only considering session that have matching signatures in the v2 and v4 data # # Comparing overall sums of totalTime between v2 and v4 # # Do the discrepancies seen above amount to a substantial difference when aggregated? # # In[70]: mfa = mergedFrames_all.first() # In[71]: mfa # In[72]: def getTotalTimeSums(df): if len(df)>0: return {"totalTime_v2":sum(df["totalTime_v2"]), "totalTimeMax":sum(df["totalTimeMax"]) } else: return {"totalTime_v2":0,"totalTimeMax":0} def totalTimeAdder(tt1,tt2): return {"totalTime_v2": tt1["totalTime_v2"]+tt2["totalTime_v2"], "totalTimeMax": tt1["totalTimeMax"]+tt2["totalTimeMax"] } # In[73]: totalTimesAggregated = mergedFrames_all.values()\ .map(getTotalTimeSums) \ .reduce(lambda x,y:totalTimeAdder(x,y)) # In[74]: print totalTimesAggregated print 1.0*(totalTimesAggregated["totalTime_v2"] - totalTimesAggregated["totalTimeMax"])/totalTimesAggregated["totalTime_v2"] print 1.0*(totalTimesAggregated["totalTimeMax"])/totalTimesAggregated["totalTime_v2"] # Wow, so this is waaaay off. The sum of the per-session max of totalTime is more than 3x the value from v2. That's kind of crazy. # # It seems like this might come from the tail of the distribution above-- The cases where totalTime_v2 << totalTime_v4. Let's see if we can look at a few examples likt that. # ### What's the deal with the cases where totalTime_v2 << totalTimeMax ? # In[76]: def totalTimeMismatch(df): return 3*sum(df["totalTime_v2"]) < sum(df["totalTimeMax"]) # In[77]: totalTimeMismatchFrames = mergedFrames_all.filter(lambda id_df: totalTimeMismatch(id_df[1])) # In[79]: totalTimeMismatchFrames.count() # Only 125 clients in ~8k have this strong mismatch. That is reasonably in line with only 137 subsession pings having negative subsessionLength # In[80]: ttmf = totalTimeMismatchFrames.take(10) # In[81]: ttmf colsToShow = [#"sig", "numSubsessions", "totalTime_v2","totalTimeMax","summedTime", "deltaV2V4","deltaV2V4Sum", "deltaV2V4Pct","deltaV2V4SumPct" # "activeTicks_v2","activeTicks_v4", # "totalTimeDiff","activeTicksDiff", ] for i in range(len(ttmf)): mffsl = ttmf[i][1] mffsl["deltaV2V4"] = mffsl["totalTime_v2"] - mffsl["totalTimeMax"] mffsl["deltaV2V4Sum"] = mffsl["totalTime_v2"] - mffsl["summedTime"] mffsl["deltaV2V4Pct"] = 1.0*(mffsl["totalTime_v2"] - mffsl["totalTimeMax"])/mffsl["totalTime_v2"] mffsl["deltaV2V4SumPct"] = 1.0*(mffsl["totalTime_v2"] - mffsl["summedTime"])/mffsl["totalTime_v2"] # mffsl["activeTicksDiff"] = mffsl["activeTicks_v2"] - mffsl["activeTicks_v4"] mffsl.sort("numSubsessions",inplace=True) if len(mffsl)>0: display(mffsl[colsToShow]) print sum(mffsl["totalTime_v2"]), sum(mffsl["totalTimeMax"]) # Ok, so we see that one thing that's happening is that we have a good number of sessions with negative values (of quite extreme absolute value). So let's ignore those for now (though we may have to consider them later). # # ###Compare aggregate differences, dropping sessions with negative values. # In[69]: # mffsl[(mffsl["totalTime_v2"]>0) & (mffsl["totalTimeMax"]>0)] # In[70]: def getTotalTimeSums_dropNegs(df): df = df[(df["totalTime_v2"]>0) & (df["totalTimeMax"]>0)] if len(df)>0: return {"totalTime_v2":sum(df["totalTime_v2"]), "totalTimeMax":sum(df["totalTimeMax"]) } else: return {"totalTime_v2":0,"totalTimeMax":0} # In[72]: totalTimesAggregated = mergedFrames_all.values()\ .map(getTotalTimeSums_dropNegs) \ .reduce(lambda x,y:totalTimeAdder(x,y)) # In[73]: print totalTimesAggregated print 1.0*(totalTimesAggregated["totalTime_v2"] - totalTimesAggregated["totalTimeMax"])/totalTimesAggregated["totalTime_v2"] print 1.0*(totalTimesAggregated["totalTimeMax"])/totalTimesAggregated["totalTime_v2"] # So the aggregate discrepancy still exists, but it is much more muted. This kind of indicates that there might be a number of v2 sessions where the negative values are much worse in v2 than v4. So let's investigate the negative values a bit. # ###Look at some negative totalTime values # In[75]: def getNegRowFrame(df): df = df[(df["totalTime_v2"]<0) | (df["totalTimeMax"]<0)] return df # In[76]: negFrames = mergedFrames_all\ .mapValues(getNegRowFrame) \ .filter(lambda id_df: len(id_df[1])>0) # In[77]: nf = negFrames.take(20) # In[87]: def showFramesWithDeltas(frameList,colsToShow): for i in range(len(frameList)): df = frameList[i][1] df["deltaV2V4"] = df["totalTime_v2"] - df["totalTimeMax"] df["deltaV2V4Sum"] = df["totalTime_v2"] - df["summedTime"] df["deltaV2V4Pct"] = 1.0*(df["totalTime_v2"] - df["totalTimeMax"])/df["totalTime_v2"] df["deltaV2V4SumPct"] = 1.0*(df["totalTime_v2"] - df["summedTime"])/df["totalTime_v2"] df.sort("numSubsessions",inplace=True) if len(df)>0: display(df[colsToShow]) # In[86]: nf # In[92]: colsToShow = [#"sig", "numSubsessions", "totalTime_v2","totalTimeMax","summedTime", "deltaV2V4","deltaV2V4Sum", "deltaV2V4Pct","deltaV2V4SumPct" # "activeTicks_v2","activeTicks_v4", # "totalTimeDiff","activeTicksDiff", ] showFramesWithDeltas(nf,colsToShow) # for clientId,df in nf: # display(df) # So this is kind of disturbing-- in some cases above, the value of totalTime_v2 is quite far from totalTimeMax but summedTime is close to totalTime_v2 (remember that totalTimeMax is the maximum value recorded for "totalTime" over all the subsessions in a session, whereas summedTime is the sum of all the values of "info/subsessionLength" for all subsessions in a session. # # This kind of suggests that in the case of totalTimeMax, at the end of one of the subsessions the time counter/timer was positive and that something happened in a subsequent subsession that set the timer negative. But it kind of looks like there may only be negative values in the last subsession of a session. Maybe something is happening on shutdown? # ## Are all negative values "subsessionLength" in the final subsession? # We need to go back to the subsession data for this (rather than the frames of data aggregated into full sessions). # # For subsessions with subsessionLength < 0, We'll start by just looking how far this subsession is from the end of the series of subsessions in this session. Concretely, we'll look at the distribution of # # (number of subsessions in session) - (subsessionCounter) # # # In[100]: def getSubsessSigsForSessionsWithANegSubsessLength(sigList): negSigs = set([ ss["sig"] for ss in sigList if ss["subsessionLength"]<0 ]) out = {sig:[] for sig in negSigs} for ss in sigList: if ss["sig"] in negSigs: out[ss["sig"]].append(ss) return out # In[126]: sessionSigs_all.count() # In[110]: negSubsess = sessionSigs_all.mapValues(lambda x:x["v4sigs"])\ .mapValues(getSubsessSigsForSessionsWithANegSubsessLength) \ .filter(lambda id_dict: id_dict[1]!={} ) \ # In[125]: negSubsess.count() # So there are only 80 clients with a negative value for any subsessionLength. That's good. Out of 8973 clients that's less than 1%. # In[124]: negSubsess.take(10) # In[120]: def subsessPosOfNegs(negSubsessDict): out = [] for sig,ssList in negSubsessDict.items(): out+= [(len(ssList),ss["ssCounter"]) for ss in ssList if ss["subsessionLength"]<0] return out # In[121]: negSubsess.values().flatMap(subsessPosOfNegs).count() # But there are 137 subsessions with negative subsessionLength measurements. That means that the clients that are affected have almost two negative measurements on average. # In[122]: negSubsessSeqPos = negSubsess.values().flatMap(subsessPosOfNegs).collect() # In[123]: collections.Counter( negSubsessSeqPos) # Ok, so although the sample is small, it does appear to be th case that negative measurements are more likely to occur at the end of a session. In this data there are 19 sessions that have a negative measurement and two subsessions-- from the counter above, {(2, 2): 18, (2, 1): 1} -- so only *one* of these 19 negative subsessionLength measurements occured in the first of the two sessions. It would be virtually impossible to see this outcome if the negative measurements we're disributed at random accross subsessions. # # So in answer to **Are all negative values "subsessionLength" in the final subsession?** -- no, but *most are* # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # #Check for differences between summedTime and totalTime # # These should track each other quite closely. If they don't, then there may be problems with correctly assigning time to days, environments, etc. # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: