import pandas as pd  # our core data analysis toolkit
import numpy as np
import pylab as pl  # plotting libraries
from ggplot import *
from pandas import read_json  # a function for reading data in JSON files

# '%matplotlib inline' is IPython magic, not valid Python in a .py file;
# invoke it through the IPython API only when a shell is actually present.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # running as a plain script, not inside IPython

# This option gives a more pleasing visual style to our plots.
# 'display.mpl_style' was removed in pandas >= 0.23; OptionError subclasses
# KeyError, so this guard works on both old and new pandas.
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    pass

# The location of our playtest data file
filepath = "2014-05-13 makescape playtest.json"


def loadDataSortedByTimestamp(filepath):
    """Load the playtest JSON file and return a DataFrame sorted by the
    'timestamp' column, reindexed 0..len-1.

    Parameters
    ----------
    filepath : str
        Path to a JSON file readable by pandas.read_json; must contain a
        'timestamp' column.

    Returns
    -------
    pandas.DataFrame
    """
    x = read_json(filepath)
    # DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values is the supported equivalent.
    x = x.sort_values(by='timestamp')
    x.index = range(0, len(x))
    return x


ms = loadDataSortedByTimestamp(filepath)

# Quick structural overview of the data
ms.columns
len(ms.columns)
len(ms)  # returns 8505

columns = ['key', 'timestamp']
ms.head(n=5)[columns]

# Add a human-readable version of the millisecond epoch timestamps
ms['human-readable-timestamp'] = ms.timestamp.apply(
    lambda x: pd.Timestamp(x, unit='ms'))
columns = ['key', 'timestamp', 'human-readable-timestamp']
ms[columns].head()

# Manipulating the data to get a cumulative sum
# and nicely formatted timestamps.
# .copy() makes this an explicit new frame: we add columns below, and
# assigning into a boolean-filtered view raises SettingWithCopyWarning
# (and may silently fail to write).
connectionEvents = ms[ms.key == 'MakeConnectComponent'].copy()
connectionEvents['cumulativeCount'] = connectionEvents.timestamp.apply(
    lambda x: 1).cumsum()
connectionEvents['timestamp1'] = connectionEvents.timestamp.apply(
    lambda x: pd.Timestamp(x, unit='ms'))

# Creating the basic plot
p = ggplot(aes(x='timestamp1', y='cumulativeCount'), data=connectionEvents)
p = p + geom_line()
p = p + ggtitle('Cumulative Distribution of MakeConnectComponent Events')
p = p + xlab('Time')
p = p + ylab('Event Count')
# Showing the plot
print(p)
# Saving the plot
ggsave(plot=p, filename='cumulativeDistributionOfMakeConnectComponent1.png')

# Gaps of more than five hours between successive connect events mark the
# boundary between the two playtest days.
connectionEvents['delta1'] = connectionEvents.timestamp1.diff()
connectionEvents[connectionEvents.delta1 > np.timedelta64(5, 'h')]['timestamp']
whenBoringPartEnds = 1400077592798
day1 = connectionEvents[connectionEvents.timestamp < whenBoringPartEnds]
day2 = connectionEvents[connectionEvents.timestamp >= whenBoringPartEnds]

# Creating the day1 plot
p = ggplot(aes(x='timestamp1', y='cumulativeCount'), data=day1)
p = p + geom_line()
p = p + ggtitle('Cumulative Distribution of MakeConnectComponent Events\nDay1')
p = p + xlab('Time')
p = p + ylab('Event Count')
# Showing the plot
print(p)
# Saving the plot
ggsave(plot=p, filename='cumulativeDistributionDay1.png')

# Creating the day2 plot
p = ggplot(aes(x='timestamp1', y='cumulativeCount'), data=day2)
p = p + geom_line()
p = p + ggtitle('Cumulative Distribution of MakeConnectComponent Events\nDay2')
p = p + xlab('Time')
p = p + ylab('Event Count')
# Showing the plot
print(p)
# Saving the plot
ggsave(plot=p, filename='cumulativeDistributionDay2.png')

# Successive differences of the raw timestamps, up to fourth order
ms['delta1'] = ms.timestamp.diff()
ms['delta2'] = ms.delta1.diff()
ms['delta3'] = ms.delta2.diff()
ms['delta4'] = ms.delta3.diff()
# A boolean expression to select events where deltas 1-3 are all zero
thirdOrderZeroes = (ms.delta3 == 0) & (ms.delta2 == 0) & (ms.delta1 == 0)
# The columns we'll want to view
columns = ['key', 'timestamp', 'delta1', 'delta2', 'delta3', 'delta4']
ms[thirdOrderZeroes][columns]
ms[2470:2485][columns]

topFiveMostFrequentEvents = (ms.groupby('key').count()
                             .sort_values(by=['timestamp'], ascending=False)[:5])
topFiveMostFrequentEvents['timestamp']

# Restrict a fresh copy of the data to the five most frequent event keys;
# note delta1 here is computed AFTER filtering, so each delta is measured
# against the previous event of the same restricted set.
topFive = loadDataSortedByTimestamp(filepath)
topFiveMostFrequentEvents = list(
    ms.groupby('key').count().sort_values(by=['timestamp']).index)[-5:]
frequencyFilter = topFive.key.apply(lambda x: x in topFiveMostFrequentEvents)
topFive = topFive[frequencyFilter].copy()  # .copy(): we add delta1 below
topFive['delta1'] = topFive.timestamp.diff()

binBreaks = [-1, 1, 50, 100, 200, 300, 500, 1000]
# binBreaks = [1000, 2000, 3000, 4000, 5000]
p = ggplot(aes(x='delta1', fill='key'), data=topFive) + \
    geom_histogram(breaks=binBreaks) + \
    scale_x_continuous(breaks=binBreaks) + \
    ggtitle('Distribution of Time Deltas Between Successive Events') + \
    ylab('Number of Events') + \
    xlab('Time Between Events (ms)')
# print(p)
# ggsave(p, "histogram.png")
# topFive.head(n = 20)[['timestamp', 'delta1', 'key']]
print(p)
# Rebuild the top-five subset. NOTE: unlike the earlier pass, delta1 here is
# computed BEFORE filtering, so each delta is measured against the previous
# event of ANY kind — this ordering is deliberate and changes the result.
topFive = loadDataSortedByTimestamp(filepath)
topFiveMostFrequentEvents = list(
    ms.groupby('key').count().sort_values(by=['timestamp']).index)[-5:]
frequencyFilter = topFive.key.apply(lambda x: x in topFiveMostFrequentEvents)
topFive['delta1'] = topFive.timestamp.diff()
topFive = topFive[frequencyFilter]

# Smoothed per-key density of inter-event times
p = ggplot(aes(x='delta1', group='key'), data=topFive)
p = p + geom_density()  # a Kernel Density Estimate
p = p + scale_x_continuous(limits=[-1000, 20000])
p = p + facet_wrap(y='key', ncol=1, scales='fixed')
p = p + xlab('Time Between Successive Events (ms)')
p = p + ggtitle('Smoothed Kernel Density Estimates')
print(p)
ggsave(plot=p, filename='kernelDensityEstimate.png')

# Inter-event times for connect events only.
# .copy() avoids SettingWithCopyWarning when adding delta1 below.
connections = loadDataSortedByTimestamp(filepath)
connections = connections[connections.key == 'MakeConnectComponent'].copy()
connections['delta1'] = connections.timestamp.diff()
binBreaks = [0, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
p = ggplot(aes(x='delta1', fill='key'), data=connections) + \
    geom_histogram(breaks=binBreaks) + \
    scale_x_continuous(breaks=binBreaks) + \
    ggtitle('Distribution of Time Deltas Between Successive Events') + \
    ylab('Number of Events') + \
    xlab('Time Between Events (ms)')
print(p)
ggsave(plot=p, filename='histogram2.png')

# Closer look at the disconnect events within the top-five subset
topFive[topFive.key == 'MakeDisconnectComponent'][['key', 'delta1']].head()
print(topFive[topFive.key == 'MakeDisconnectComponent']['delta1'].describe())
topFive[topFive.key == 'MakeDisconnectComponent']['delta1'].plot(kind='kde')
# print(p)
# ggsave(p, "histogram.png")
# topFive.head(n = 20)[['timestamp', 'delta1', 'key']]

# Same histogram again from a fresh load (uses the binBreaks defined above)
connects = loadDataSortedByTimestamp(filepath)
connects = connects[connects.key == 'MakeConnectComponent'].copy()
connects['delta1'] = connects.timestamp.diff()
p = ggplot(aes(x='delta1', fill='key'), data=connects) + \
    geom_histogram(breaks=binBreaks) + \
    scale_x_continuous(breaks=binBreaks) + \
    ggtitle('Distribution of Time Deltas Between Successive Events') + \
    ylab('Number of Events') + \
    xlab('Time Between Events (ms)')
print(p)
len(connections[connections.delta1 <= 1000])  # 1744 events

# Per-key event counts, most frequent first.
# reset_index() is required: after groupby('key'), 'key' is the index, not a
# column, so selecting it in `columns` would raise KeyError.
columns = ['timestamp', 'key']
(ms.groupby('key').count().reset_index()
   .sort_values(by=['timestamp'], ascending=False)[columns])

# We can also see what this looks like as a plot.
# sort_values replaces the removed DataFrame.sort(columns=...); since
# pandas 0.23 it can sort by the 'key' index level directly, keeping the
# key names as bar labels.
msdata = ms.groupby('key').count().sort_values(by=['timestamp', 'key'],
                                               ascending=False)
p = msdata['timestamp'].plot(kind='bar')
print(p)
pl.savefig("barChart.jpg", dpi=300, figsize=(8, 11), bbox_inches='tight')

# Disconnect events whose component_list is null.
# The two conditions are combined into one mask: indexing an already-filtered
# frame with a mask built on the full frame relies on pandas reindexing the
# boolean key (a warned, fragile behavior).
nullComponentLists = pd.isnull(ms['component_list'])
ms[nullComponentLists & (ms.key == 'MakeDisconnectComponent')][
    ['key', 'component_list']]

# Spot-check the rows around the null component lists
ms[970:986][['key', 'timestamp']]
ms[980:984][['key', 'timestamp', 'delta1']]
list(ms[975:978].component_list)
list(ms[980:984].component_list)
ms[980:984].timestamp