import pandas as pd  # our core data analysis toolkit
import numpy as np
import pylab as pl  # plotting libraries
from ggplot import *
from pandas import read_json  # a function for reading data in JSON files

# '%matplotlib inline' is IPython magic, not valid Python in a .py file;
# invoke it through the IPython API only when a shell is actually present.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # running as a plain script, not inside IPython

# This option gives a more pleasing visual style to our plots.
# 'display.mpl_style' was removed in pandas >= 0.23; OptionError subclasses
# KeyError, so this guard works on both old and new pandas.
try:
    pd.set_option('display.mpl_style', 'default')
except KeyError:
    pass

# The location of our playtest data file
filepath = "2014-05-13 makescape playtest.json"


def loadDataSortedByTimestamp(filepath):
    """Load the playtest JSON file and return a DataFrame sorted by the
    'timestamp' column, reindexed 0..len-1.

    Parameters
    ----------
    filepath : str
        Path to a JSON file readable by pandas.read_json; must contain a
        'timestamp' column.

    Returns
    -------
    pandas.DataFrame
    """
    x = read_json(filepath)
    # DataFrame.sort(columns=...) was removed in pandas 0.20;
    # sort_values is the supported equivalent.
    x = x.sort_values(by='timestamp')
    x.index = range(0, len(x))
    return x


ms = loadDataSortedByTimestamp(filepath)

# Quick structural overview of the data
ms.columns
len(ms.columns)
len(ms)  # returns 8505

columns = ['key', 'timestamp']
ms.head(n=5)[columns]

# Add a human-readable version of the millisecond epoch timestamps
ms['human-readable-timestamp'] = ms.timestamp.apply(
    lambda x: pd.Timestamp(x, unit='ms'))
columns = ['key', 'timestamp', 'human-readable-timestamp']
ms[columns].head()

# Manipulating the data to get a cumulative sum
# and nicely formatted timestamps.
# .copy() makes this an explicit new frame: we add columns below, and
# assigning into a boolean-filtered view raises SettingWithCopyWarning
# (and may silently fail to write).
connectionEvents = ms[ms.key == 'MakeConnectComponent'].copy()
connectionEvents['cumulativeCount'] = connectionEvents.timestamp.apply(
    lambda x: 1).cumsum()
connectionEvents['timestamp1'] = connectionEvents.timestamp.apply(
    lambda x: pd.Timestamp(x, unit='ms'))

# Creating the basic plot
p = ggplot(aes(x='timestamp1', y='cumulativeCount'), data=connectionEvents)
p = p + geom_line()
p = p + ggtitle('Cumulative Distribution of MakeConnectComponent Events')
p = p + xlab('Time')
p = p + ylab('Event Count')
# Showing the plot
print(p)
# Saving the plot
ggsave(plot=p, filename='cumulativeDistributionOfMakeConnectComponent1.png')

# Gaps of more than five hours between successive connect events mark the
# boundary between the two playtest days.
connectionEvents['delta1'] = connectionEvents.timestamp1.diff()
connectionEvents[connectionEvents.delta1 > np.timedelta64(5, 'h')]['timestamp']
whenBoringPartEnds = 1400077592798
day1 = connectionEvents[connectionEvents.timestamp < whenBoringPartEnds]
day2 = connectionEvents[connectionEvents.timestamp >= whenBoringPartEnds]

# Creating the day1 plot
p = ggplot(aes(x='timestamp1', y='cumulativeCount'), data=day1)
p = p + geom_line()
p = p + ggtitle('Cumulative Distribution of MakeConnectComponent Events\nDay1')
p = p + xlab('Time')
p = p + ylab('Event Count')
# Showing the plot
print(p)
# Saving the plot
ggsave(plot=p, filename='cumulativeDistributionDay1.png')

# Creating the day2 plot
p = ggplot(aes(x='timestamp1', y='cumulativeCount'), data=day2)
p = p + geom_line()
p = p + ggtitle('Cumulative Distribution of MakeConnectComponent Events\nDay2')
p = p + xlab('Time')
p = p + ylab('Event Count')
# Showing the plot
print(p)
# Saving the plot
ggsave(plot=p, filename='cumulativeDistributionDay2.png')

# Successive differences of the raw timestamps, up to fourth order
ms['delta1'] = ms.timestamp.diff()
ms['delta2'] = ms.delta1.diff()
ms['delta3'] = ms.delta2.diff()
ms['delta4'] = ms.delta3.diff()
# A boolean expression to select events where deltas 1-3 are all zero
thirdOrderZeroes = (ms.delta3 == 0) & (ms.delta2 == 0) & (ms.delta1 == 0)
# The columns we'll want to view
columns = ['key', 'timestamp', 'delta1', 'delta2', 'delta3', 'delta4']
ms[thirdOrderZeroes][columns]
ms[2470:2485][columns]

topFiveMostFrequentEvents = (ms.groupby('key').count()
                             .sort_values(by=['timestamp'], ascending=False)[:5])
topFiveMostFrequentEvents['timestamp']

# Restrict a fresh copy of the data to the five most frequent event keys;
# note delta1 here is computed AFTER filtering, so each delta is measured
# against the previous event of the same restricted set.
topFive = loadDataSortedByTimestamp(filepath)
topFiveMostFrequentEvents = list(
    ms.groupby('key').count().sort_values(by=['timestamp']).index)[-5:]
frequencyFilter = topFive.key.apply(lambda x: x in topFiveMostFrequentEvents)
topFive = topFive[frequencyFilter].copy()  # .copy(): we add delta1 below
topFive['delta1'] = topFive.timestamp.diff()

binBreaks = [-1, 1, 50, 100, 200, 300, 500, 1000]
# binBreaks = [1000, 2000, 3000, 4000, 5000]
p = ggplot(aes(x='delta1', fill='key'), data=topFive) + \
    geom_histogram(breaks=binBreaks) + \
    scale_x_continuous(breaks=binBreaks) + \
    ggtitle('Distribution of Time Deltas Between Successive Events') + \
    ylab('Number of Events') + \
    xlab('Time Between Events (ms)')
# print(p)
# ggsave(p, "histogram.png")
# topFive.head(n = 20)[['timestamp', 'delta1', 'key']]
print(p)
# Rebuild the top-five subset. NOTE: unlike the earlier pass, delta1 here is
# computed BEFORE filtering, so each delta is measured against the previous
# event of ANY kind — this ordering is deliberate and changes the result.
topFive = loadDataSortedByTimestamp(filepath)
topFiveMostFrequentEvents = list(
    ms.groupby('key').count().sort_values(by=['timestamp']).index)[-5:]
frequencyFilter = topFive.key.apply(lambda x: x in topFiveMostFrequentEvents)
topFive['delta1'] = topFive.timestamp.diff()
topFive = topFive[frequencyFilter]

# Smoothed per-key density of inter-event times
p = ggplot(aes(x='delta1', group='key'), data=topFive)
p = p + geom_density()  # a Kernel Density Estimate
p = p + scale_x_continuous(limits=[-1000, 20000])
p = p + facet_wrap(y='key', ncol=1, scales='fixed')
p = p + xlab('Time Between Successive Events (ms)')
p = p + ggtitle('Smoothed Kernel Density Estimates')
print(p)
ggsave(plot=p, filename='kernelDensityEstimate.png')

# Inter-event times for connect events only.
# .copy() avoids SettingWithCopyWarning when adding delta1 below.
connections = loadDataSortedByTimestamp(filepath)
connections = connections[connections.key == 'MakeConnectComponent'].copy()
connections['delta1'] = connections.timestamp.diff()
binBreaks = [0, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
p = ggplot(aes(x='delta1', fill='key'), data=connections) + \
    geom_histogram(breaks=binBreaks) + \
    scale_x_continuous(breaks=binBreaks) + \
    ggtitle('Distribution of Time Deltas Between Successive Events') + \
    ylab('Number of Events') + \
    xlab('Time Between Events (ms)')
print(p)
ggsave(plot=p, filename='histogram2.png')

# Closer look at the disconnect events within the top-five subset
topFive[topFive.key == 'MakeDisconnectComponent'][['key', 'delta1']].head()
print(topFive[topFive.key == 'MakeDisconnectComponent']['delta1'].describe())
topFive[topFive.key == 'MakeDisconnectComponent']['delta1'].plot(kind='kde')
# print(p)
# ggsave(p, "histogram.png")
# topFive.head(n = 20)[['timestamp', 'delta1', 'key']]

# Same histogram again from a fresh load (uses the binBreaks defined above)
connects = loadDataSortedByTimestamp(filepath)
connects = connects[connects.key == 'MakeConnectComponent'].copy()
connects['delta1'] = connects.timestamp.diff()
p = ggplot(aes(x='delta1', fill='key'), data=connects) + \
    geom_histogram(breaks=binBreaks) + \
    scale_x_continuous(breaks=binBreaks) + \
    ggtitle('Distribution of Time Deltas Between Successive Events') + \
    ylab('Number of Events') + \
    xlab('Time Between Events (ms)')
print(p)
len(connections[connections.delta1 <= 1000])  # 1744 events

# Per-key event counts, most frequent first.
# reset_index() is required: after groupby('key'), 'key' is the index, not a
# column, so selecting it in `columns` would raise KeyError.
columns = ['timestamp', 'key']
(ms.groupby('key').count().reset_index()
   .sort_values(by=['timestamp'], ascending=False)[columns])

# We can also see what this looks like as a plot.
# sort_values replaces the removed DataFrame.sort(columns=...); since
# pandas 0.23 it can sort by the 'key' index level directly, keeping the
# key names as bar labels.
msdata = ms.groupby('key').count().sort_values(by=['timestamp', 'key'],
                                               ascending=False)
p = msdata['timestamp'].plot(kind='bar')
print(p)
pl.savefig("barChart.jpg", dpi=300, figsize=(8, 11), bbox_inches='tight')

# Disconnect events whose component_list is null.
# The two conditions are combined into one mask: indexing an already-filtered
# frame with a mask built on the full frame relies on pandas reindexing the
# boolean key (a warned, fragile behavior).
nullComponentLists = pd.isnull(ms['component_list'])
ms[nullComponentLists & (ms.key == 'MakeDisconnectComponent')][
    ['key', 'component_list']]

# Spot-check the rows around the null component lists
ms[970:986][['key', 'timestamp']]
ms[980:984][['key', 'timestamp', 'delta1']]
list(ms[975:978].component_list)
list(ms[980:984].component_list)
ms[980:984].timestamp