#!/usr/bin/env python
# coding: utf-8

# In[1]:

import gzip
import json
import numpy as np   #used by Poisson() below
from collections import Counter
from scipy.misc import factorial   #in newer scipy this has moved to scipy.special.factorial


# In[2]:

def Poisson(m,z=1):
    return np.exp(-z)*z**m/factorial(m)


# In[3]:

#list of events from 24 Sep '13
#each event has a key 'utc' and a key 'ip_hash'
events=json.load(gzip.open('130924_usage.json.gz'))['entries']


# In[4]:

#list of events from 22 Sep '13
wevents=json.load(gzip.open('130922_usage.json.gz'))['entries']


# In[5]:

#http://docs.python.org/2/library/time.html
from time import time,mktime,strptime,strftime,localtime

def ts2utc(ts): #convert timestring to utc
    return int(mktime(strptime(ts,'%d/%b/%Y:%H:%M:%S %Z')))

def utc2ts(utc): #convert utc to timestring
    return strftime('%d/%b/%Y:%H:%M:%S %Z',localtime(utc))


# In[6]:

for ts in '24/Sep/2013:10','24/Sep/2013:00','22/Sep/2013:00':
    ts += ':00:00 EDT'
    print ts,'\t',ts2utc(ts)


# In[7]:

t= int(time())
days=t/86400
years=int(days/365.25)
print t,'seconds ago =',days,'days ago =',years,'years ago'
print 'so roughly',2016-years


# In[8]:

print t,'\t',utc2ts(t)
print ' ',0,'\t',utc2ts(0)


# In[9]:

from urllib2 import urlopen
from IPython.display import Image
Image(urlopen('http://imgs.xkcd.com/comics/bug.png').read())


# In[10]:

#create simple lists of utc's for all events in given timeframes
#use one hour periods to avoid averaging periods with too different average rates
# 24 Sep 10:00-11:00
t10= [e['utc'] for e in events if 1380031200 <= e['utc'] < 1380031200 + 3600 ]
# 24 Sep 00:00-01:00
tm = [e['utc'] for e in events if 1379995200 <= e['utc'] < 1379995200 + 3600 ]
# 22 Sep 00:00-01:00
tmw=[e['utc'] for e in wevents if 1379822400 <= e['utc'] < 1379822400 + 3600 ]


# In[11]:

#24 Sep 10:00-11:00, 24 Sep 00:00-01:00, 22 Sep 00:00-01:00
print len(t10),len(tm),len(tmw)
print 'averages:',map(lambda x: round(len(x)/3600.,3),[t10,tm,tmw]),'per second'


# In[12]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[13]:

scounts=Counter(tm).values()
zeroes = 3600-len(scounts)
plt.hist(scounts+[0]*zeroes,np.arange(-.5,20),label='data')
plt.plot(3600*Poisson(np.arange(30),6.62),'r',lw=1.5,label='Poisson (avg=6.62/s)')
plt.xlabel('# web hits / second')
plt.ylabel('#seconds')
plt.title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 24 Sep 2013')
plt.xticks(range(0,31,2)), plt.xlim(-.5)
plt.legend();


# In[14]:

scounts=Counter(tmw).values()
zeroes = 3600-len(scounts)
plt.hist(scounts+[0]*zeroes,np.arange(-.5,20),label='data')
plt.plot(3600*Poisson(np.arange(21),3.38),'r',lw=1.5,label='Poisson (avg=3.38/s)')
plt.xlabel('# web hits / second'), plt.ylabel('#seconds')
plt.title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 22 Sep 2013')
plt.xticks(range(21)), plt.xlim(-.5)
plt.legend();


# In[15]:

scounts=Counter(t10).values()
zeroes = 3600-len(scounts)
plt.hist(scounts+[0]*zeroes,np.arange(-.5,30),label='data')
plt.plot(3600*Poisson(np.arange(30),9.81),'r',lw=1.5,label='Poisson (avg=9.81/s)')
plt.xlabel('# web hits / second')
plt.ylabel('#seconds')
plt.xticks(range(0,31,2)), plt.xlim(-.5)
plt.title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
plt.legend();


# This is not quite Poissonian, since a single host could be hitting multiple times (browser malfunction, multiple clicks, robotic access), so the click times could be correlated.
# Instead we can count the number of *distinct* hosts per second.
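# Before switching to distinct hosts, one quick way to quantify the deviation of the raw counts from Poisson is to compare their variance to their mean: a Poisson distribution has variance equal to its mean, while correlated hits (repeat clicks, robots) inflate the variance. This is only a minimal sketch, assuming the `t10` list and the zero-padding convention used above:

# In[ ]:

#for a Poisson process variance ~ mean; variance >> mean signals overdispersion
hcounts=Counter(t10).values()
hcounts=np.array(hcounts+[0]*(3600-len(hcounts)))   #include the seconds with no hits
print 'mean =',round(hcounts.mean(),2),' variance =',round(hcounts.var(),2)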
# In[16]:

mhosts={}
for e in events:
    utc = e['utc']
    if 1380031200 < utc < 1380031200+3600: #24 Sep 2013 10:00
        if utc not in mhosts:
            mhosts[utc]=[]
        mhosts[utc].append(e['ip_hash'])


# In[17]:

#count distinct hosts / sec
dhostsm=map(lambda x:len(set(x)),mhosts.values())
zeroes = 3600-len(dhostsm)
np.mean(dhostsm)


# In[18]:

plt.hist(dhostsm+[0]*zeroes,np.arange(-.5,30),label='data')
p=Poisson(np.arange(24),9.22)
yerr=np.sqrt(3600*p*(1-p))
plt.errorbar(np.arange(24),3600*p,yerr,fmt='ro',lw=1.5,label='Poisson (avg=9.22/s)')
plt.xlabel('# distinct hosts / second')
plt.ylabel('#seconds')
plt.xticks(range(0,30,2)), plt.xlim(-.5)
plt.title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
plt.ylim(0,)
plt.legend();


# If you want to play with any of this data, the following three files are available:
# https://courses.cit.cornell.edu/info2950_2013fa/resources/130924_10.json.gz
# https://courses.cit.cornell.edu/info2950_2013fa/resources/130924_00.json.gz
# https://courses.cit.cornell.edu/info2950_2013fa/resources/130922_00.json.gz
#
# They were created from utc's and ip_hashes for the three one-hour time periods as below:

# In[19]:

#create lists of utc's and ip_hashes for all events in given timeframes
# 24 Sep 10:00-11:00
e24Sep13_10= [{'utc':e['utc'],'ip_hash':e['ip_hash']} for e in events if 1380031200 <= e['utc'] < 1380031200 + 3600 ]
# 24 Sep 00:00-01:00
e24Sep13_00= [{'utc':e['utc'],'ip_hash':e['ip_hash']} for e in events if 1379995200 <= e['utc'] < 1379995200 + 3600 ]
# 22 Sep 00:00-01:00
e22Sep13_00= [{'utc':e['utc'],'ip_hash':e['ip_hash']} for e in wevents if 1379822400 <= e['utc'] < 1379822400 + 3600 ]

json.dump(e24Sep13_10,gzip.open('130924_10.json.gz','w'))
json.dump(e24Sep13_00,gzip.open('130924_00.json.gz','w'))
json.dump(e22Sep13_00,gzip.open('130922_00.json.gz','w'))


# In[20]:

#"galaxies" in the sky
n=10   #number of bins per axis
mu=3   #average number of points per bin
xdata,ydata=n*np.random.random([2,mu*n*n])


# In[21]:

plt.figure(figsize=(6.5,6))
plt.scatter(xdata,ydata);
plt.axis((0,10,0,10))
plt.grid('on');
plt.xticks(range(11))
plt.yticks(range(11));


# In[22]:

n=100
mu=3
data=n*np.random.random([mu*n*n,2])
integerbins= [(int(x),int(y)) for x,y in data]
counts=Counter(integerbins)
countvalues = [counts[(x,y)] for x in range(n) for y in range(n)]
plt.hist(countvalues,np.arange(-.5,3*mu))
p=Poisson(np.arange(15),mu)
yerr=np.sqrt(n*n*p*(1-p))
plt.errorbar(np.arange(15),n*n*p,yerr,fmt='ro')
plt.plot(np.arange(0,15,.1),n*n*Poisson(np.arange(0,15,.1),mu),'r-')
plt.ylim(0,)
plt.xlim(-.5,15);


# The above closely matches a Poisson distribution.
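# To put a rough number on "closely matches", one could run a chi-square goodness-of-fit test of the observed bin occupancies against the Poisson expectation. This is only a sketch, assuming the `countvalues`, `n`, `mu`, and `Poisson` defined above; counts of 10 or more are lumped into a single tail bin so that no expected frequency is too small.

# In[ ]:

from scipy.stats import chisquare
N=n*n      #number of grid squares (trials)
maxm=10    #lump counts >= maxm into one tail bin
observed=np.array([sum(1 for c in countvalues if c==m) for m in range(maxm)]
                  +[sum(1 for c in countvalues if c>=maxm)])
expected=np.array([N*Poisson(m,mu) for m in range(maxm)]
                  +[N*(1-sum(Poisson(m,mu) for m in range(maxm)))])
chi2,pval=chisquare(observed,expected)
print 'chi2 =',round(chi2,1),' p-value =',round(pval,3)
#a p-value that is not small is consistent with the Poisson hypothesis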
# In class I reran the above for values of $n$ equal to 10, 100, and 1000, to show how the number counts fluctuate much more about the Poisson values for smaller values of $n$.

# The effective number of trials is $N=n^2$, since that is the number of squares in the 2d grid where data points are counted. Each number count can be considered a Bernoulli trial with probability of success given by the Poisson probability. For example, the probability of counting 2 data points in a bin is given by $q=p(2)\approx .22$, so the expected value is $Nq$, the variance $Nq(1-q)$, and the relative fractional deviation $\sqrt{Nq(1-q)}\big/Nq = \sqrt{(1-q)/q}\big/n$. This is roughly 2% for $q=.22$ and $n=100$, which is why the agreement for the bin counting 2's looks good. For $n=10$, it becomes 20%, so the numbers visibly fluctuate much more from run to run. A similar estimate can be made for each number count $m=0,1,\ldots$, using $q=p(m)$.
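# As a quick numerical check of the estimate above, the relative fractional deviation $\sqrt{(1-q)/q}\big/n$ can be tabulated for each count value $m$, using $q=p(m)$. A minimal sketch, assuming the `Poisson` function defined earlier:

# In[ ]:

#relative fractional deviation sqrt((1-q)/q)/n for q=p(m), with mean mu=3
mu=3
for n in 10,100,1000:
    devs=[round(np.sqrt((1-Poisson(m,mu))/Poisson(m,mu))/n,3) for m in range(7)]
    print 'n =',n,': deviations for m=0..6:',devs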