In [1]:
import gzip
import json
from collections import Counter
from scipy.misc import factorial   #in newer scipy versions: from scipy.special import factorial
In [2]:
#Poisson pmf: probability of m events when the mean is z (uses np, imported in In [12] below)
def Poisson(m,z=1): return np.exp(-z)*z**m/factorial(m)
In [3]:
#list of events from 24 Sep '13
#each event has a key 'utc' and a key 'ip_hash'
events=json.load(gzip.open('130924_usage.json.gz'))['entries']
In [4]:
#list of events from 22 Sep '13
wevents=json.load(gzip.open('130922_usage.json.gz'))['entries']
In [5]:
#http://docs.python.org/2/library/time.html
from time import time,mktime,strptime,strftime,localtime

def ts2utc(ts):    #convert timestring to utc
    return int(mktime(strptime(ts,'%d/%b/%Y:%H:%M:%S %Z')))

def utc2ts(utc):   #convert utc to timestring
    return strftime('%d/%b/%Y:%H:%M:%S %Z',localtime(utc))
In [6]:
for ts in '24/Sep/2013:10','24/Sep/2013:00','22/Sep/2013:00':
    ts += ':00:00 EDT'
    print ts,'\t',ts2utc(ts)
24/Sep/2013:10:00:00 EDT 	1380031200
24/Sep/2013:00:00:00 EDT 	1379995200
22/Sep/2013:00:00:00 EDT 	1379822400
In [7]:
t= int(time())
days=t/86400
years=int(days/365.25)

print t,'seconds ago =',days,'days ago =',years,'years ago'
print 'so roughly',2016-years
1457638489 seconds ago = 16870 days ago = 46 years ago
so roughly 1970
In [8]:
print t,'\t',utc2ts(t)
print '        ',0,'\t',utc2ts(0)
1457638489 	10/Mar/2016:14:34:49 EST
         0 	31/Dec/1969:19:00:00 EST
In [9]:
from urllib2 import urlopen
from IPython.display import Image
Image(urlopen('http://imgs.xkcd.com/comics/bug.png').read())
Out[9]:
In [10]:
#create simple lists of utc's for all events in given timeframes
#use one-hour periods to avoid averaging over periods with very different rates
# 24 Sep 10:00-11:00
t10= [e['utc'] for e in events if 1380031200 <= e['utc'] < 1380031200 + 3600 ]
# 24 Sep 00:00-01:00
tm = [e['utc'] for e in events if 1379995200 <= e['utc'] < 1379995200 + 3600 ]
# 22 Sep 00:00-01:00
tmw=[e['utc'] for e in wevents if 1379822400 <= e['utc'] < 1379822400 + 3600 ]
In [11]:
#24 Sep 10:00-11:00, 24 Sep 00:00-01:00, 22 Sep 00:00-01:00
print len(t10),len(tm),len(tmw)
print 'averages:',map(lambda x: round(len(x)/3600.,3),[t10,tm,tmw]),'per second'
35315 23831 12176
averages: [9.81, 6.62, 3.382] per second
In [12]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
In [13]:
scounts=Counter(tm).values()
zeroes = 3600-len(scounts)

plt.hist(scounts+[0]*zeroes,np.arange(-.5,20),label='data')
plt.plot(3600*Poisson(np.arange(30),6.62),'r',lw=1.5,label='Poisson (avg=6.62/s)')
plt.xlabel('# web hits / second')
plt.ylabel('#seconds')
plt.title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 24 Sep 2013')
plt.xticks(range(0,31,2)), plt.xlim(-.5)
plt.legend();
In [14]:
scounts=Counter(tmw).values()
zeroes = 3600-len(scounts)

plt.hist(scounts+[0]*zeroes,np.arange(-.5,20),label='data')
plt.plot(3600*Poisson(np.arange(21),3.38),'r',lw=1.5,label='Poisson (avg=3.38/s)')
plt.xlabel('# web hits / second'), plt.ylabel('#seconds')
plt.title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 22 Sep 2013')
plt.xticks(range(21)), plt.xlim(-.5)
plt.legend();
In [15]:
scounts=Counter(t10).values()
zeroes = 3600-len(scounts)

plt.hist(scounts+[0]*zeroes,np.arange(-.5,30),label='data')
plt.plot(3600*Poisson(np.arange(30),9.81),'r',lw=1.5,label='Poisson (avg=9.81/s)')
plt.xlabel('# web hits / second')
plt.ylabel('#seconds')
plt.xticks(range(0,31,2)), plt.xlim(-.5)
plt.title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
plt.legend();

This is not quite Poissonian, since a single host can hit multiple times (browser malfunction, multiple clicks, robotic access), and hence click times can be correlated. Instead we can use the number of distinct hosts per second.
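
A quick way to quantify this (a minimal sketch, not in the original notebook, using the t10 list, Counter, and np already defined above): for a Poisson distribution the variance equals the mean, so a variance-to-mean ratio well above 1 signals overdispersion from correlated hits.

#sketch: variance-to-mean ratio of per-second hit counts (should be close to 1 for Poisson)
c10 = Counter(t10)                               #hits in each second with at least one hit
hitcounts = c10.values() + [0]*(3600-len(c10))   #include the empty seconds
print 'hits/sec: var/mean =', round(np.var(hitcounts)/np.mean(hitcounts), 2)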

In [16]:
mhosts={}
for e in events:
    utc = e['utc']
    if 1380031200 < utc < 1380031200+3600:  #24 Sep 2013 10:00-11:00
        if utc not in mhosts: mhosts[utc]=[]
        mhosts[utc].append(e['ip_hash'])
In [17]:
#count distinct hosts / sec
dhostsm=map(lambda x:len(set(x)),mhosts.values())
zeroes = 3600-len(dhostsm)
np.mean(dhostsm)
Out[17]:
9.2195664257921059
In [18]:
plt.hist(dhostsm+[0]*zeroes,np.arange(-.5,30),label='data')
p=Poisson(np.arange(24),9.22)
yerr=np.sqrt(3600*p*(1-p))
plt.errorbar(np.arange(24),3600*p,yerr,fmt='ro',lw=1.5,label='Poisson (avg=9.22/s)')
plt.xlabel('# distinct hosts / second')
plt.ylabel('#seconds')
plt.xticks(range(0,30,2)), plt.xlim(-.5)
plt.title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
plt.ylim(0,)
plt.legend();

If you want to play with any of this data, the following three files are available:
https://courses.cit.cornell.edu/info2950_2013fa/resources/130924_10.json.gz
https://courses.cit.cornell.edu/info2950_2013fa/resources/130924_00.json.gz
https://courses.cit.cornell.edu/info2950_2013fa/resources/130922_00.json.gz

They were created from utc's and ip_hashes for the three one-hour time periods as below:

In [19]:
#create lists of utc's and ip_hashes for all events in given timeframes
# 24 Sep 10:00-11:00
e24Sep13_10= [{'utc':e['utc'],'ip_hash':e['ip_hash']}
      for e in events if 1380031200 <= e['utc'] < 1380031200 + 3600 ]
# 24 Sep 00:00-01:00
e24Sep13_00= [{'utc':e['utc'],'ip_hash':e['ip_hash']}
       for e in events if 1379995200 <= e['utc'] < 1379995200 + 3600 ]
# 22 Sep 00:00-01:00
e22Sep13_00= [{'utc':e['utc'],'ip_hash':e['ip_hash']}
           for e in wevents if 1379822400 <= e['utc'] < 1379822400 + 3600 ]

json.dump(e24Sep13_10,gzip.open('130924_10.json.gz','w'))
json.dump(e24Sep13_00,gzip.open('130924_00.json.gz','w'))
json.dump(e22Sep13_00,gzip.open('130922_00.json.gz','w'))
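
To load one of the posted files back, note that (unlike the raw log files above) these dumps are plain lists, so no 'entries' key is needed. A minimal sketch:

#sketch: reload one of the posted files (a plain list of {'utc':..., 'ip_hash':...} dicts)
posted = json.load(gzip.open('130924_10.json.gz'))
print len(posted), posted[0].keys()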
In [ ]:
 
In [20]:
#"galaxies" in the sky
n=10  #number of bins per axis
mu=3  #average number of points per bin
xdata,ydata=n*np.random.random([2,mu*n*n])
In [21]:
plt.figure(figsize=(6.5,6))
plt.scatter(xdata,ydata);
plt.axis((0,10,0,10))
plt.grid('on');
plt.xticks(range(11))
plt.yticks(range(11));
In [22]:
n=100
mu=3
data=n*np.random.random([mu*n*n,2])
integerbins= [(int(x),int(y)) for x,y in data]
counts=Counter(integerbins)
countvalues = [counts[(x,y)] for x in range(n) for y in range(n)]

plt.hist(countvalues,np.arange(-.5,3*mu))
p=Poisson(np.arange(15),mu)
yerr=np.sqrt(n*n*p*(1-p))
plt.errorbar(np.arange(15),n*n*p,yerr,fmt='ro')
plt.plot(np.arange(0,15,.1),n*n*Poisson(np.arange(0,15,.1),mu),'r-')
plt.ylim(0,)
plt.xlim(-.5,15);

The above closely matches a Poisson distribution.
In class I reran the above for values of $n$ equal to 10, 100, and 1000, to show how the number counts fluctuate much more from the Poisson values for smaller values of $n$. The effective number of trials is $N=n^2$, since that is the number of squares in the 2d grid where data points are counted. Each number count can be considered a Bernoulli trial with probability of success given by the Poisson probability. For example, the probability of counting 2 data points in a bin is $q=p(2)\approx .22$, so the expected value is $Nq$, the variance is $Nq(1-q)$, and the fractional deviation is $\sqrt{Nq(1-q)} \big/ Nq = \sqrt{(1-q)/q}\Big/n$. This is roughly 2% for $q=.22$ and $n=100$, which is why the agreement for the $m=2$ bin looks good. For $n=10$, it becomes 20%, so the numbers visibly fluctuate much more from run to run. A similar estimate can be made for each number count $m=0,1,\ldots$, using $q=p(m)$.
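
A quick numerical check of that estimate (a sketch using the Poisson function from In [2] and numpy; it reproduces the ~20% and ~2% figures quoted above):

#sketch: fractional deviation sqrt((1-q)/q)/n for the m=2 count, with q=p(2) at mu=3
q = Poisson(2,3)            #roughly .22
for n in (10, 100, 1000):
    print n, round(np.sqrt((1-q)/q)/n, 3)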

In [ ]: