#!/usr/bin/env python
# coding: utf-8

# In[1]:

import gzip
import json
import numpy as np   #used by Poisson() below
from collections import Counter
from scipy.misc import factorial   #in newer scipy this has moved to scipy.special.factorial


# In[2]:

def Poisson(m,z=1):
    return np.exp(-z)*z**m/factorial(m)


# In[3]:

#list of events from 24 Sep '13
#each event has a key 'utc' and a key 'ip_hash'
events=json.load(gzip.open('130924_usage.json.gz'))['entries']


# In[4]:

#list of events from 22 Sep '13
wevents=json.load(gzip.open('130922_usage.json.gz'))['entries']


# In[5]:

#http://docs.python.org/2/library/time.html
from time import time,mktime,strptime,strftime,localtime

def ts2utc(ts): #convert timestring to utc
    return int(mktime(strptime(ts,'%d/%b/%Y:%H:%M:%S %Z')))

def utc2ts(utc): #convert utc to timestring
    return strftime('%d/%b/%Y:%H:%M:%S %Z',localtime(utc))


# In[6]:

for ts in '24/Sep/2013:10','24/Sep/2013:00','22/Sep/2013:00':
    ts += ':00:00 EDT'
    print ts,'\t',ts2utc(ts)


# In[7]:

t= int(time())
days=t/86400
years=int(days/365.25)
print t,'seconds ago =',days,'days ago =',years,'years ago'
print 'so roughly',2016-years


# In[8]:

print t,'\t',utc2ts(t)
print ' ',0,'\t',utc2ts(0)


# In[9]:

from urllib2 import urlopen
from IPython.display import Image
Image(urlopen('http://imgs.xkcd.com/comics/bug.png').read())


# In[10]:

#create simple lists of utc's for all events in given timeframes
#use one hour periods to avoid averaging periods with too different average rates
# 24 Sep 10:00-11:00
t10= [e['utc'] for e in events if 1380031200 <= e['utc'] < 1380031200 + 3600 ]
# 24 Sep 00:00-01:00
tm = [e['utc'] for e in events if 1379995200 <= e['utc'] < 1379995200 + 3600 ]
# 22 Sep 00:00-01:00
tmw=[e['utc'] for e in wevents if 1379822400 <= e['utc'] < 1379822400 + 3600 ]


# In[11]:

#24 Sep 10:00-11:00, 24 Sep 00:00-01:00, 22 Sep 00:00-01:00
print len(t10),len(tm),len(tmw)
print 'averages:',map(lambda x: round(len(x)/3600.,3),[t10,tm,tmw]),'per second'


# In[12]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[13]:

scounts=Counter(tm).values()
zeroes = 3600-len(scounts)
plt.hist(scounts+[0]*zeroes,np.arange(-.5,20),label='data')
plt.plot(3600*Poisson(np.arange(30),6.62),'r',lw=1.5,label='Poisson (avg=6.62/s)')
plt.xlabel('# web hits / second')
plt.ylabel('#seconds')
plt.title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 24 Sep 2013')
plt.xticks(range(0,31,2)), plt.xlim(-.5)
plt.legend();


# In[14]:

scounts=Counter(tmw).values()
zeroes = 3600-len(scounts)
plt.hist(scounts+[0]*zeroes,np.arange(-.5,20),label='data')
plt.plot(3600*Poisson(np.arange(21),3.38),'r',lw=1.5,label='Poisson (avg=3.38/s)')
plt.xlabel('# web hits / second'), plt.ylabel('#seconds')
plt.title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 22 Sep 2013')
plt.xticks(range(21)), plt.xlim(-.5)
plt.legend();


# In[15]:

scounts=Counter(t10).values()
zeroes = 3600-len(scounts)
plt.hist(scounts+[0]*zeroes,np.arange(-.5,30),label='data')
plt.plot(3600*Poisson(np.arange(30),9.81),'r',lw=1.5,label='Poisson (avg=9.81/s)')
plt.xlabel('# web hits / second')
plt.ylabel('#seconds')
plt.xticks(range(0,31,2)), plt.xlim(-.5)
plt.title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
plt.legend();


# This is not quite Poissonian, since a single host could be hitting multiple times (browser malfunction, multiple clicks, robotic access), so the click times could be correlated.
# Instead we can count the number of *distinct* hosts per second.
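# Before switching to distinct hosts, one quick way to quantify the deviation of the raw counts from Poisson is to compare their variance to their mean: a Poisson distribution has variance equal to its mean, while correlated hits (repeat clicks, robots) inflate the variance. This is only a minimal sketch, assuming the `t10` list and the zero-padding convention used above:

# In[ ]:

#for a Poisson process variance ~ mean; variance >> mean signals overdispersion
hcounts=Counter(t10).values()
hcounts=np.array(hcounts+[0]*(3600-len(hcounts)))   #include the seconds with no hits
print 'mean =',round(hcounts.mean(),2),' variance =',round(hcounts.var(),2)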
# In[16]:

mhosts={}
for e in events:
    utc = e['utc']
    if 1380031200 < utc < 1380031200+3600: #24 Sep 2013 10:00
        if utc not in mhosts:
            mhosts[utc]=[]
        mhosts[utc].append(e['ip_hash'])


# In[17]:

#count distinct hosts / sec
dhostsm=map(lambda x:len(set(x)),mhosts.values())
zeroes = 3600-len(dhostsm)
np.mean(dhostsm)


# In[18]:

plt.hist(dhostsm+[0]*zeroes,np.arange(-.5,30),label='data')
p=Poisson(np.arange(24),9.22)
yerr=np.sqrt(3600*p*(1-p))
plt.errorbar(np.arange(24),3600*p,yerr,fmt='ro',lw=1.5,label='Poisson (avg=9.22/s)')
plt.xlabel('# distinct hosts / second')
plt.ylabel('#seconds')
plt.xticks(range(0,30,2)), plt.xlim(-.5)
plt.title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
plt.ylim(0,)
plt.legend();


# If you want to play with any of this data, the following three files are available:
# https://courses.cit.cornell.edu/info2950_2013fa/resources/130924_10.json.gz
# https://courses.cit.cornell.edu/info2950_2013fa/resources/130924_00.json.gz
# https://courses.cit.cornell.edu/info2950_2013fa/resources/130922_00.json.gz
#
# They were created from utc's and ip_hashes for the three one-hour time periods as below:

# In[19]:

#create lists of utc's and ip_hashes for all events in given timeframes
# 24 Sep 10:00-11:00
e24Sep13_10= [{'utc':e['utc'],'ip_hash':e['ip_hash']} for e in events if 1380031200 <= e['utc'] < 1380031200 + 3600 ]
# 24 Sep 00:00-01:00
e24Sep13_00= [{'utc':e['utc'],'ip_hash':e['ip_hash']} for e in events if 1379995200 <= e['utc'] < 1379995200 + 3600 ]
# 22 Sep 00:00-01:00
e22Sep13_00= [{'utc':e['utc'],'ip_hash':e['ip_hash']} for e in wevents if 1379822400 <= e['utc'] < 1379822400 + 3600 ]

json.dump(e24Sep13_10,gzip.open('130924_10.json.gz','w'))
json.dump(e24Sep13_00,gzip.open('130924_00.json.gz','w'))
json.dump(e22Sep13_00,gzip.open('130922_00.json.gz','w'))


# In[20]:

#"galaxies" in the sky
n=10   #number of bins per axis
mu=3   #average number of points per bin
xdata,ydata=n*np.random.random([2,mu*n*n])


# In[21]:

plt.figure(figsize=(6.5,6))
plt.scatter(xdata,ydata);
plt.axis((0,10,0,10))
plt.grid('on');
plt.xticks(range(11))
plt.yticks(range(11));


# In[22]:

n=100
mu=3
data=n*np.random.random([mu*n*n,2])
integerbins= [(int(x),int(y)) for x,y in data]
counts=Counter(integerbins)
countvalues = [counts[(x,y)] for x in range(n) for y in range(n)]
plt.hist(countvalues,np.arange(-.5,3*mu))
p=Poisson(np.arange(15),mu)
yerr=np.sqrt(n*n*p*(1-p))
plt.errorbar(np.arange(15),n*n*p,yerr,fmt='ro')
plt.plot(np.arange(0,15,.1),n*n*Poisson(np.arange(0,15,.1),mu),'r-')
plt.ylim(0,)
plt.xlim(-.5,15);


# The above closely matches a Poisson distribution.
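# To put a rough number on "closely matches", one could run a chi-square goodness-of-fit test of the observed bin occupancies against the Poisson expectation. This is only a sketch, assuming the `countvalues`, `n`, `mu`, and `Poisson` defined above; counts of 10 or more are lumped into a single tail bin so that no expected frequency is too small.

# In[ ]:

from scipy.stats import chisquare
N=n*n      #number of grid squares (trials)
maxm=10    #lump counts >= maxm into one tail bin
observed=np.array([sum(1 for c in countvalues if c==m) for m in range(maxm)]
                  +[sum(1 for c in countvalues if c>=maxm)])
expected=np.array([N*Poisson(m,mu) for m in range(maxm)]
                  +[N*(1-sum(Poisson(m,mu) for m in range(maxm)))])
chi2,pval=chisquare(observed,expected)
print 'chi2 =',round(chi2,1),' p-value =',round(pval,3)
#a p-value that is not small is consistent with the Poisson hypothesis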
# In class I reran the above for values of $n$ equal to 10, 100, and 1000, to show how the number counts fluctuate much more about the Poisson values for smaller values of $n$.

# The effective number of trials is $N=n^2$, since that is the number of squares in the 2d grid where data points are counted. Each number count can be considered a Bernoulli trial with probability of success given by the Poisson probability. For example, the probability of counting 2 data points in a bin is given by $q=p(2)\approx .22$, so the expected value is $Nq$, the variance $Nq(1-q)$, and the relative fractional deviation $\sqrt{Nq(1-q)}\big/Nq = \sqrt{(1-q)/q}\big/n$. This is roughly 2% for $q=.22$ and $n=100$, which is why the agreement for the bin counting 2's looks good. For $n=10$, it becomes 20%, so the numbers visibly fluctuate much more from run to run. A similar estimate can be made for each number count $m=0,1,\ldots$, using $q=p(m)$.
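# As a quick numerical check of the estimate above, the relative fractional deviation $\sqrt{(1-q)/q}\big/n$ can be tabulated for each count value $m$, using $q=p(m)$. A minimal sketch, assuming the `Poisson` function defined earlier:

# In[ ]:

#relative fractional deviation sqrt((1-q)/q)/n for q=p(m), with mean mu=3
mu=3
for n in 10,100,1000:
    devs=[round(np.sqrt((1-Poisson(m,mu))/Poisson(m,mu))/n,3) for m in range(7)]
    print 'n =',n,': deviations for m=0..6:',devs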