# Poisson statistics of arXiv.org web-log hits (notebook-style script).
#
# NOTE(review): apart from Poisson(), the array/plot helpers used below
# (arange, hist, plot, xlabel, ylabel, title) are not imported in this file;
# the script presumably runs inside a pylab session (e.g. ``%pylab inline``)
# -- confirm before running it standalone.
import gzip
import json
from collections import Counter
#http://docs.python.org/2/library/time.html
from time import time, mktime, strptime, strftime, localtime

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

import numpy as np
from IPython.display import Image

try:
    # scipy.misc.factorial is gone from current SciPy releases
    from scipy.special import factorial
except ImportError:
    from scipy.misc import factorial


def Poisson(mu=1, M=10, s=1):
    """Return the Poisson weights exp(-mu) * mu**m / m! on a grid of m.

    mu -- mean of the distribution
    M  -- upper end (exclusive) of the grid of m values
    s  -- grid step; the grid is arange(0, M, s)

    Returns a 1-d float array with one entry per grid point.
    """
    m = np.arange(0, M, s)
    # float(mu) keeps mu**m in floating point: with an integer mu the power
    # would be computed in fixed-width integers and overflow silently.
    return np.exp(-mu) * float(mu) ** m / factorial(m)


def _load_entries(path):
    """Return the 'entries' list stored in the gzipped JSON file *path*."""
    with gzip.open(path) as fh:  # closed even if parsing fails
        return json.load(fh)['entries']


#list of events from 24 Sep '13
#each event has a key 'utc' and a key 'ip_hash'
events = _load_entries('130924_usage.json.gz')
#list of events from 22 Sep '13
wevents = _load_entries('130922_usage.json.gz')


def ts2utc(ts):
    """Convert a '%d/%b/%Y:%H:%M:%S %Z' timestring to integer epoch seconds.

    The struct_time is passed through mktime(), i.e. it is interpreted as
    local time.
    """
    return int(mktime(strptime(ts, '%d/%b/%Y:%H:%M:%S %Z')))


def utc2ts(utc):
    """Convert epoch seconds to a '%d/%b/%Y:%H:%M:%S %Z' local timestring."""
    return strftime('%d/%b/%Y:%H:%M:%S %Z', localtime(utc))


# The print calls below take a single pre-formatted string so they run
# on Python 2 and Python 3 alike; the format strings are written to match the
# spacing of the Python 2 print statements they replace.
for ts in '24/Sep/2013:10', '24/Sep/2013:00', '22/Sep/2013:00':
    ts += ':00:00 EDT'
    print('%s \t%s' % (ts, ts2utc(ts)))

t = int(time())
days = t // 86400  # whole days since the epoch
years = int(days / 365.25)
print('%s seconds ago = %s days ago = %s years ago' % (t, days, years))
print('so roughly %s' % (2013 - years))

print('%s \t%s' % (t, utc2ts(t)))
print('  %s \t%s' % (0, utc2ts(0)))

response = urlopen('http://imgs.xkcd.com/comics/bug.png')
try:
    png = response.read()
finally:
    response.close()
# bare expression on purpose: an interactive session displays the image
Image(png)

#create simple lists of utc's for all events in given timeframes
SECONDS_PER_HOUR = 3600
T0_24SEP_10 = 1380031200  # 24 Sep 10:00
T0_24SEP_00 = 1379995200  # 24 Sep 00:00
T0_22SEP_00 = 1379822400  # 22 Sep 00:00


def _utcs_in_hour(entries, start):
    """Return the 'utc' of every entry with start <= utc < start + 1 hour."""
    stop = start + SECONDS_PER_HOUR
    return [e['utc'] for e in entries if start <= e['utc'] < stop]


# 24 Sep 10:00-11:00
t10 = _utcs_in_hour(events, T0_24SEP_10)
# 24 Sep 00:00-01:00
tm = _utcs_in_hour(events, T0_24SEP_00)
# 22 Sep 00:00-01:00
tmw = _utcs_in_hour(wevents, T0_22SEP_00)

#24 Sep 10:00-11:00, 24 Sep 00:00-01:00, 22 Sep 00:00-01:00
print('%s %s %s' % (len(t10), len(tm), len(tmw)))

averages = [round(len(x) / 3600., 3) for x in (t10, tm, tmw)]
print('averages: %s per second' % (averages,))

# hits per second for 24 Sep 00:00-01:00; seconds without any hit do not
# appear in the Counter, so they are added back as explicit zeroes
scounts = list(Counter(tm).values())
zeroes = 3600 - len(scounts)
# bins run to 29.5 to cover the same range as the Poisson overlay; with the
# former upper edge of 19.5 any second with 20+ hits was silently dropped
hist(scounts + [0] * zeroes, arange(-.5, 30), label='data')
plot(3600 * Poisson(6.62, 30), 'r', lw=1.5, label='Poisson (avg=6.62/s)')
xlabel('# web hits / second')
ylabel('#seconds')
title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 24 Sep 2013')
# finish the 24 Sep 00:00-01:00 histogram started just before this point
xticks(range(0, 31, 2))
xlim(-.5)
legend()

# hits per second, 22 Sep 00:00-01:00; seconds without any hit are missing
# from the Counter and are added back as explicit zeroes
scounts = list(Counter(tmw).values())
zeroes = 3600 - len(scounts)
hist(scounts + [0] * zeroes, arange(-.5, 20), label='data')
plot(3600 * Poisson(3.38, 21), 'r', lw=1.5, label='Poisson (avg=3.38/s)')
xlabel('# web hits / second')
ylabel('#seconds')
title('arXiv.org web log data, 3600 seconds from 00:00-01:00 EDT 22 Sep 2013')
xticks(range(21))
xlim(-.5)
legend()

# hits per second, 24 Sep 10:00-11:00
scounts = list(Counter(t10).values())
zeroes = 3600 - len(scounts)
hist(scounts + [0] * zeroes, arange(-.5, 30), label='data')
plot(3600 * Poisson(9.81, 30), 'r', lw=1.5, label='Poisson (avg=9.81/s)')
xlabel('# web hits / second')
ylabel('#seconds')
xticks(range(0, 31, 2))
xlim(-.5)
title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
legend()

# ip_hashes seen in each second of 24 Sep 2013 10:00-11:00, keyed by utc
mhosts = {}
for e in events:
    utc = e['utc']
    # lower bound is inclusive, like the other 10:00-11:00 selections; a
    # strict '<' would drop the first second and count it as empty
    if 1380031200 <= utc < 1380031200 + 3600:  #24 Sep 2013 10:00
        mhosts.setdefault(utc, []).append(e['ip_hash'])

#count distinct hosts / sec
dhostsm = [len(set(hosts)) for hosts in mhosts.values()]
zeroes = 3600 - len(dhostsm)
# bare expression: only an interactive session shows the value; it averages
# over seconds with at least one hit (the zeroes are not included)
mean(dhostsm)

hist(dhostsm + [0] * zeroes, arange(-.5, 30), label='data')
plot(3600 * Poisson(9.22, 30), 'r', lw=1.5, label='Poisson (avg=9.22/s)')
xlabel('# distinct hosts / second')
ylabel('#seconds')
xticks(range(0, 31, 2))
xlim(-.5)
title('arXiv.org web log data, 3600 seconds from 10:00-11:00 EDT 24 Sep 2013')
legend()


def _hour_slice(entries, start):
    """Return {'utc', 'ip_hash'} dicts for entries with start <= utc < start + 3600."""
    stop = start + 3600
    return [{'utc': e['utc'], 'ip_hash': e['ip_hash']}
            for e in entries if start <= e['utc'] < stop]


def _dump_json_gz(obj, path):
    """Write *obj* as JSON to the gzip file *path* and close the file."""
    # json.dumps escapes non-ASCII by default, so the text is pure ASCII and
    # can be written as bytes on both Python 2 and Python 3
    payload = json.dumps(obj).encode('ascii')
    with gzip.open(path, 'wb') as fh:
        fh.write(payload)


#create lists of utc's and ip_hashes for all events in given timeframes
# 24 Sep 10:00-11:00
e24Sep13_10 = _hour_slice(events, 1380031200)
# 24 Sep 00:00-01:00
e24Sep13_00 = _hour_slice(events, 1379995200)
# 22 Sep 00:00-01:00
e22Sep13_00 = _hour_slice(wevents, 1379822400)

_dump_json_gz(e24Sep13_10, '130924_10.json.gz')
_dump_json_gz(e24Sep13_00, '130924_00.json.gz')
_dump_json_gz(e22Sep13_00, '130922_00.json.gz')
#"galaxies" in the sky
# NOTE(review): `random` must provide random.random(shape) returning an
# array (as numpy.random does in a pylab session), not the stdlib module --
# confirm against the environment this runs in.
n = 10   #number of bins per axis
mu = 3   #average number of points per bin
# mu*n*n uniform points in the n x n square, one row of x's and one of y's
xdata, ydata = n * random.random([2, mu * n * n])
figure(figsize=(6.5, 6))
scatter(xdata, ydata)
axis((0, n, 0, n))
grid(True)  # boolean is the documented argument; 'on' only worked as a truthy string
xticks(range(n + 1))
yticks(range(n + 1))

# same experiment with many more cells: count the points per unit cell and
# compare the distribution of those counts with the Poisson prediction
n = 100
mu = 3
kmax = 15  # counts 0 .. kmax-1 are shown for both data and prediction
data = n * random.random([mu * n * n, 2])
integerbins = [(int(x), int(y)) for x, y in data]
counts = Counter(integerbins)
# look up every cell explicitly so that empty cells contribute a 0
countvalues = [counts[(x, y)] for x in range(n) for y in range(n)]
# bins share the overlay's range; with the former upper edge of 3*mu - .5
# cells holding more points than that were dropped from the histogram
hist(countvalues, arange(-.5, kmax))
plot(n * n * Poisson(mu, kmax), 'ro')
plot(arange(0, kmax, .1), n * n * Poisson(mu, kmax, .1), 'r-')
xlim(-.5, kmax)