import urllib2 import pandas from collections import Counter #data = urllib2.urlopen("http://stats202.com/stats202log.txt").readlines() data = open('/Users/antigen/Downloads/logs').readlines() list_data=[] for item in data: list_data.append(item.split()) df = pandas.DataFrame(list_data) df top_10 = Counter(df[0]).most_common(10) ip=[] count=[] for key, value in top_10: ip.append(key) count.append(value) ts = pandas.Series(count, ip) ts.plot(kind="barh") status_code=[] code_counts=[] status_codes = Counter(df[10]).most_common(10) for key, value in status_codes: status_code.append(key) code_counts.append(value) codes = pandas.Series(code_counts, status_code) codes.plot(kind="barh") status_code=[] code_counts=[] status_codes = Counter(df[8]).most_common(10) for key, value in status_codes: status_code.append(key) code_counts.append(value) codes = pandas.Series(code_counts, status_code) codes.plot(kind="barh") status_code=[] code_counts=[] status_codes = Counter(df[6]).most_common(10) for key, value in status_codes: status_code.append(key) code_counts.append(value) codes = pandas.Series(code_counts, status_code) codes.plot(kind="barh") status_code=[] code_counts=[] status_codes = Counter(df[13]).most_common(10) for key, value in status_codes: status_code.append(key) code_counts.append(value) codes = pandas.Series(code_counts, status_code) codes.plot(kind="barh") status_code=[] code_counts=[] status_codes = Counter(df[14]).most_common(10) for key, value in status_codes: status_code.append(key) code_counts.append(value) codes = pandas.Series(code_counts, status_code) codes.plot(kind="barh") # http://memojo.com/~sgala/blog/2007/09/29/Python-Erlang-Map-Reduce import re from collections import defaultdict import time, sys timer = time.time FILENAME = "/Users/antigen/Downloads/o1000k.ap" t0, t1 = timer(), time.clock() matches = (re.search(r"GET /ongoing/When/\d\d\dx/(\d\d\d\d/\d\d/\d\d/[^ .]+) ",line) for line in open(FILENAME)) mapp = (match.groups()[0] for match in matches if match) count=defaultdict(int) for page in mapp: count[page] +=1 for key in sorted(count.keys(), key=count.get)[-10:]: pass # print "%40s = %s" % (key, count[key]) print timer() - t0, time.clock() - t1 for key in sorted(count.keys(), key=count.get)[-10:]: print "%40s = %s" % (key, count[key]) # a slightly optimized version of Santiago Gala's original Python # implementation. see: # http://memojo.com/~sgala/blog/2007/09/29/Python-Erlang-Map-Reduce import re from collections import defaultdict FILE = "/Users/antigen/Downloads/o1000k.ap" import time, sys if sys.platform == "win32": timer = time.clock else: timer = time.time t0, t1 = timer(), time.clock() pat = re.compile(r"GET /ongoing/When/\d\d\dx/(\d\d\d\d/\d\d/\d\d/[^ .]+) ") search = pat.search # map matches = (search(line) for line in open(FILE, "rb") if "GET /ongoing/When" in line) mapp = (match.group(1) for match in matches if match) # reduce count = defaultdict(int) for page in mapp: count[page] +=1 for key in sorted(count, key=count.get)[:10]: pass # print "%40s = %s" % (key, count[key]) print timer() - t0, time.clock() - t1 # sanity check for key in sorted(count, key=count.get)[-10:]: print "%40s = %s" % (key, count[key]) # chunked access import re from collections import defaultdict #FILE = "o1000k.ap" def getchunks(file, size=1024*1024): # scan a file, and yield sequence of (start, size) chunk descriptors # where all chunks contain full lines f = open(file, "rb") while 1: start = f.tell() f.seek(size, 1) s = f.readline() # skip forward to next line ending yield start, f.tell() - start if not s: break if 0: def process(file, chunk): # collect statistics for a chunk (process lines) f = open(file, "rb") f.seek(chunk[0]) d = defaultdict(int) search = pat.search for line in f.read(chunk[1]).splitlines(): if "GET /ongoing/When" in line: m = search(line) if m: d[m.group(1)] += 1 return d else: def process(file, chunk): # collect statistics for a chunk (process entire chunk) f = open(file, "rb") f.seek(chunk[0]) d = defaultdict(int) s = f.read(chunk[1]) for page in pat.findall(s): d[page] += 1 return d # -------------------------------------------------------------------- # main program import time, sys if sys.platform == "win32": timer = time.clock else: timer = time.time t0, t1 = timer(), time.clock() pat = re.compile(r"GET /ongoing/When/\d\d\dx/(\d\d\d\d/\d\d/\d\d/[^ .]+) ") count = defaultdict(int) for chunk in getchunks(FILE): for key, value in process(FILE, chunk).items(): count[key] += value for key in sorted(count, key=count.get)[:10]: pass # print "%40s = %s" % (key, count[key]) print timer() - t0, time.clock() - t1 # -------------------------------------------------------------------- for key in sorted(count, key=count.get)[-10:]: print "%40s = %s" % (key, count[key]) import re from collections import defaultdict, namedtuple format_pat= re.compile( r"(?P[\d\.]+)\s" r"(?P\S*)\s" r"(?P\S*)\s" r"\[(?P