from __future__ import division

import datetime as dt
import time

import numpy as np
import pandas as pd
from ggplot import *
%pylab --no-import-all
%matplotlib inline
# output: Using matplotlib backend: Qt4Agg Populating the interactive namespace from numpy and matplotlib
# Collect every *.log test log in the working directory
# (IPython shell capture: `files` is a list of filenames).
files = !ls *.log
# Number of log files found.
len(files)
# output: 63
parse_datetime = lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S %f')
def read_data(filename, n_client, report=False):
    """Load a queue-size log into a DataFrame.

    Expects a headerless CSV whose columns are: date string
    ('YYYY-MM-DD HH:MM:SS'), sub-second field, client id, queue size.

    Parameters
    ----------
    filename : str
        Path to the log file.
    n_client : int
        Number of clients in the test run; used to trim the per-client
        throughput report some logs append at the end.
    report : bool
        True when the log ends with an average-throughput report
        (n_client + 3 trailing lines are dropped).

    Returns
    -------
    pandas.DataFrame
        Columns ['timestamp', 'cid', 'qsize', 'timestamp_with_ms'],
        sorted by the sub-second-resolution timestamp, NaN rows dropped.
    """
    # on_bad_lines='skip' replaces the removed error_bad_lines=False option.
    logfile = pd.read_csv(filename, header=None, on_bad_lines='skip')
    # Combine the date column and the sub-second column into one
    # timestamp; vectorized to_datetime instead of a per-row strptime.
    logfile['timestamp_with_ms'] = pd.to_datetime(
        logfile[0] + ' ' + logfile[1].astype(str),
        format='%Y-%m-%d %H:%M:%S %f')
    logfile = logfile.drop([1], axis=1)
    # DataFrame.sort() was removed from pandas; sort_values is the
    # modern equivalent.
    logfile = logfile.sort_values('timestamp_with_ms').reset_index(drop=True)
    logfile.columns = ['timestamp', 'cid', 'qsize', 'timestamp_with_ms']
    # If an average-throughput report trails the log, drop it
    # (one line per client plus three summary lines).
    if report:
        logfile = logfile[:-(n_client + 3)]
    return logfile.dropna()
def per_second_aggregate(logfile_df):
    """Sum queue sizes per one-second timestamp bucket.

    Parameters
    ----------
    logfile_df : pandas.DataFrame
        Must contain a 'timestamp' column of 'YYYY-MM-DD HH:MM:SS'
        strings and a numeric 'qsize' column.

    Returns
    -------
    pandas.DataFrame
        Columns ['timestamp', 'qsize'], one row per second, with
        'timestamp' converted to datetime.
    """
    # Select 'qsize' before aggregating: summing the whole frame would
    # also try to aggregate non-numeric columns (and wastes work).
    agg_second = logfile_df.groupby('timestamp')['qsize'].sum().reset_index()
    # Vectorized parse instead of a per-row strptime apply.
    agg_second['timestamp'] = pd.to_datetime(agg_second['timestamp'],
                                             format='%Y-%m-%d %H:%M:%S')
    return agg_second
# Load the 12-client test run (this log ends with a throughput report).
logfile = read_data('test_12_10.log', 12, report=True)
agg_second = per_second_aggregate(logfile)
# queue size over time, aggregated per second
# (bare expression so the notebook renders the plot)
ggplot(aes('timestamp', 'qsize'), agg_second) + \
geom_line(linestyle='') + \
geom_point(marker='o', alpha=.5, size=30)
# output: <ggplot: (9826337)>
agg_second.max()
# output: timestamp 2014-01-17 18:28:33 qsize 89 dtype: object
# average processing time
# Sub-second timestamps of consecutive log events.
t = logfile.timestamp_with_ms
# Same series shifted one row down: each event's predecessor timestamp.
t_1 = logfile.timestamp_with_ms.shift()
# Mean gap between consecutive events, in seconds; the first gap is
# NaT, filled with 0 before converting each timedelta to float seconds.
(t - t_1).fillna(0).apply(lambda x: x / np.timedelta64(1, 's')).mean()
# output: 0.96366452304394457
# average throughput per client
filesize = 1.865336  # size of the served file, in MB
# Files served per client id (row count per cid group).
total_served = logfile.groupby('cid').count()['cid']
# Drop the ' _' pseudo-client produced by malformed log lines.
total_served = total_served[total_served.index != ' _']
# Wall-clock duration of the whole test, in seconds.
test_runtime = (logfile.timestamp_with_ms.iloc[-1] - logfile.timestamp_with_ms.iloc[0]).total_seconds()
# Per-client throughput in Mbps: files served * MB each * 8 bits / seconds.
client_tput = total_served * filesize * 8 / test_runtime
# print() call instead of the Python-2-only print statement; the
# comma-separated arguments are joined with single spaces, matching the
# old statement's output exactly.
print(client_tput,
      '\n\naverage: ' + str(client_tput.mean()),
      '\n\npercent>=1.3Mbps: ' + str(len(client_tput[client_tput >= 1.3]) / len(client_tput) * 100))
# output: cid 0 1.311193 1 1.327790 10 1.211609 11 1.244804 2 1.277998 3 1.327790 4 1.344388 5 1.344388 6 1.294596 7 1.261401 8 1.294596 9 1.244804 Name: cid, dtype: float64 average: 1.2904463157 percent>=1.3Mbps: 41.6666666667
# Extract the per-client throughput report lines ("... Mbps") from
# every test_12* log into a scratch file; drop the "average" summary.
!cat test_12* | grep Mbps | grep -v average > tputs
tput = pd.read_csv('tputs', sep=' ', header=None,
names=['date', 'time', 'cid', 'Mbps', 'unit'])
tput = tput.drop(['date', 'time', 'unit'], axis=1)
# Client ids are logged with a trailing colon ("NN:"); strip it.
tput.cid = tput.cid.apply(lambda x: x.strip(':'))
# Distribution of per-client throughputs
# (bare expression so the notebook renders the plot).
ggplot(aes('Mbps'), tput) + \
geom_histogram(binwidth=.02) + \
geom_density()
# output: <ggplot: (9778425)>
# Merge all test_12* logs into one combined log, dropping the
# "=========" separator lines and the throughput report lines.
!cat test_12* | grep -v ================= | grep -v Mbps > all_log.log
logfile = read_data('all_log.log', 13, report=False)
agg_second = per_second_aggregate(logfile)
# queue size over time, aggregated per second
# (bare expression so the notebook renders the plot)
ggplot(aes('timestamp', 'qsize'), agg_second) + \
geom_line(linestyle='') + \
geom_point(marker='o', alpha=.5, size=30)
# output: <ggplot: (9974767)>
# Hour of day (0-23) for each logged event.
hours = logfile.timestamp_with_ms.apply(lambda x: x.hour)
# Mean queue size per hour, rounded. Select 'qsize' before .mean() so
# non-numeric columns (e.g. the 'timestamp' strings) are not aggregated.
logfile.groupby(hours)['qsize'].mean().apply(round)
# output: timestamp_with_ms 0 0 2 0 4 0 6 0 7 0 9 0 11 0 13 0 18 13 19 0 21 0 22 0 Name: qsize, dtype: float64