from __future__ import division

import datetime as dt
import time

import numpy as np
import pandas as pd
from ggplot import *
%pylab --no-import-all
%matplotlib inline
# output: Using matplotlib backend: Qt4Agg Populating the interactive namespace from numpy and matplotlib
# Collect every *.log test log in the working directory
# (IPython shell capture: `files` is a list of filenames).
files = !ls *.log
# Number of log files found.
len(files)
# output: 63
parse_datetime = lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S %f')
def read_data(filename, n_client, report=False):
    """Load a queue-size log into a DataFrame.

    Expects a headerless CSV whose columns are: date string
    ('YYYY-MM-DD HH:MM:SS'), sub-second field, client id, queue size.

    Parameters
    ----------
    filename : str
        Path to the log file.
    n_client : int
        Number of clients in the test run; used to trim the per-client
        throughput report some logs append at the end.
    report : bool
        True when the log ends with an average-throughput report
        (n_client + 3 trailing lines are dropped).

    Returns
    -------
    pandas.DataFrame
        Columns ['timestamp', 'cid', 'qsize', 'timestamp_with_ms'],
        sorted by the sub-second-resolution timestamp, NaN rows dropped.
    """
    # on_bad_lines='skip' replaces the removed error_bad_lines=False option.
    logfile = pd.read_csv(filename, header=None, on_bad_lines='skip')
    # Combine the date column and the sub-second column into one
    # timestamp; vectorized to_datetime instead of a per-row strptime.
    logfile['timestamp_with_ms'] = pd.to_datetime(
        logfile[0] + ' ' + logfile[1].astype(str),
        format='%Y-%m-%d %H:%M:%S %f')
    logfile = logfile.drop([1], axis=1)
    # DataFrame.sort() was removed from pandas; sort_values is the
    # modern equivalent.
    logfile = logfile.sort_values('timestamp_with_ms').reset_index(drop=True)
    logfile.columns = ['timestamp', 'cid', 'qsize', 'timestamp_with_ms']
    # If an average-throughput report trails the log, drop it
    # (one line per client plus three summary lines).
    if report:
        logfile = logfile[:-(n_client + 3)]
    return logfile.dropna()
def per_second_aggregate(logfile_df):
    """Sum queue sizes per one-second timestamp bucket.

    Parameters
    ----------
    logfile_df : pandas.DataFrame
        Must contain a 'timestamp' column of 'YYYY-MM-DD HH:MM:SS'
        strings and a numeric 'qsize' column.

    Returns
    -------
    pandas.DataFrame
        Columns ['timestamp', 'qsize'], one row per second, with
        'timestamp' converted to datetime.
    """
    # Select 'qsize' before aggregating: summing the whole frame would
    # also try to aggregate non-numeric columns (and wastes work).
    agg_second = logfile_df.groupby('timestamp')['qsize'].sum().reset_index()
    # Vectorized parse instead of a per-row strptime apply.
    agg_second['timestamp'] = pd.to_datetime(agg_second['timestamp'],
                                             format='%Y-%m-%d %H:%M:%S')
    return agg_second
# Load the 12-client test run (this log ends with a throughput report).
logfile = read_data('test_12_10.log', 12, report=True)
agg_second = per_second_aggregate(logfile)
# queue size over time, aggregated per second
# (bare expression so the notebook renders the plot)
ggplot(aes('timestamp', 'qsize'), agg_second) + \
geom_line(linestyle='') + \
geom_point(marker='o', alpha=.5, size=30)
# output: <ggplot: (9826337)>
agg_second.max()
# output: timestamp 2014-01-17 18:28:33 qsize 89 dtype: object
# average processing time
# Sub-second timestamps of consecutive log events.
t = logfile.timestamp_with_ms
# Same series shifted one row down: each event's predecessor timestamp.
t_1 = logfile.timestamp_with_ms.shift()
# Mean gap between consecutive events, in seconds; the first gap is
# NaT, filled with 0 before converting each timedelta to float seconds.
(t - t_1).fillna(0).apply(lambda x: x / np.timedelta64(1, 's')).mean()
# output: 0.96366452304394457
# average throughput per client
filesize = 1.865336  # size of the served file, in MB
# Files served per client id (row count per cid group).
total_served = logfile.groupby('cid').count()['cid']
# Drop the ' _' pseudo-client produced by malformed log lines.
total_served = total_served[total_served.index != ' _']
# Wall-clock duration of the whole test, in seconds.
test_runtime = (logfile.timestamp_with_ms.iloc[-1] - logfile.timestamp_with_ms.iloc[0]).total_seconds()
# Per-client throughput in Mbps: files served * MB each * 8 bits / seconds.
client_tput = total_served * filesize * 8 / test_runtime
# print() call instead of the Python-2-only print statement; the
# comma-separated arguments are joined with single spaces, matching the
# old statement's output exactly.
print(client_tput,
      '\n\naverage: ' + str(client_tput.mean()),
      '\n\npercent>=1.3Mbps: ' + str(len(client_tput[client_tput >= 1.3]) / len(client_tput) * 100))
# output: cid 0 1.311193 1 1.327790 10 1.211609 11 1.244804 2 1.277998 3 1.327790 4 1.344388 5 1.344388 6 1.294596 7 1.261401 8 1.294596 9 1.244804 Name: cid, dtype: float64 average: 1.2904463157 percent>=1.3Mbps: 41.6666666667
# Extract the per-client throughput report lines ("... Mbps") from
# every test_12* log into a scratch file; drop the "average" summary.
!cat test_12* | grep Mbps | grep -v average > tputs
tput = pd.read_csv('tputs', sep=' ', header=None,
names=['date', 'time', 'cid', 'Mbps', 'unit'])
tput = tput.drop(['date', 'time', 'unit'], axis=1)
# Client ids are logged with a trailing colon ("NN:"); strip it.
tput.cid = tput.cid.apply(lambda x: x.strip(':'))
# Distribution of per-client throughputs
# (bare expression so the notebook renders the plot).
ggplot(aes('Mbps'), tput) + \
geom_histogram(binwidth=.02) + \
geom_density()
# output: <ggplot: (9778425)>
# Merge all test_12* logs into one combined log, dropping the
# "=========" separator lines and the throughput report lines.
!cat test_12* | grep -v ================= | grep -v Mbps > all_log.log
logfile = read_data('all_log.log', 13, report=False)
agg_second = per_second_aggregate(logfile)
# queue size over time, aggregated per second
# (bare expression so the notebook renders the plot)
ggplot(aes('timestamp', 'qsize'), agg_second) + \
geom_line(linestyle='') + \
geom_point(marker='o', alpha=.5, size=30)
# output: <ggplot: (9974767)>
# Hour of day (0-23) for each logged event.
hours = logfile.timestamp_with_ms.apply(lambda x: x.hour)
# Mean queue size per hour, rounded. Select 'qsize' before .mean() so
# non-numeric columns (e.g. the 'timestamp' strings) are not aggregated.
logfile.groupby(hours)['qsize'].mean().apply(round)
# output: timestamp_with_ms 0 0 2 0 4 0 6 0 7 0 9 0 11 0 13 0 18 13 19 0 21 0 22 0 Name: qsize, dtype: float64