%matplotlib inline
import bz2
import os
import glob
import re
import pandas as pd
class PathNotExists(Exception):
    """Raised when a requested node does not exist."""
class Node(dict):
    """
    Entry of a directory structure that stores the cumulated size of all
    its sub-nodes.

    A node is a dict that maps child names to child ``Node`` objects.
    Indexing with an unknown name (``node[name]``) creates and stores that
    child through ``__missing__``; ``dict.get`` and ``get_node`` never create
    children.
    """

    def __init__(self, name, size=0, replica_size=0):
        """
        :param name: name of this entry
        :param size: initial cumulated size
        :param replica_size: initial cumulated replica size
        """
        super(Node, self).__init__()
        self.name = name
        self.size = size
        self.replica_size = replica_size

    def add_size(self, size):
        """Add ``size`` to the size accumulated on this node."""
        self.size += size

    def add_replica_size(self, replica_size):
        """Add ``replica_size`` to the replica size accumulated on this node."""
        self.replica_size += replica_size

    def get_node(self, relative_path):
        """
        Return the node reached by following ``relative_path`` from this node.

        The path is split on ``os.sep``; empty components (leading, trailing
        or doubled separators) are ignored, so an empty path returns this
        node itself.

        :param relative_path: path of child names relative to this node
        :return: the node found at that path
        :raises PathNotExists: if a component of the path has no node
        """
        node = self
        for name in relative_path.split(os.sep):
            if not name:
                continue
            # dict.get does not trigger __missing__, so lookups never create nodes.
            node = node.get(name)
            # Compare with None explicitly: an existing node without children
            # is an empty dict and would be falsy in a plain truth test.
            if node is None:
                raise PathNotExists(relative_path)
        return node

    def __missing__(self, key):
        """Create, store and return a new child node for an unknown ``key``."""
        new_node = Node(name=key)
        self[key] = new_node
        return new_node
def get_space_usage_chimera(chimera_dump):
    """
    Build a tree of cumulated file and replica sizes from a chimera dump.

    The bz2-compressed dump is read line by line. A line containing ``/pnfs``
    becomes the path prefix of the entries that follow it. Every other
    non-blank line is split into six tab-separated fields (lfn, pnfs id,
    checksum, size, access time, pool list). The size of an entry, and its
    size multiplied by the number of additional pools, is added to the root
    node and to every node along ``<prefix>/<lfn>``.

    :param chimera_dump: path of the bz2-compressed dump file
    :return: the root ``Node`` of the resulting tree
    :raises ValueError: if an entry line does not consist of six fields or
        its size field is not an integer
    """
    def account(node, size, replica_size):
        # Book one entry on a single node of the tree.
        node.add_size(size)
        node.add_replica_size(replica_size)

    root = Node('root')
    # Entries seen before any '/pnfs' line get an empty prefix instead of
    # failing on an unbound variable.
    prefix = ''
    with bz2.BZ2File(chimera_dump, 'r') as dump:
        for line in dump:
            # On Python 3 BZ2File yields bytes; on Python 2 this is a no-op.
            if not isinstance(line, str):
                line = line.decode('utf-8')
            # Drop the line terminator so that it does not end up inside the
            # prefix (and thereby inside node names) or the pool field.
            line = line.rstrip('\r\n')
            if not line:
                continue
            if '/pnfs' in line:
                prefix = line
                continue
            lfn, _pnfs_id, _chksum, size, _access_time, pool = line.split('\t')
            # int() handles arbitrarily large values on Python 2 and 3.
            size = int(size)
            # Every pool beyond the first one counts as a replica; this is 0
            # when the entry lives on a single pool.
            replica_size = (len(pool.split(',')) - 1) * size
            current_node = root
            account(current_node, size, replica_size)
            for name in ("%s/%s" % (prefix, lfn)).split(os.sep):
                if name:
                    current_node = current_node[name]
                    account(current_node, size, replica_size)
    return root
def convert_to_tb(bytes):
    """
    Convert a size in bytes to terabytes (1 TB = 1000**4 bytes).

    The division is always a float division, so integer input is not
    truncated under Python 2.

    :param bytes: size in bytes; the name shadows the builtin but is kept so
        that keyword calls keep working
    :return: size in terabytes
    """
    return bytes / 1000.0 / 1000.0 / 1000.0 / 1000.0
def read_cached_values(filename):
    """
    Read a cached history CSV and strip its derived total columns.

    The 'date' column is parsed as dates. The columns 'total_size_ddm' and
    'total_size_non_ddm' are removed when present.

    :param filename: path of the CSV file
    :return: the cached data as DataFrame, or an empty DataFrame if the file
        cannot be read (IOError)
    """
    try:
        data_frame = pd.read_csv(filename, parse_dates=['date'])
    except IOError:
        # A missing cache is expected, e.g. on the first run.
        return pd.DataFrame()
    # Returning from a ``finally`` clause would silently swallow every other
    # error, so the result is returned on the normal path and only the
    # columns that actually exist are dropped.
    derived_columns = [column
                       for column in ('total_size_ddm', 'total_size_non_ddm')
                       if column in data_frame.columns]
    return data_frame.drop(derived_columns, axis=1)
def get_phedex_subscription(date):
    """Placeholder that is still to be implemented; returns None for now."""
    return None
# Read cached values
my_space_history = read_cached_values('history_space_data.csv')
my_replica_history = read_cached_values('history_replica_data.csv')

timestamp_re = re.compile('[0-9]+')
space_history = list()
replica_history = list()


def _has_date(history, date):
    """Return True if ``history`` has a 'date' column that contains ``date``."""
    # An empty history (no cache file yet) has no 'date' column at all.
    return 'date' in history.columns and bool((history['date'] == date).any())


# Analyse every dump in the current directory that is not cached yet; the
# names are sorted so that the processing order does not depend on glob.
for chimera_dump in sorted(map(os.path.basename, glob.glob(os.path.join(os.curdir, '*.bz2')))):
    timestamp = timestamp_re.search(chimera_dump)
    if timestamp is None:
        # The date of the dump is taken from the first run of digits in its
        # name; without one the file cannot be placed in the history.
        print("%s has no timestamp in its name, skipped!" % chimera_dump)
        continue
    date_of_dump = pd.to_datetime(timestamp.group(0))
    space_is_cached = _has_date(my_space_history, date_of_dump)
    replica_is_cached = _has_date(my_replica_history, date_of_dump)
    if space_is_cached and replica_is_cached:
        print("%s already exists!" % date_of_dump)
        continue
    root = get_space_usage_chimera(chimera_dump)
    directory_space = dict()
    replica_space = dict()
    for directory, node in root.get_node('/pnfs/gridka.de/cms/disk-only/store').items():
        key = directory.strip()
        # Different child names may collapse to the same key once stripped;
        # accumulate instead of overwriting so that no size is lost.
        directory_space[key] = directory_space.get(key, 0.0) + convert_to_tb(float(node.size))
        replica_space[key] = replica_space.get(key, 0.0) + convert_to_tb(float(node.replica_size))
    directory_space['date'] = date_of_dump
    replica_space['date'] = date_of_dump
    # Only extend the history that lacks this date, otherwise a dump cached
    # in just one of the two files would produce a duplicate row in it.
    if not space_is_cached:
        space_history.append(directory_space)
    if not replica_is_cached:
        replica_history.append(replica_space)
2016-04-07 07:05:00 already exists! 2016-04-08 07:05:00 already exists! 2016-04-09 07:05:00 already exists! 2016-04-10 07:05:00 already exists! 2016-04-11 07:05:00 already exists! 2016-04-12 07:05:00 already exists! 2016-04-13 07:05:00 already exists! 2016-04-14 07:05:00 already exists! 2016-04-15 07:05:00 already exists! 2016-04-16 07:05:00 already exists! 2016-04-17 07:05:00 already exists! 2016-04-18 07:05:00 already exists! 2016-04-19 07:05:00 already exists! 2016-04-20 07:05:00 already exists! 2016-04-21 07:05:00 already exists! 2016-04-22 07:05:00 already exists! 2016-04-23 07:05:00 already exists! 2016-04-24 07:05:00 already exists! 2016-04-25 07:05:00 already exists! 2016-04-26 07:05:00 already exists! 2016-04-27 07:05:00 already exists! 2016-04-28 07:05:00 already exists! 2016-04-29 07:05:00 already exists! 2016-04-30 07:05:00 already exists! 2016-05-01 07:05:00 already exists! 2016-05-02 07:05:00 already exists! 2016-05-03 07:05:00 already exists! 2016-05-04 07:05:00 already exists! 2016-05-05 07:05:00 already exists! 2016-05-06 07:05:00 already exists! 2016-05-07 07:05:00 already exists! 2016-05-08 07:05:00 already exists! 2016-05-09 07:05:00 already exists! 2016-05-10 07:05:00 already exists! 2016-05-11 07:05:00 already exists! 2016-05-12 07:05:00 already exists! 2016-05-13 07:05:00 already exists! 2016-05-14 07:05:00 already exists! 2016-05-15 07:05:00 already exists! 2016-05-16 07:05:00 already exists! 2016-05-17 07:05:00 already exists! 2016-05-18 07:05:00 already exists! 2016-05-19 07:05:00 already exists! 2016-05-20 07:05:00 already exists! 2016-05-21 07:05:00 already exists! 2016-05-22 07:05:00 already exists! 2016-05-23 07:05:00 already exists! 2016-05-24 07:05:00 already exists! 2016-05-25 07:05:00 already exists! 2016-05-26 07:05:00 already exists! 2016-05-27 07:05:00 already exists! 2016-05-28 07:05:00 already exists! 2016-05-29 07:05:00 already exists! 2016-05-30 07:05:00 already exists! 2016-05-31 07:05:00 already exists! 
2016-06-01 07:05:00 already exists! 2016-06-02 07:05:00 already exists! 2016-06-03 07:05:00 already exists! 2016-06-04 07:05:00 already exists! 2016-06-05 07:05:00 already exists! 2016-06-06 07:05:00 already exists! 2016-06-07 07:05:00 already exists! 2016-06-08 07:05:00 already exists! 2016-06-09 07:05:00 already exists! 2016-06-10 07:05:00 already exists! 2016-06-11 07:05:00 already exists! 2016-06-12 07:05:00 already exists! 2016-06-13 07:05:00 already exists! 2016-06-14 07:05:00 already exists! 2016-06-17 07:05:00 already exists!
# Merge the freshly analysed dumps into the cached histories. pd.concat is
# used because DataFrame.append is deprecated and removed in pandas 2.0.
if space_history:
    my_space_history = pd.concat([my_space_history, pd.DataFrame(space_history)], ignore_index=True)
if replica_history:
    my_replica_history = pd.concat([my_replica_history, pd.DataFrame(replica_history)], ignore_index=True)
my_space_history.set_index('date', inplace=True)
my_replica_history.set_index('date', inplace=True)
# Cached and new rows are not necessarily in chronological order.
my_space_history.sort_index(inplace=True)
my_replica_history.sort_index(inplace=True)

# Split the columns into the DDM directories and everything else, then add
# one total column per group to both histories.
ddm_columns = ['mc', 'data', 'himc', 'hidata', 'results']
non_ddm_columns = [column for column in my_space_history.columns if column not in ddm_columns]
my_space_history['total_size_ddm'] = my_space_history[ddm_columns].sum(axis=1)
my_space_history['total_size_non_ddm'] = my_space_history[non_ddm_columns].sum(axis=1)
my_replica_history['total_size_ddm'] = my_replica_history[ddm_columns].sum(axis=1)
my_replica_history['total_size_non_ddm'] = my_replica_history[non_ddm_columns].sum(axis=1)
# The totals are appended only after they were computed, so neither total
# is included in its own sum.
ddm_columns.append('total_size_ddm')
non_ddm_columns.append('total_size_non_ddm')

# Maximum per calendar month. The ``how=`` keyword of resample is deprecated
# since pandas 0.18 and was removed later; calling the aggregation on the
# resampler is its replacement.
monthly_space_history = my_space_history.resample('M').max()
monthly_replica_history = my_replica_history.resample('M').max()
# Display both total columns of the monthly space history.
monthly_space_history[
    ['total_size_ddm', 'total_size_non_ddm']
]
total_size_ddm | total_size_non_ddm | |
---|---|---|
date | ||
2016-04-30 | 1823.753441 | 35.614994 |
2016-05-31 | 1844.585341 | 39.611763 |
2016-06-30 | 1883.914916 | 32.920462 |
2016-07-31 | 2295.816447 | 54.253320 |
# Display both total columns of the monthly replica history.
monthly_replica_history[
    ['total_size_ddm', 'total_size_non_ddm']
]
total_size_ddm | total_size_non_ddm | |
---|---|---|
date | ||
2016-04-30 | 0.282364 | 20.339355 |
2016-05-31 | 0.321294 | 20.339368 |
2016-06-30 | 0.317070 | 20.339355 |
2016-07-31 | 0.309168 | 20.339355 |
# Bar chart of the monthly space history, restricted to the non-DDM columns.
ax = monthly_space_history[non_ddm_columns].plot(
    title="Max. size of Non-DDM directories within one month",
    kind='bar',
    figsize=(15, 15),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x2a5e53590>
# Bar chart of the monthly replica history, restricted to the non-DDM columns.
ax = monthly_replica_history[non_ddm_columns].plot(
    title="Max. replica size of Non-DDM directories within one month",
    kind='bar',
    figsize=(15, 15),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x27b131190>
# Bar chart of the monthly space history, restricted to the DDM columns.
ax = monthly_space_history[ddm_columns].plot(
    title="Max. size of DDM directories within one month",
    kind='bar',
    figsize=(15, 15),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x2a9d4a890>
# Bar chart of the monthly replica history, restricted to the DDM columns.
ax = monthly_replica_history[ddm_columns].plot(
    title="Max. replica size of DDM directories within one month",
    kind='bar',
    figsize=(15, 15),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x27e547710>
# Plot of the full space history, restricted to the DDM columns.
ax = my_space_history[ddm_columns].plot(
    title="Size of ddm directories",
    figsize=(10, 10),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x2b36fc850>
# Plot of the full replica history, restricted to the DDM columns.
ax = my_replica_history[ddm_columns].plot(
    title="Replica size of ddm directories",
    figsize=(10, 10),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x2b36fc7d0>
# Plot of the full space history, restricted to the non-DDM columns.
ax = my_space_history[non_ddm_columns].plot(
    title="Size of non-ddm directories",
    figsize=(10, 10),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x293d11650>
# Plot of the full replica history, restricted to the non-DDM columns.
ax = my_replica_history[non_ddm_columns].plot(
    title="Replica size of non-ddm directories",
    figsize=(10, 10),
)
ax.set_xlabel('date')
ax.set_ylabel('Size [TB]')
<matplotlib.text.Text at 0x28b5ed090>
# Write both histories back to the CSV files they were read from.
my_space_history.to_csv(path_or_buf='history_space_data.csv')
my_replica_history.to_csv(path_or_buf='history_replica_data.csv')