# modified version of
# http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#example-svm-plot-iris-py
import os
import json
import pickle
import pprint
import numbers
import itertools
from functools import partial
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn import metrics
import joblib
from naklar import experiment
# record these metrics for each experiment
metric_fns = [('precision', partial(metrics.precision_score, average='micro')),
              ('recall', partial(metrics.recall_score, average='micro')),
              ('f1', partial(metrics.f1_score, average='micro')),
              ('accuracy', metrics.accuracy_score)]
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target
def evaluate_settings(i, output='results', record_metrics=[],
                      dump=['pickle', 'json', 'joblib'], **kwargs):
    """Evaluate an experimental condition and dump results onto disk."""
    svc = svm.SVC(**kwargs).fit(X, y)
    pred = svc.predict(X)

    conf = kwargs
    conf['i'] = i
    for metric_name, metric in record_metrics:
        conf[metric_name] = metric(y, pred)

    # store results on disk
    conf['results'] = pred  # results is a NumPy array
    if not os.path.exists(os.path.join('.', output, str(i))):
        os.makedirs(os.path.join('.', output, str(i)))

    # dump a pickle file
    if 'pickle' in dump:
        conf['path'] = os.path.join(output, str(i))
        conf_file = os.path.join(output, str(i), 'conf.pkl')
        with open(conf_file, 'wb') as out:
            pickle.dump(conf, out)

    # dump a json file
    if 'json' in dump:
        conf_file = os.path.join(output, str(i), 'conf.json')
        with open(conf_file, 'w') as out:
            conf_ = {k: v for k, v in conf.items()
                     if isinstance(v, (str, numbers.Number, list, set))}
            json.dump(conf_, out)

    # dump a joblib file
    # the nice thing about joblib is that it'll dump large NumPy arrays
    # as separate files
    if 'joblib' in dump:
        conf_file = os.path.join(output, str(i), 'conf.jbl')
        joblib.dump(conf, conf_file)

    # store the trained model as well
    with open(os.path.join(output, str(i), 'svc.skl'), 'wb') as out:
        pickle.dump(svc, out)
kernels = ['linear', 'rbf', 'poly']
Cs = [0.001, 0.1, 1, 2]
# could also build the parameter grid with sklearn.model_selection.ParameterGrid
jobs = itertools.product(kernels, Cs)
for i, (kernel, C) in enumerate(jobs):
    evaluate_settings(i, record_metrics=metric_fns, kernel=kernel, C=C)
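Running the loop produces one directory per experimental condition (12 in this example), each holding the dumped settings and the pickled model, roughly:

results/
├── 0/
│   ├── conf.pkl
│   ├── conf.json
│   ├── conf.jbl    (plus any auxiliary files joblib writes for large arrays)
│   └── svc.skl
├── 1/
│   └── ...
└── 11/
    └── ...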
The entire purpose of naklar is to make it easy to load experiment results and to select certain parts of them for analysis. The motivation comes from personal experience with hyperparameter optimisation, different preprocessing pipelines, cross-validation and so on.

It's a good idea to store results from experiments on disk and then load them up separately for analysis, as opposed to doing the analysis as part of the experiment and outputting only figures. Experiments are normally not interactive and take a long time to run, while the analysis tends to be interactive and relatively quick. A simple experiment (such as the one above) can easily turn into hundreds of separate output directories on disk. Traversing all of these file paths over and over with for loops is tedious, error prone and downright annoying.

This is what naklar helps you with. The only assumption naklar makes is that result sets are separated into one directory per experimental condition and that there is a dictionary in that directory detailing the settings of that one condition. Assuming there is a host of experiment results under ./results/ you can load all of them up easily.
from naklar import experiment

# experiment.reset()  # if you've loaded results into memory before you have to reset the DB
experiment.initialise('./results/', autoload=True)
This traverses the ./results/ directory, descending into all subdirectories, and loads the settings from a pickled Python dictionary called conf.pkl. If you've named the settings file something else, use the dict_file parameter.
# experiment.reset()
experiment.initialise('./results/', autoload=True, dict_file='settings.pickle')
Calling experiment.initialise creates a sqlalchemy table definition out of all the (key, value) pairs found in the dictionaries. The data types are reflected in the order [DateTime, Float, Integer, Boolean, String], so if a parameter does not map onto any other SQL type a string is used; if the data cannot be represented as a string (for instance a NumPy array) an error is raised. For the example above, i becomes an Integer column, C a Float and kernel a String.
By default experiment.initialise creates an in-memory SQLite database to store the results in. Other backends can be used by calling experiment.connect before calling experiment.initialise; the .connect function passes all of its parameters (*args and **kwargs) on to sqlalchemy.create_engine (http://docs.sqlalchemy.org/en/rel_1_0/core/engines.html). Alternatively the connect parameters can be passed to .initialise as **kwargs.
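For example, a sketch of using a file-backed SQLite database instead of the in-memory default; the file name is hypothetical, and any URL accepted by sqlalchemy.create_engine should work since .connect simply forwards its arguments:

from naklar import experiment

# persist the experiment table to a file so it survives between analysis sessions
experiment.connect('sqlite:///experiments.db')
experiment.initialise('./results/', autoload=True)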
Setting autoload=True tells naklar that, in addition to traversing the entire directory tree under ./results and creating a database schema out of the dictionaries found, it should also load all of those dictionaries into the newly created database table.
naklar is not limited to pickled Python dictionaries either; it can also load other file types, such as json or joblib dumps, by providing a load_func. The provided load_func callable is called once per matching file path in any subdirectory of the root directory, with the appropriate file path passed to the function, and it should return a Python dictionary.
import json
from functools import partial
import joblib

def load_json(fpath):
    with open(fpath, 'r') as fh:
        conf = json.load(fh)
    return conf

# load json
experiment.reset()
experiment.initialise('./results/', autoload=True, dict_file='conf.json', load_func=load_json)
experiment.select('i', 'path', 'f1')
[(0, 'results/0', 0.76), (1, 'results/1', 0.8000000000000002), (10, 'results/10', 0.8133333333333334), (11, 'results/11', 0.82), (2, 'results/2', 0.82), (3, 'results/3', 0.82), (4, 'results/4', 0.8066666666666665), (5, 'results/5', 0.82), (6, 'results/6', 0.8266666666666667), (7, 'results/7', 0.8266666666666667), (8, 'results/8', 0.8066666666666665), (9, 'results/9', 0.82)]
joblib dumps are slightly more complicated as they can contain numpy arrays. It's a good idea to memory-map the numpy arrays when loading, as they can easily take up a lot of memory; however, the memory-mapped arrays have to be closed explicitly. The helper function naklar.util.load_joblib replaces numpy array values with their file path and makes sure the memory-mapped array is closed.
# load joblib
# it's usually a good idea to use memory mapping in case joblib has dumped
# large numpy arrays into the dictionary
import naklar.util

experiment.reset()
experiment.initialise('./results/', autoload=True, dict_file='conf.jbl',
                      load_func=naklar.util.load_joblib)
experiment.select('i', 'path', 'f1')
[(0, 'results/0', '0.76'), (1, 'results/1', '0.8'), (10, 'results/10', '0.813333333333333'), (11, 'results/11', '0.82'), (2, 'results/2', '0.82'), (3, 'results/3', '0.82'), (4, 'results/4', '0.806666666666667'), (5, 'results/5', '0.82'), (6, 'results/6', '0.826666666666667'), (7, 'results/7', '0.826666666666667'), (8, 'results/8', '0.806666666666667'), (9, 'results/9', '0.82')]
If some of the parameters need custom getter code (setters are somewhat untested), it can be provided to .initialise. One use case for this is path translation between different file systems: if the experiments are run on a separate computing cluster and the results are then downloaded onto a desktop for analysis, the path references will differ. These can be automatically translated using a custom getter.
from functools import partial
from naklar import decorator

# translate paths recorded on the cluster to their location on this machine
home_path = partial(decorator.translate_path,
                    ptrn='/home/me/results/',
                    replace='/usr/local/scratch/results/')

experiment.initialise(root_dir='./results', autoload=True,
                      decorators={'output': (home_path,)})
The custom getters and setters are passed to .initialise as a dictionary: the keys should match those defined in the settings dictionary and each value needs to be a 1- or 2-tuple (a 1-tuple is getter only, a 2-tuple is getter and setter). The get/set callable should take at least one parameter, self, and return the altered value, also in the case of a setter.
It is also possible to add parameters to the experiment table/object using decorators, for instance a timestamp parsed from the experiment file path. These parameters are also defined in the decorators dictionary and they become extra columns in the table definition.
Consider a settings dictionary that contains an output field which is the output file path for the experiment:
/home/user/results/exp.39034.21102015141054
/home/user/results/exp.89045.21102015113236
where the last part is a timestamp of when the experiment was started. This timestamp can be added to the table definition by defining only a setter decorator; however, since naklar has no way of knowing what the data type of this parameter is, a column definition must also be provided.
import pandas as pd
import sqlalchemy

def set_startdate(self, v):
    # parse the trailing DDMMYYYYHHMMSS timestamp from the output path
    pth = self.output
    ts = pth[-14:]
    ts = (ts[:2], ts[2:4], ts[4:8], ts[8:10], ts[10:12], ts[12:])
    ts = pd.to_datetime('{}.{}.{} {}:{}:{}'.format(*ts))
    return ts

startdate = (None, set_startdate, sqlalchemy.Column('startdate', sqlalchemy.DateTime))
experiment.initialise(root_dir='./results', autoload=True, decorators={'startdate': startdate})
The first value of the decorator tuple is None as this corresponds to the getter method. Since we don't want the getter to do anything special (the default getter that just returns the value is enough), the value can be set to anything that isn't callable. The second element is the setter, and the third is the column definition.
The decorators are implemented as sqlalchemy hybrid properties, but only ever use the first two parameters of the hybrid property: get and set for the Python object. The latter two parameters are the SQL-side get and set methods, but they are currently not supported.
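For reference, this is roughly the mechanism at work; a standalone sqlalchemy sketch (not naklar code), where hybrid_property receives the Python-side getter and setter as its first two arguments.

from sqlalchemy.ext.hybrid import hybrid_property

def get_output(self):
    # Python-side getter: called for instance-level attribute access
    return self._output

def set_output(self, value):
    # Python-side setter: called on instance-level assignment
    self._output = value

class Example(object):
    _output = '/mnt/scratch/results/0'
    output = hybrid_property(get_output, set_output)

e = Example()
print(e.output)      # -> '/mnt/scratch/results/0', via get_output
e.output = '/tmp/0'  # goes through set_output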
This means that if the field output has a decorator, calling experiment.select('output') will fail, as this will attempt to access output on the class, not on an object, i.e. experiment.E.output. Calling experiment.select()[0].output is still perfectly fine, as the elements in the list are instantiated objects.
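A simple workaround, using only calls already shown above, is to fetch the full objects and read the decorated attribute on each instance (assuming, as in the discussion above, a decorated output field):

# instance-level access goes through the decorator's getter
rows = experiment.select(kernel='linear')
outputs = [e.output for e in rows]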
experiment.E object

After the initialisation the reflected database definition is stored as experiment.E. Any parameters found in the settings dictionaries become properties on experiment.E; all these parameters are sqlalchemy properties.
experiment.E
naklar.experiment.Exp
experiment.E.f1, experiment.E.path
(<sqlalchemy.orm.attributes.InstrumentedAttribute at 0x10b352a98>, <sqlalchemy.orm.attributes.InstrumentedAttribute at 0x10b352d58>)
Bypassing naklar

Since naklar is basically just a wrapper around sqlalchemy, you can bypass the whole thing and access the backend directly. The .initialise() function creates a database engine as experiment._engine that can be used to create sessions.

Using raw sqlalchemy queries is much more verbose but may in some cases be necessary.
experiment.select('kernel', kernel='linear')
[('linear'), ('linear'), ('linear'), ('linear')]
experiment.select('kernel', experiment.E.kernel == 'linear')
[('linear'), ('linear'), ('linear'), ('linear')]
from sqlalchemy.orm import Session
session = Session(bind=experiment._engine)
q = session.query(experiment.E.kernel)
q = q.filter(experiment.E.kernel.in_(['linear']))
pprint.pprint(q.all())
session.close()
[('linear',), ('linear',), ('linear',), ('linear',)]
from sqlalchemy.orm import Session
session = Session(bind=experiment._engine)
q = session.query('path')
q = q.filter(experiment.E.kernel.in_(['linear']))
pprint.pprint(q.all())
session.close()
[('results/0',), ('results/1',), ('results/2',), ('results/3',)]
Calling experiment.select() with no arguments will fetch all experiments currently in the database and return a list of the reflected sqlalchemy rows.
experiment.select()
[<naklar.experiment.Exp at 0x10b1d1390>, <naklar.experiment.Exp at 0x10b1d13c8>, <naklar.experiment.Exp at 0x10b1d1208>, <naklar.experiment.Exp at 0x10b1d1588>, <naklar.experiment.Exp at 0x10b1d1940>, <naklar.experiment.Exp at 0x10b1d1978>, <naklar.experiment.Exp at 0x10b1d1c18>, <naklar.experiment.Exp at 0x10b1d1c88>, <naklar.experiment.Exp at 0x10b1d1d30>, <naklar.experiment.Exp at 0x10b1d1dd8>, <naklar.experiment.Exp at 0x10b1d1e80>, <naklar.experiment.Exp at 0x10b1d1f28>]
Defining column names will instead return NamedTuples.
experiment.select('path', 'C', 'f1')
[('results/0', 0.001, '0.76'), ('results/1', 0.1, '0.8'), ('results/10', 1.0, '0.813333333333333'), ('results/11', 2.0, '0.82'), ('results/2', 1.0, '0.82'), ('results/3', 2.0, '0.82'), ('results/4', 0.001, '0.806666666666667'), ('results/5', 0.1, '0.82'), ('results/6', 1.0, '0.826666666666667'), ('results/7', 2.0, '0.826666666666667'), ('results/8', 0.001, '0.806666666666667'), ('results/9', 0.1, '0.82')]
experiment.select(kernel='linear')
[<naklar.experiment.Exp at 0x10b1e23c8>, <naklar.experiment.Exp at 0x10b1e2438>, <naklar.experiment.Exp at 0x10b1e24a8>, <naklar.experiment.Exp at 0x10b1e2518>]
experiment.select('path', C=1)
[('results/10'), ('results/2'), ('results/6')]
experiment.select('path', C=[1, 2])
[('results/10'), ('results/11'), ('results/2'), ('results/3'), ('results/6'), ('results/7')]
Filter values that don't exist in any experiment don't matter; here C=3 was never run, so only the C=1 and C=2 experiments match:
experiment.select('path', 'C', C=[1, 2, 3])
[('results/10', 1.0), ('results/11', 2.0), ('results/2', 1.0), ('results/3', 2.0), ('results/6', 1.0), ('results/7', 2.0)]
All the select statements above are translated into sqlalchemy queries. The **kwargs provided to .select() are applied to the query as filters and translate directly into sqlalchemy query filters. The following two statements are equivalent.
experiment.select('path', C=[1, 2, 3])
[('results/10'), ('results/11'), ('results/2'), ('results/3'), ('results/6'), ('results/7')]
experiment.select('path', experiment.E.C.in_([1, 2, 3]))
[('results/10'), ('results/11'), ('results/2'), ('results/3'), ('results/6'), ('results/7')]
Notice, however, that for equality comparisons there is a slight syntactic change. For the sqlalchemy version you need to use == (double equals) for the query filter to be valid; a single equals sign is a syntax error.
experiment.select('path', C=1)
[('results/10'), ('results/2'), ('results/6')]
# the sqlalchemy version uses double equals ==
experiment.select('path', experiment.E.C == 1)
[('results/10'), ('results/2'), ('results/6')]
experiment.select('path', experiment.E.C = 1)
File "<ipython-input-83-5c5677046345>", line 1 experiment.select('path', experiment.E.C = 1) ^ SyntaxError: keyword can't be an expression
Using the experiment.E table definition is a little cumbersome for simple queries where a parameter value has to match something or be one of a defined set. In some cases, though, it is not possible to define the query filter as a keyword argument such as C=1, for instance when selecting all values that are above or below some threshold. In those cases experiment.E becomes useful.
Given the following results, we only want to select those where the F1-score is greater than 82%, that is experiments 6 and 7.
experiment.select('i', 'kernel', 'C', 'f1')
[(0, 'linear', 0.001, '0.76'), (1, 'linear', 0.1, '0.8'), (10, 'poly', 1.0, '0.813333333333333'), (11, 'poly', 2.0, '0.82'), (2, 'linear', 1.0, '0.82'), (3, 'linear', 2.0, '0.82'), (4, 'rbf', 0.001, '0.806666666666667'), (5, 'rbf', 0.1, '0.82'), (6, 'rbf', 1.0, '0.826666666666667'), (7, 'rbf', 2.0, '0.826666666666667'), (8, 'poly', 0.001, '0.806666666666667'), (9, 'poly', 0.1, '0.82')]
experiment.select('i', 'kernel', 'C', 'f1', experiment.E.f1 > 0.82)
[(6, 'rbf', 1.0, '0.826666666666667'), (7, 'rbf', 2.0, '0.826666666666667')]
sqlalchemy allows you to define hybrid attributes (http://docs.sqlalchemy.org/en/rel_1_0/orm/extensions/hybrid.html) for a database schema definition. These can be used to map the values stored on disk to some other values.

Some experiment outputs are not easy to store in a database: things like file system paths or single performance metrics are fine, but entire tables of cross-validated results are a bit trickier. One way to get around this problem is to dump the metric data to a separate file (say a pandas data frame) and keep just a path reference to that file. A decorator can then easily be added to the experiment.E object that returns the data frame instead of the path.
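As a rough, untested sketch of that idea: assume each settings dictionary stores the path to a pickled pandas DataFrame under a hypothetical cv_results key, and that the getter callable receives the stored value as its second argument (the exact signature follows the getter convention described earlier and is an assumption here):

import pandas as pd

def get_cv_results(self, value):
    # 'value' is assumed to be the stored file path; hand back the DataFrame instead
    return pd.read_pickle(value)

experiment.initialise('./results/', autoload=True,
                      decorators={'cv_results': (get_cv_results,)})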
Consider, for instance, the case of running experiments on a different machine from where the analysis is done. It is likely that the file system path mappings won't be the same: the settings dictionary will contain paths that are valid on the experiment machine but not on the analysis machine, a computing cluster vs. a desktop machine for instance. These paths can easily be mapped using regular expressions.

naklar.decorator contains some ready-made decorator functions that can be used to do this. Say, for instance, you needed to map /mnt/scratch/results/ to /usr/local/scratch/results/:
from functools import partial
from naklar import decorator

home_path = partial(decorator.translate_path,
                    ptrn='/mnt/scratch/results/',
                    replace='/usr/local/scratch/results/')

experiment.initialise('/usr/local/scratch/lustre/results/', autoload=True, dict_file='args.final',
                      restrict_keys=restrict_keys,
                      decorators={'output': (home_path, None)})
InvalidRequestError: Table 'experiment' is already defined for this MetaData instance. Specify 'extend_existing=True' to redefine options and columns on an existing Table object.
This happens when a table definition is loaded on top of an already existing table definition. The fix is simple: just call experiment.reset() before experiment.initialise(); .reset() will remove the existing table definition.
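In other words:

experiment.reset()                                   # drop the existing table definition
experiment.initialise('./results/', autoload=True)   # then reinitialise with whatever arguments you need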