%pylab inline
import pylab as pl
import numpy as np

# Default plot configuration, applied globally through pylab's rcParams:
# - every new figure gets a size of 10 x 7.5 (matplotlib's 'figure.figsize'),
# - grid lines are switched on for all axes.
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
# Make gray the default colormap for subsequent image plots.
pl.gray()

from IPython.parallel import Client
# Connect to the IPython.parallel cluster using the Client defaults
# (no profile or connection file is passed explicitly).
client = Client()

# Bare expression: its value is shown as cell output when run in a notebook.
# NOTE(review): presumably the number of engines currently registered with
# the cluster -- confirm against the IPython.parallel Client documentation.
len(client)

# IPython magic (not plain Python): %px executes the statement that follows it
# on the cluster engines rather than in the local process.
%px print("Hello from the cluster engines!")

def where_am_i():
    """Describe the process that executes this call.

    Returns a string containing the pid of the current process and the
    hostname of the machine it runs on.
    """
    # The imports stay inside the body: this function is handed to the
    # parallel views' apply() further down, presumably so that it can run
    # without relying on the caller's module-level namespace.
    import socket
    import os

    template = "In process with pid {0} on host: '{1}'"
    pid = os.getpid()
    hostname = socket.gethostname()
    return template.format(pid, hostname)


# Call the function directly, i.e. in the local process, as a baseline before
# it is submitted through the cluster views.
where_am_i()

# View obtained from client.direct_view() with default arguments.
# NOTE(review): presumably targets every engine of the cluster -- confirm.
direct_view = client.direct_view()

# Submit where_am_i through the direct view and keep the returned result
# handle; the bare name on the next line displays it in a notebook.
where_am_i_direct_results = direct_view.apply(where_am_i)
where_am_i_direct_results

# Retrieve the collected results.
# NOTE(review): presumably blocks until every engine has answered -- confirm.
where_am_i_direct_results.get()

# Same results in dictionary form.
# NOTE(review): presumably keyed by engine id -- confirm.
where_am_i_direct_results.get_dict()

# Load-balanced view of the cluster; it is reused further down as the
# execution backend of the model_selection searches.
lb_view = client.load_balanced_view()

# Submit where_am_i once through the load-balanced view and keep the result
# handle; the bare name on the next line displays it in a notebook.
# NOTE(review): presumably the scheduler picks a single engine -- confirm.
where_am_i_lb_result = lb_view.apply(where_am_i)
where_am_i_lb_result

# Retrieve the value returned by the remote call.
where_am_i_lb_result.get()

# Project-local helper modules used by the rest of this script.
import mmap_utils, model_selection
# Re-execute both modules so the session uses their current source; the
# result is bound to `_` so that nothing is echoed as cell output.
# NOTE(review): bare `reload` is a builtin on Python 2 only; on Python 3 it
# has to come from importlib -- confirm the target interpreter.
_ = reload(mmap_utils), reload(model_selection)

from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler

# Load scikit-learn's bundled digits dataset.
digits = load_digits()

# Feature matrix rescaled column-wise by a MinMaxScaler with default settings;
# the target labels are used unchanged.
X = MinMaxScaler().fit_transform(digits.data)
y = digits.target

# Persist cross-validation splits of (X, y) via the project helper.
# NOTE(review): presumably 'digits_10' is the name/prefix of the files written
# and 10 the number of splits, with the created filenames returned -- verify
# against mmap_utils.persist_cv_splits.
digits_cv_split_filenames = mmap_utils.persist_cv_splits('digits_10', X, y, 10)
digits_cv_split_filenames

# NOTE(review): presumably makes the engines reachable through `client` load
# (memory-map) the persisted split files ahead of the searches -- verify
# against mmap_utils.warm_mmap_on_cv_splits.
mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)

from sklearn.svm import LinearSVC
from collections import OrderedDict
import numpy as np

# Search space for the linear SVM: a single hyper-parameter, C, taking the
# five log-spaced values 0.01, 0.1, 1, 10 and 100.
linear_svc_params = OrderedDict((
    ('C', np.logspace(-2, 2, 5)),
))
# Base estimator with default settings; C is supplied by the search.
linear_svc = LinearSVC()

# Search object from the project module, bound to the load-balanced view.
# NOTE(review): the class name is spelled 'RandomizedGridSeach' (sic); it must
# match the definition in model_selection, so it is left untouched here.
linear_svc_search = model_selection.RandomizedGridSeach(lb_view)

# Start the search for the estimator / parameter grid on the persisted splits.
# NOTE(review): presumably returns without waiting for the tasks to finish --
# verify against model_selection.
linear_svc_search.launch_for_splits(linear_svc, linear_svc_params, digits_cv_split_filenames)

# Bare name: displays the search object's representation in a notebook.
linear_svc_search

# Box plots of the results per parameter value, without the training scores
# (display_train=False).
linear_svc_search.boxplot_parameters(display_train=False)

# 100 evenly spaced training-set sizes between 0 and 1000.
x = np.linspace(0, int(1e3), 100)

# Illustrative cubic cost model: n ** 3 / 1e9 seconds for n training samples.
pl.plot(x, x ** 3 / 1e9)
pl.xlabel("Number of training samples")
pl.ylabel("Estimated Convergence Time of SMO (in seconds)")

# The same model extrapolated to one million samples, converted from seconds
# to years (/60 /60 /24 /365): roughly 31.7 years.
1e6 ** 3 / 1e9 / 60 / 60 / 24 / 365

from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline

# Two-step pipeline: a Nystroem kernel approximation feeding a linear SVM.
nystroem_pipeline = Pipeline([
    ('nystroem', Nystroem()),
    ('clf', LinearSVC()),
])

# Search space over both steps; each key is '<step name>__<parameter>', so it
# addresses a parameter of the 'nystroem' or 'clf' step defined above.
# 3 x 5 x 5 = 75 combinations in the full grid.
nystroem_pipeline_params = OrderedDict((
    ('nystroem__n_components', [50, 100, 200]),
    ('nystroem__gamma', np.logspace(-2, 2, 5)),
    ('clf__C', np.logspace(-2, 2, 5)),
))

# Second search, again bound to the load-balanced view and run on the same
# persisted digits splits, this time for the Nystroem pipeline.
# NOTE(review): class name spelled 'RandomizedGridSeach' (sic) as defined in
# model_selection.
nystroem_search = model_selection.RandomizedGridSeach(lb_view)
nystroem_search.launch_for_splits(nystroem_pipeline, nystroem_pipeline_params, digits_cv_split_filenames)

# Bare name: displays the search object's representation in a notebook.
nystroem_search

# Box plots of the results per parameter value, using the method's defaults.
nystroem_search.boxplot_parameters()

# Ask the cluster to abort outstanding work.
# NOTE(review): presumably cancels the tasks still queued on the engines and
# leaves already-running ones alone -- confirm against IPython.parallel docs.
client.abort()