import numpy as np

from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')

# Normalize to [0, 1] and cast to float32, the native precision level
# of scikit-learn's decision tree implementation.
X = (mnist.data / 255.).astype(np.float32)
X_train, y_train = X[:60000], mnist.target[:60000]
X_test, y_test = X[60000:], mnist.target[60000:]

from IPython.parallel import Client

client = Client()
lb_view = client.load_balanced_view()
len(lb_view)  # number of engines available for load-balanced scheduling

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import ShuffleSplit
from pyrallel.model_selection import RandomizedGridSeach
from pyrallel.mmap_utils import persist_cv_splits

# Make it possible to use a subsample of the training set.
n_samples_dev = 60000
X_dev, y_dev = X_train[:n_samples_dev], y_train[:n_samples_dev]

# Dump the CV folds to disk so that the engines can memory-map them
# instead of receiving redundant copies over the network.
cv_split_filenames = persist_cv_splits(X_dev, y_dev, n_cv_iter=2,
                                       test_size=0.25, random_state=0)

params = {
    'criterion': ['gini', 'entropy'],
    'max_features': [10, 20, 30, 50, 100, 200, 500, 768],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 40, 50,
                          75, 100, 150, 200],
}

et = ExtraTreesClassifier(n_estimators=100, random_state=0)

# Launch the randomized search across the persisted splits
# ("RandomizedGridSeach" is the class name as spelled in pyrallel).
rgs = RandomizedGridSeach(lb_view, random_state=0)
rgs.launch_for_splits(et, params, cv_split_filenames)

print(rgs.report(n_top=100))
rgs.boxplot_parameters()

# Refit on the full training set with the best hyperparameters found.
et_final = ExtraTreesClassifier(n_estimators=1000, max_features=100,
                                min_samples_split=2, n_jobs=16,
                                random_state=0)
%time et_final.fit(X_train, y_train)

%time test_score = et_final.score(X_test, y_test)
test_error = 1. - test_score
test_error
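
# For reference, a minimal sketch of the persist-splits idea used above.
# This is an assumption about what pyrallel.mmap_utils.persist_cv_splits
# roughly does, not its actual implementation: each CV fold is dumped to
# disk with joblib so worker engines can memory-map the arrays read-only
# rather than unpickle a private copy per task.
import os
from joblib import dump
from sklearn.cross_validation import ShuffleSplit

def persist_cv_splits_sketch(X, y, n_cv_iter=2, test_size=0.25,
                             random_state=0, folder='.'):
    """Dump each ShuffleSplit fold to a joblib file; return the filenames."""
    cv = ShuffleSplit(len(y), n_iter=n_cv_iter, test_size=test_size,
                      random_state=random_state)
    filenames = []
    for i, (train, test) in enumerate(cv):
        filename = os.path.join(folder, 'cv_split_%03d.pkl' % i)
        dump((X[train], y[train], X[test], y[test]), filename)
        filenames.append(filename)
    return filenames

# On an engine, a fold would then be loaded with memory mapping, e.g.:
#     from joblib import load
#     X_tr, y_tr, X_val, y_val = load(filename, mmap_mode='r')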