import numpy as np

from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')

# Normalize to [0, 1] and cast to float32, the native precision level
# of scikit-learn's decision tree implementation.
X = (mnist.data / 255.).astype(np.float32)
X_train, y_train = X[:60000], mnist.target[:60000]
X_test, y_test = X[60000:], mnist.target[60000:]

from IPython.parallel import Client

client = Client()
lb_view = client.load_balanced_view()
len(lb_view)  # number of engines available for load-balanced scheduling

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import ShuffleSplit
from pyrallel.model_selection import RandomizedGridSeach
from pyrallel.mmap_utils import persist_cv_splits

# Make it possible to use a subsample of the training set.
n_samples_dev = 60000
X_dev, y_dev = X_train[:n_samples_dev], y_train[:n_samples_dev]

# Dump the CV folds to disk so that the engines can memory-map them
# instead of receiving redundant copies over the network.
cv_split_filenames = persist_cv_splits(X_dev, y_dev, n_cv_iter=2,
                                       test_size=0.25, random_state=0)

params = {
    'criterion': ['gini', 'entropy'],
    'max_features': [10, 20, 30, 50, 100, 200, 500, 768],
    'min_samples_split': [2, 5, 10, 15, 20, 25, 30, 40, 50,
                          75, 100, 150, 200],
}

et = ExtraTreesClassifier(n_estimators=100, random_state=0)

# Launch the randomized search across the persisted splits
# ("RandomizedGridSeach" is the class name as spelled in pyrallel).
rgs = RandomizedGridSeach(lb_view, random_state=0)
rgs.launch_for_splits(et, params, cv_split_filenames)

print(rgs.report(n_top=100))
rgs.boxplot_parameters()

# Refit on the full training set with the best hyperparameters found.
et_final = ExtraTreesClassifier(n_estimators=1000, max_features=100,
                                min_samples_split=2, n_jobs=16,
                                random_state=0)
%time et_final.fit(X_train, y_train)

%time test_score = et_final.score(X_test, y_test)
test_error = 1. - test_score
test_error
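
# For reference, a minimal sketch of the persist-splits idea used above.
# This is an assumption about what pyrallel.mmap_utils.persist_cv_splits
# roughly does, not its actual implementation: each CV fold is dumped to
# disk with joblib so worker engines can memory-map the arrays read-only
# rather than unpickle a private copy per task.
import os
from joblib import dump
from sklearn.cross_validation import ShuffleSplit

def persist_cv_splits_sketch(X, y, n_cv_iter=2, test_size=0.25,
                             random_state=0, folder='.'):
    """Dump each ShuffleSplit fold to a joblib file; return the filenames."""
    cv = ShuffleSplit(len(y), n_iter=n_cv_iter, test_size=test_size,
                      random_state=random_state)
    filenames = []
    for i, (train, test) in enumerate(cv):
        filename = os.path.join(folder, 'cv_split_%03d.pkl' % i)
        dump((X[train], y[train], X[test], y[test]), filename)
        filenames.append(filename)
    return filenames

# On an engine, a fold would then be loaded with memory mapping, e.g.:
#     from joblib import load
#     X_tr, y_tr, X_val, y_val = load(filename, mmap_mode='r')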