%pylab inline

import IPython
import sklearn as sk
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

print 'IPython version:', IPython.__version__
print 'numpy version:', np.__version__
print 'scikit-learn version:', sk.__version__
print 'matplotlib version:', matplotlib.__version__

from sklearn.datasets import fetch_20newsgroups

news = fetch_20newsgroups(subset='all')

n_samples = 3000
X = news.data[:n_samples]
y = news.target[:n_samples]

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

def get_stop_words():
    result = set()
    for line in open('data/stopwords_en.txt', 'r').readlines():
        result.add(line.strip())
    return result

stop_words = get_stop_words()

clf = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words=stop_words,
        token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('nb', MultinomialNB(alpha=0.01)),
])

from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross validation iterator of K folds
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    # by default the score used is the one returned by the score method of the estimator (accuracy)
    scores = cross_val_score(clf, X, y, cv=cv)
    print scores
    print "Mean score: {0:.3f} (+/-{1:.3f})".format(
        np.mean(scores), sem(scores))

evaluate_cross_validation(clf, X, y, 3)

def calc_params(X, y, clf, param_values, param_name, K):
    # initialize training and testing scores with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))

    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print param_name, ' = ', param_value

        # set classifier parameters
        clf.set_params(**{param_name: param_value})

        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)

        # create KFold cross validation
        cv = KFold(len(y), K, shuffle=True, random_state=0)

        # iterate over the K folds
        for j, (train, test) in enumerate(cv):
            # fit the classifier in the corresponding fold
            # and obtain the corresponding accuracy scores on train and test sets
            clf.fit([X[k] for k in train], y[train])
            k_train_scores[j] = clf.score([X[k] for k in train], y[train])
            k_test_scores[j] = clf.score([X[k] for k in test], y[test])

        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)

    # plot the training and testing scores on a log scale
    plt.semilogx(param_values, train_scores, alpha=0.4, lw=2, c='b')
    plt.semilogx(param_values, test_scores, alpha=0.4, lw=2, c='g')
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    # return the training and testing scores for each parameter value
    return train_scores, test_scores

alphas = np.logspace(-7, 0, 8)
print alphas

train_scores, test_scores = calc_params(X, y, clf, alphas, 'nb__alpha', 3)

print 'training scores: ', train_scores
print 'testing scores: ', test_scores

from sklearn.svm import SVC

clf = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words=stop_words,
        token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('svc', SVC()),
])

gammas = np.logspace(-2, 1, 4)

train_scores, test_scores = calc_params(X, y, clf, gammas, 'svc__gamma', 3)

print 'training scores: ', train_scores
print 'testing scores: ', test_scores
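The calc_params helper sweeps a single hyperparameter at a time. Before moving on to a full grid search over two parameters, the same helper could be reused to explore the SVC regularization parameter C on its own. The following is a quick sketch, not part of the original notebook; the value range for C is an assumption:

# Hypothetical single-parameter sweep over the SVC C parameter (the range of
# values is an assumption); it reuses the calc_params helper defined above.
Cs = np.logspace(-1, 1, 3)
train_scores, test_scores = calc_params(X, y, clf, Cs, 'svc__C', 3)
print 'training scores: ', train_scores
print 'testing scores: ', test_scores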
from sklearn.grid_search import GridSearchCV

parameters = {
    'svc__gamma': np.logspace(-2, 1, 4),
    'svc__C': np.logspace(-1, 1, 3),
}

clf = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words=stop_words,
        token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",
    )),
    ('svc', SVC()),
])

gs = GridSearchCV(clf, parameters, verbose=2, refit=False, cv=3)

%time _ = gs.fit(X, y)

gs.best_params_, gs.best_score_

from sklearn.externals import joblib
from sklearn.cross_validation import ShuffleSplit
import os

def persist_cv_splits(X, y, K=3, name='data', suffix="_cv_%03d.pkl"):
    """Dump K folds to the filesystem."""
    cv_split_filenames = []

    # create KFold cross validation
    cv = KFold(len(y), K, shuffle=True, random_state=0)

    # iterate over the K folds
    for i, (train, test) in enumerate(cv):
        cv_fold = ([X[k] for k in train], y[train],
                   [X[k] for k in test], y[test])
        cv_split_filename = name + suffix % i
        cv_split_filename = os.path.abspath(cv_split_filename)
        joblib.dump(cv_fold, cv_split_filename)
        cv_split_filenames.append(cv_split_filename)

    return cv_split_filenames

cv_filenames = persist_cv_splits(X, y, name='news')

def compute_evaluation(cv_split_filename, clf, params):
    # all module imports should be executed in the worker namespace
    from sklearn.externals import joblib

    # load the fold training and testing partitions from the filesystem
    X_train, y_train, X_test, y_test = joblib.load(
        cv_split_filename, mmap_mode='c')

    clf.set_params(**params)
    clf.fit(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    return test_score

from sklearn.grid_search import ParameterGrid

def parallel_grid_search(lb_view, clf, cv_split_filenames, param_grid):
    all_tasks = []
    all_parameters = list(ParameterGrid(param_grid))

    # iterate over parameter combinations
    for i, params in enumerate(all_parameters):
        task_for_params = []

        # iterate over the K folds
        for j, cv_split_filename in enumerate(cv_split_filenames):
            t = lb_view.apply(
                compute_evaluation, cv_split_filename, clf, params)
            task_for_params.append(t)

        all_tasks.append(task_for_params)

    return all_parameters, all_tasks

Now we use IPython parallel to get a client and a load-balanced view. We must first create a local cluster of N engines, using the Cluster tab in the IPython Notebook. Then we create the client and the view, and execute our parallel_grid_search function:

from sklearn.svm import SVC
from IPython.parallel import Client

client = Client()
lb_view = client.load_balanced_view()

all_parameters, all_tasks = parallel_grid_search(
    lb_view, clf, cv_filenames, parameters)

def print_progress(tasks):
    progress = np.mean([task.ready() for task_group in tasks
                        for task in task_group])
    print "Tasks completed: {0}%".format(100 * progress)

print_progress(all_tasks)

def find_bests(all_parameters, all_tasks, n_top=5):
    """Compute the mean score of the completed tasks."""
    mean_scores = []
    for param, task_group in zip(all_parameters, all_tasks):
        scores = [t.get() for t in task_group if t.ready()]
        if len(scores) == 0:
            continue
        mean_scores.append((np.mean(scores), param))
    return sorted(mean_scores, reverse=True)[:n_top]

print find_bests(all_parameters, all_tasks)
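Once print_progress reports that every task has finished, the best parameter combination can be used to refit the pipeline on all of the data. The following is a minimal sketch, not part of the original notebook; it assumes all tasks have completed:

# Hypothetical final step (not in the original): refit on the full data set
# using the best parameter combination found by the parallel grid search.
best_score, best_params = find_bests(all_parameters, all_tasks, n_top=1)[0]
print "Best mean score: {0:.3f} with parameters {1}".format(
    best_score, best_params)
clf.set_params(**best_params)
clf.fit(X, y)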