Model selection is the task of choosing the best-performing classifier out of a candidate set of classifiers. It is especially convenient with libraries like Scikit-Learn because the API stays the same across different classifiers, which makes it very easy to try different "estimators" on the same dataset.
After learning about parameter search, one could do a parameter search on a given classifier, but in practice you rarely depend on a single classifier. Ideally, you need to compare different classifiers (since their learning functions differ a lot, their performance on your real dataset may vary drastically). In order to choose not just the best parameters for a given classifier, one needs to look at different classifiers and then do a parameter search on top of those multiple classifiers.
Model selection makes sure that I will get not only the best parameters but also the best classifier from a given candidate set of classifiers, for the scoring function that I optimize for. But this is not good enough, because I also want to see the effect of the total number of features: when I change the total number of features, how does it affect the score of a classifier?
%matplotlib inline
import csv
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn import cross_validation
from sklearn import ensemble
from sklearn.feature_extraction import text
from sklearn import feature_extraction
from sklearn import feature_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import svm
from sklearn import tree
# Use the FiveThirtyEight look for all plots in this notebook.
plt.style.use('fivethirtyeight')
_DATA_DIR = 'data'
_NYT_DATA_PATH = os.path.join(_DATA_DIR, 'nyt_title_data.csv')
# Shared legend placement: centered below the axes, three columns.
_PLT_LEGEND_OPTIONS = dict(loc="upper center",
                           bbox_to_anchor=(0.5, -0.15),
                           fancybox=True,
                           shadow=True,
                           ncol=3)
# Color palette: the fivethirtyeight colors followed by a second hex set,
# so there are enough distinct colors for all plotted classifiers.
colors = [ii.strip() for ii in '#30a2da, #fc4f30, #e5ae38, #6d904f, #8b8b8b'.split(',')]
colors += ['#' + ii.strip() for ii in '348ABD, A60628, 7A68A6, 467821,D55E00, CC79A7, 56B4E9, 009E73, F0E442, 0072B2'.split(',')]
# Endless cyclers so each curve drawn by plot_accuracies gets the next
# color/marker pair without running out.
markers = itertools.cycle(["o", "D"])
colors = itertools.cycle(colors)
def cv(X, y, clf, nfeats, clfname, scoring=metrics.accuracy_score, n_folds=10):
    """Return the mean cross-validated score of ``clf`` on ``(X, y)``.

    Runs stratified k-fold cross-validation: for each fold the classifier is
    fit on the training split and scored on the held-out split with
    ``scoring(y_true, y_pred)``, and the fold scores are averaged.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Feature matrix (dense; rows are indexed by fold masks).
    y : array, shape (n_samples,)
        Class labels, used for stratification.
    clf : estimator with ``fit`` / ``predict``
        Classifier to evaluate. It is refit on every fold.
    nfeats, clfname :
        Unused here; kept so the signature matches how select_model calls
        this function.
    scoring : callable, default ``metrics.accuracy_score``
        Metric of the form ``scoring(y_true, y_pred) -> float``.
    n_folds : int, default 10
        Number of stratified folds.
    """
    stratified_k_fold = cross_validation.StratifiedKFold(y, n_folds=n_folds)
    fold_scores = []
    for train, test in stratified_k_fold:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        clf.fit(X_train, y_train)
        fold_scores.append(scoring(y_test, clf.predict(X_test)))
    # StratifiedKFold yields exactly n_folds splits, so this is the mean.
    return sum(fold_scores) / float(n_folds)
def plot_accuracies(accuracies, xvals, legends):
    """Draw one accuracy curve per classifier against the feature counts.

    ``accuracies`` is a 2-d array (classifiers x feature counts), ``xvals``
    holds the feature counts for the x-axis (log scale), and ``legends``
    labels each curve. Colors and markers come from the module-level cyclers.
    """
    figure = plt.figure(figsize=(16, 12))
    axes = figure.add_subplot(111)
    n_curves = accuracies.shape[0]
    for idx in range(n_curves):
        axes.plot(xvals,
                  accuracies[idx, :],
                  color=next(colors),
                  marker=next(markers),
                  label=legends[idx])
    plt.xlabel("Number of Features")
    plt.ylabel("Accuracy")
    plt.title("Accuracy vs Number of Features")
    axes.set_xscale("log")
    # Shrink the axes upward so the multi-column legend fits underneath.
    box = axes.get_position()
    axes.set_position([box.x0, box.y0 + box.height * 0.3,
                       box.width, box.height * 0.7])
    axes.legend(**_PLT_LEGEND_OPTIONS)
    plt.show()
def estimator_name(clf):
    """Return the class name of an estimator instance, e.g. 'LinearSVC'."""
    return clf.__class__.__name__
def select_model(X, y, scoring=metrics.accuracy_score):
n_features = np.array([10, 100, 500, 1000, 5000, 10000, 20000, 50000, 100000])
clfs = [
naive_bayes.BernoulliNB(),
naive_bayes.MultinomialNB(),
naive_bayes.GaussianNB(),
tree.DecisionTreeClassifier(),
ensemble.RandomForestClassifier(n_estimators=10),
svm.LinearSVC(random_state=0),
linear_model.LogisticRegression(),
linear_model.SGDClassifier(),
linear_model.RidgeClassifier(),
]
classifier_names = map(estimator_name, clfs)
feature_selection_methods = [feature_selection.f_classif]
accuracies = np.zeros((len(clfs), len(n_features), len(feature_selection_methods)))
for kk in range(len(feature_selection_methods)):
X_feature_selected = X.copy().toarray()
for jj in range(len(n_features)):
for ii in range(len(clfs)):
accuracies[ii, jj, kk] = cv(X_feature_selected, y, clfs[ii], n_features[jj], classifier_names[ii], scoring=scoring)
for k in range(len(feature_selection_methods)):
for i in range(len(clfs)):
print "%22s " % classifier_names[i],
for j in range(accuracies.shape[1]):
print "%5.3f" % accuracies[i, j, k],
print
plot_accuracies(accuracies[:, :, k], n_features, classifier_names)
# Load the NYT titles dataset: each CSV row is (integer label, title text).
with open(_NYT_DATA_PATH) as nyt:
    nyt_data = []
    nyt_labels = []
    csv_reader = csv.reader(nyt)
    for line in csv_reader:
        nyt_labels.append(int(line[0]))
        nyt_data.append(line[1])
# Raw titles as a string array, labels as an int array.
X = np.array([''.join(el) for el in nyt_data])
y = np.array([el for el in nyt_labels])
# TF-IDF features over unigrams and bigrams; English stop words removed,
# accents stripped, rows L2-normalized; terms must appear in >= 2 documents.
vectorizer = text.TfidfVectorizer(min_df=2,
                                  ngram_range=(1, 2),
                                  stop_words='english',
                                  strip_accents='unicode',
                                  norm='l2')
# Demonstrate each stage of the vectorizer's analysis pipeline on one title.
# NOTE: `unicode` is Python 2 only.
example = unicode(X[5])
print("Example string: {}".format(example))
print("Preprocessed string: {}".format(vectorizer.build_preprocessor()(example)))
print("Tokenized string: {}".format(str(vectorizer.build_tokenizer()(example))))
print("N-gram data string: {}".format(str(vectorizer.build_analyzer()(example))))
Example string: Clinton Defends His Policies In Kosovo, China and Mexico Preprocessed string: clinton defends his policies in kosovo, china and mexico Tokenized string: [u'Clinton', u'Defends', u'His', u'Policies', u'In', u'Kosovo', u'China', u'and', u'Mexico'] N-gram data string: [u'clinton', u'defends', u'policies', u'kosovo', u'china', u'mexico', u'clinton defends', u'defends policies', u'policies kosovo', u'kosovo china', u'china mexico']
In TfidfVectorizer, one may choose the ngram_range; in this example, it is unigrams and bigrams. Choosing bigrams enables us to capture phrases that could play an important role in determining the class of an observation in the dataset. If you believe an even longer sequence of words (e.g. "internet of things") plays an important role, you could set ngram_range to (1, 3) to include trigrams as well.
X
array(['Dole Courts Democrats', 'Yanks End Drought; Mets Fall in Opener', 'Lumpectomies Seen As Equal in Benefit To Breast Removals', ..., 'Delays Hurting U.S. Rebuilding In Afghanistan', 'SENATE APPROVES $1 BILLION TO AID COLOMBIA MILITARY', 'POLITICS: THE MONEY; A Hollywood Production: Political Money'], dtype='|S127')
X = vectorizer.fit_transform(X)
X
<2161x2171 sparse matrix of type '<type 'numpy.float64'>' with 10377 stored elements in Compressed Sparse Row format>
select_model(X, y)
BernoulliNB 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 MultinomialNB 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 GaussianNB 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 DecisionTreeClassifier 0.519 0.529 0.526 0.523 0.524 0.531 0.524 0.528 0.521 RandomForestClassifier 0.595 0.593 0.589 0.600 0.601 0.589 0.601 0.588 0.587 LinearSVC 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 LogisticRegression 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 SGDClassifier 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 RidgeClassifier 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679
This plot not only shows the relative strength of the classifiers with respect to each other for a given number of features, but also provides a good picture of the effect of the number of features on the classification task at hand. Generally, even though Scikit-Learn has grid search and other capabilities to output the best estimator for a given classifier or pipeline, I'd prefer a plot like this over those grids. You learn not only the best parameters but also the effects of the classifier and the number of features in a single plot. This could easily be adapted to different numbers of parameters (instead of the number of features) for classifiers that share a set of parameters in the optimization, to see the effect of each parameter visually rather than just getting the best parameters. This way you can both explore and gain insight about the parameters a classifier has.
We could always pass another scoring function to compute the accuracies of each classifier.
select_model(X, y, scoring=metrics.f1_score)
BernoulliNB 0.468 0.468 0.468 0.468 0.468 0.468 0.468 0.468 0.468 MultinomialNB 0.538 0.538 0.538 0.538 0.538 0.538 0.538 0.538 0.538 GaussianNB 0.548 0.548 0.548 0.548 0.548 0.548 0.548 0.548 0.548 DecisionTreeClassifier 0.537 0.534 0.526 0.535 0.538 0.532 0.535 0.531 0.534 RandomForestClassifier 0.590 0.584 0.590 0.599 0.590 0.577 0.573 0.584 0.579 LinearSVC 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 LogisticRegression 0.554 0.554 0.554 0.554 0.554 0.554 0.554 0.554 0.554 SGDClassifier 0.613 0.613 0.613 0.613 0.613 0.613 0.613 0.613 0.613 RidgeClassifier 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674
select_model(X, y, scoring=metrics.precision_score)
BernoulliNB 0.652 0.652 0.652 0.652 0.652 0.652 0.652 0.652 0.652 MultinomialNB 0.676 0.676 0.676 0.676 0.676 0.676 0.676 0.676 0.676 GaussianNB 0.555 0.555 0.555 0.555 0.555 0.555 0.555 0.555 0.555 DecisionTreeClassifier 0.587 0.585 0.579 0.584 0.574 0.580 0.575 0.569 0.578 RandomForestClassifier 0.603 0.610 0.617 0.602 0.599 0.598 0.582 0.611 0.605 LinearSVC 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674 LogisticRegression 0.697 0.697 0.697 0.697 0.697 0.697 0.697 0.697 0.697 SGDClassifier 0.662 0.662 0.662 0.662 0.662 0.662 0.662 0.662 0.662 RidgeClassifier 0.687 0.687 0.687 0.687 0.687 0.687 0.687 0.687 0.687
/Users/bugra/anaconda/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
select_model(X, y, scoring=metrics.recall_score)
BernoulliNB 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 MultinomialNB 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 GaussianNB 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 DecisionTreeClassifier 0.522 0.522 0.525 0.523 0.516 0.528 0.527 0.519 0.522 RandomForestClassifier 0.595 0.607 0.592 0.590 0.594 0.603 0.595 0.591 0.600 LinearSVC 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 LogisticRegression 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 SGDClassifier 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 RidgeClassifier 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679