Model selection is the task of choosing the best-performing classifier out of a candidate set of classifiers. It is especially convenient with libraries like Scikit-Learn because the API stays the same across different classifiers, which makes it very easy to try different "estimators" on the same dataset.
After learning about parameter search, one could do a parameter search on a given classifier, but in practice you rarely depend on a single classifier. Ideally, you need to compare different classifiers (since their learning functions differ a lot, their performance on your real dataset may vary drastically). In order to choose not just the best parameters for a given classifier, one needs to look at different classifiers and then do a parameter search on top of those multiple classifiers.
Model selection makes sure that I will get not only the best parameters but also the best classifier from a given candidate set of classifiers, for the scoring function that I optimize for. But this is not good enough, because I also want to see the effect of the total number of features: when I change the total number of features, how does it affect the score of a classifier?
%matplotlib inline
import csv
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn import cross_validation
from sklearn import ensemble
from sklearn.feature_extraction import text
from sklearn import feature_extraction
from sklearn import feature_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import svm
from sklearn import tree
# Use the FiveThirtyEight look for all plots in this notebook.
plt.style.use('fivethirtyeight')
_DATA_DIR = 'data'
_NYT_DATA_PATH = os.path.join(_DATA_DIR, 'nyt_title_data.csv')
# Shared legend placement: centered below the axes, three columns.
_PLT_LEGEND_OPTIONS = dict(loc="upper center",
                           bbox_to_anchor=(0.5, -0.15),
                           fancybox=True,
                           shadow=True,
                           ncol=3)
# Color palette: the fivethirtyeight colors followed by a second hex set,
# so there are enough distinct colors for all plotted classifiers.
colors = [ii.strip() for ii in '#30a2da, #fc4f30, #e5ae38, #6d904f, #8b8b8b'.split(',')]
colors += ['#' + ii.strip() for ii in '348ABD, A60628, 7A68A6, 467821,D55E00, CC79A7, 56B4E9, 009E73, F0E442, 0072B2'.split(',')]
# Endless cyclers so each curve drawn by plot_accuracies gets the next
# color/marker pair without running out.
markers = itertools.cycle(["o", "D"])
colors = itertools.cycle(colors)
def cv(X, y, clf, nfeats, clfname, scoring=metrics.accuracy_score, n_folds=10):
    """Return the mean cross-validated score of ``clf`` on ``(X, y)``.

    Runs stratified k-fold cross-validation: for each fold the classifier is
    fit on the training split and scored on the held-out split with
    ``scoring(y_true, y_pred)``, and the fold scores are averaged.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        Feature matrix (dense; rows are indexed by fold masks).
    y : array, shape (n_samples,)
        Class labels, used for stratification.
    clf : estimator with ``fit`` / ``predict``
        Classifier to evaluate. It is refit on every fold.
    nfeats, clfname :
        Unused here; kept so the signature matches how select_model calls
        this function.
    scoring : callable, default ``metrics.accuracy_score``
        Metric of the form ``scoring(y_true, y_pred) -> float``.
    n_folds : int, default 10
        Number of stratified folds.
    """
    stratified_k_fold = cross_validation.StratifiedKFold(y, n_folds=n_folds)
    fold_scores = []
    for train, test in stratified_k_fold:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        clf.fit(X_train, y_train)
        fold_scores.append(scoring(y_test, clf.predict(X_test)))
    # StratifiedKFold yields exactly n_folds splits, so this is the mean.
    return sum(fold_scores) / float(n_folds)
def plot_accuracies(accuracies, xvals, legends):
    """Draw one accuracy curve per classifier against the feature counts.

    ``accuracies`` is a 2-d array (classifiers x feature counts), ``xvals``
    holds the feature counts for the x-axis (log scale), and ``legends``
    labels each curve. Colors and markers come from the module-level cyclers.
    """
    figure = plt.figure(figsize=(16, 12))
    axes = figure.add_subplot(111)
    n_curves = accuracies.shape[0]
    for idx in range(n_curves):
        axes.plot(xvals,
                  accuracies[idx, :],
                  color=next(colors),
                  marker=next(markers),
                  label=legends[idx])
    plt.xlabel("Number of Features")
    plt.ylabel("Accuracy")
    plt.title("Accuracy vs Number of Features")
    axes.set_xscale("log")
    # Shrink the axes upward so the multi-column legend fits underneath.
    box = axes.get_position()
    axes.set_position([box.x0, box.y0 + box.height * 0.3,
                       box.width, box.height * 0.7])
    axes.legend(**_PLT_LEGEND_OPTIONS)
    plt.show()
def estimator_name(clf):
    """Return the class name of an estimator instance, e.g. 'LinearSVC'."""
    return clf.__class__.__name__
def select_model(X, y, scoring=metrics.accuracy_score):
n_features = np.array([10, 100, 500, 1000, 5000, 10000, 20000, 50000, 100000])
clfs = [
naive_bayes.BernoulliNB(),
naive_bayes.MultinomialNB(),
naive_bayes.GaussianNB(),
tree.DecisionTreeClassifier(),
ensemble.RandomForestClassifier(n_estimators=10),
svm.LinearSVC(random_state=0),
linear_model.LogisticRegression(),
linear_model.SGDClassifier(),
linear_model.RidgeClassifier(),
]
classifier_names = map(estimator_name, clfs)
feature_selection_methods = [feature_selection.f_classif]
accuracies = np.zeros((len(clfs), len(n_features), len(feature_selection_methods)))
for kk in range(len(feature_selection_methods)):
X_feature_selected = X.copy().toarray()
for jj in range(len(n_features)):
for ii in range(len(clfs)):
accuracies[ii, jj, kk] = cv(X_feature_selected, y, clfs[ii], n_features[jj], classifier_names[ii], scoring=scoring)
for k in range(len(feature_selection_methods)):
for i in range(len(clfs)):
print "%22s " % classifier_names[i],
for j in range(accuracies.shape[1]):
print "%5.3f" % accuracies[i, j, k],
print
plot_accuracies(accuracies[:, :, k], n_features, classifier_names)
# Load the NYT titles dataset: each CSV row is (integer label, title text).
with open(_NYT_DATA_PATH) as nyt:
    nyt_data = []
    nyt_labels = []
    csv_reader = csv.reader(nyt)
    for line in csv_reader:
        nyt_labels.append(int(line[0]))
        nyt_data.append(line[1])
# Raw titles as a string array, labels as an int array.
X = np.array([''.join(el) for el in nyt_data])
y = np.array([el for el in nyt_labels])
# TF-IDF features over unigrams and bigrams; English stop words removed,
# accents stripped, rows L2-normalized; terms must appear in >= 2 documents.
vectorizer = text.TfidfVectorizer(min_df=2,
                                  ngram_range=(1, 2),
                                  stop_words='english',
                                  strip_accents='unicode',
                                  norm='l2')
# Demonstrate each stage of the vectorizer's analysis pipeline on one title.
# NOTE: `unicode` is Python 2 only.
example = unicode(X[5])
print("Example string: {}".format(example))
print("Preprocessed string: {}".format(vectorizer.build_preprocessor()(example)))
print("Tokenized string: {}".format(str(vectorizer.build_tokenizer()(example))))
print("N-gram data string: {}".format(str(vectorizer.build_analyzer()(example))))
Example string: Clinton Defends His Policies In Kosovo, China and Mexico Preprocessed string: clinton defends his policies in kosovo, china and mexico Tokenized string: [u'Clinton', u'Defends', u'His', u'Policies', u'In', u'Kosovo', u'China', u'and', u'Mexico'] N-gram data string: [u'clinton', u'defends', u'policies', u'kosovo', u'china', u'mexico', u'clinton defends', u'defends policies', u'policies kosovo', u'kosovo china', u'china mexico']
In TfidfVectorizer, one may choose the ngram_range; in this example, it is unigrams and bigrams. Choosing bigrams enables us to capture phrases that could play an important role in determining the class of an observation in the dataset. If you believe an even longer sequence of words (e.g. "internet of things") plays an important role, you could set ngram_range to (1, 3) to include trigrams as well.
X
array(['Dole Courts Democrats', 'Yanks End Drought; Mets Fall in Opener', 'Lumpectomies Seen As Equal in Benefit To Breast Removals', ..., 'Delays Hurting U.S. Rebuilding In Afghanistan', 'SENATE APPROVES $1 BILLION TO AID COLOMBIA MILITARY', 'POLITICS: THE MONEY; A Hollywood Production: Political Money'], dtype='|S127')
X = vectorizer.fit_transform(X)
X
<2161x2171 sparse matrix of type '<type 'numpy.float64'>' with 10377 stored elements in Compressed Sparse Row format>
select_model(X, y)
BernoulliNB 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 MultinomialNB 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 GaussianNB 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 DecisionTreeClassifier 0.519 0.529 0.526 0.523 0.524 0.531 0.524 0.528 0.521 RandomForestClassifier 0.595 0.593 0.589 0.600 0.601 0.589 0.601 0.588 0.587 LinearSVC 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 LogisticRegression 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 SGDClassifier 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 RidgeClassifier 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679
This plot not only shows the relative strength of the classifiers with respect to each other for a given number of features, but also provides a good picture of the effect of the number of features on the classification task at hand. Generally, even though Scikit-Learn has grid search and other capabilities to output the best estimator for a given classifier or pipeline, I'd prefer a plot like this over those grids. You learn not only the best parameters but also the effects of the classifier and the number of features in a single plot. This could easily be adapted to different numbers of parameters (instead of the number of features) for classifiers that share a set of parameters in the optimization, to see the effect of each parameter visually rather than just getting the best parameters. This way you can both explore and gain insight about the parameters a classifier has.
We could always pass another scoring function to compute the accuracies of each classifier.
select_model(X, y, scoring=metrics.f1_score)
BernoulliNB 0.468 0.468 0.468 0.468 0.468 0.468 0.468 0.468 0.468 MultinomialNB 0.538 0.538 0.538 0.538 0.538 0.538 0.538 0.538 0.538 GaussianNB 0.548 0.548 0.548 0.548 0.548 0.548 0.548 0.548 0.548 DecisionTreeClassifier 0.537 0.534 0.526 0.535 0.538 0.532 0.535 0.531 0.534 RandomForestClassifier 0.590 0.584 0.590 0.599 0.590 0.577 0.573 0.584 0.579 LinearSVC 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 0.667 LogisticRegression 0.554 0.554 0.554 0.554 0.554 0.554 0.554 0.554 0.554 SGDClassifier 0.613 0.613 0.613 0.613 0.613 0.613 0.613 0.613 0.613 RidgeClassifier 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674
select_model(X, y, scoring=metrics.precision_score)
BernoulliNB 0.652 0.652 0.652 0.652 0.652 0.652 0.652 0.652 0.652 MultinomialNB 0.676 0.676 0.676 0.676 0.676 0.676 0.676 0.676 0.676 GaussianNB 0.555 0.555 0.555 0.555 0.555 0.555 0.555 0.555 0.555 DecisionTreeClassifier 0.587 0.585 0.579 0.584 0.574 0.580 0.575 0.569 0.578 RandomForestClassifier 0.603 0.610 0.617 0.602 0.599 0.598 0.582 0.611 0.605 LinearSVC 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674 0.674 LogisticRegression 0.697 0.697 0.697 0.697 0.697 0.697 0.697 0.697 0.697 SGDClassifier 0.662 0.662 0.662 0.662 0.662 0.662 0.662 0.662 0.662 RidgeClassifier 0.687 0.687 0.687 0.687 0.687 0.687 0.687 0.687 0.687
/Users/bugra/anaconda/lib/python2.7/site-packages/sklearn/metrics/metrics.py:1771: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. 'precision', 'predicted', average, warn_for)
select_model(X, y, scoring=metrics.recall_score)
BernoulliNB 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 0.528 MultinomialNB 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 0.581 GaussianNB 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 0.549 DecisionTreeClassifier 0.522 0.522 0.525 0.523 0.516 0.528 0.527 0.519 0.522 RandomForestClassifier 0.595 0.607 0.592 0.590 0.594 0.603 0.595 0.591 0.600 LinearSVC 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 0.671 LogisticRegression 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 0.583 SGDClassifier 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 0.608 RidgeClassifier 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679 0.679