More process building tools!

Use multiple feature extractors on the same data and concatenate their results (scikit-learn's `FeatureUnion`, built here with `make_union`).

In [1]:

# Pipeline/union builders, the text feature extractor and the linear classifier.
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18, and the
# old sklearn.grid_search module was removed in 0.20. Try the current location
# first and fall back to the legacy one so older installs keep working.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

import numpy as np

In [2]:

from sklearn.datasets import fetch_20newsgroups

In [3]:

# Load the 20 newsgroups corpus; subset="train" is the fetcher's default, spelled out here.
news = fetch_20newsgroups(subset="train")

In [4]:

# Documents (model inputs) and their target labels, pulled off the fetched bunch.
data = news.data
y = news.target

In [5]:

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so only fall back to it on
# older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out part of the documents for testing. The fixed seed makes the split,
# and every score computed from it, repeatable across kernel restarts.
data_train, data_test, y_train, y_test = train_test_split(data, y, random_state=0)

In [6]:

# Two TF-IDF views of the same documents: character-level first, word-level second.
# make_union auto-names the steps 'tfidfvectorizer-1' and 'tfidfvectorizer-2' in
# that order, so the order here must not change.
char_tfidf = TfidfVectorizer(analyzer="char")
word_tfidf = TfidfVectorizer(analyzer="word")
char_and_word = make_union(char_tfidf, word_tfidf)

# The concatenated features feed a linear SVM (auto-named step: 'linearsvc').
text_pipe = make_pipeline(char_and_word, LinearSVC(dual=False))

# Candidate regularization strengths: 10^-3, 10^-2, ..., 10^2.
param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3)}

# 5-fold cross-validated search over the grid, with verbose progress output.
grid = GridSearchCV(
    estimator=text_pipe,
    param_grid=param_grid,
    cv=5,
    verbose=10,
)

In [7]:

# Run the cross-validated search on the training split only; the fitted
# GridSearchCV object is the cell's displayed result.
grid.fit(X=data_train, y=y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] linearsvc__C=0.001 ..............................................
[CV] ..................... linearsvc__C=0.001, score=0.650645 -  15.3s
[CV] linearsvc__C=0.001 ..............................................
[CV] ..................... linearsvc__C=0.001, score=0.638660 -  15.5s

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   15.3s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:   30.8s

[CV] linearsvc__C=0.001 ..............................................
[CV] ..................... linearsvc__C=0.001, score=0.643320 -  15.4s
[CV] linearsvc__C=0.001 ..............................................
[CV] ..................... linearsvc__C=0.001, score=0.623891 -  15.7s
[CV] linearsvc__C=0.001 ..............................................
[CV] ..................... linearsvc__C=0.001, score=0.627149 -  14.9s
[CV] linearsvc__C=0.01 ...............................................
[CV] ...................... linearsvc__C=0.01, score=0.811254 -  17.0s
[CV] linearsvc__C=0.01 ...............................................
[CV] ...................... linearsvc__C=0.01, score=0.801998 -  16.9s
[CV] linearsvc__C=0.01 ...............................................
[CV] ...................... linearsvc__C=0.01, score=0.805180 -  17.3s

[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:  2.1min

[CV] linearsvc__C=0.01 ...............................................
[CV] ...................... linearsvc__C=0.01, score=0.781786 -  19.9s
[CV] linearsvc__C=0.01 ...............................................
[CV] ...................... linearsvc__C=0.01, score=0.802015 -  17.0s
[CV] linearsvc__C=0.1 ................................................
[CV] ....................... linearsvc__C=0.1, score=0.893318 -  20.8s
[CV] linearsvc__C=0.1 ................................................
[CV] ....................... linearsvc__C=0.1, score=0.901293 -  20.6s
[CV] linearsvc__C=0.1 ................................................
[CV] ....................... linearsvc__C=0.1, score=0.885815 -  19.6s
[CV] linearsvc__C=0.1 ................................................
[CV] ....................... linearsvc__C=0.1, score=0.884092 -  20.2s
[CV] linearsvc__C=0.1 ................................................
[CV] ....................... linearsvc__C=0.1, score=0.887374 -  21.2s
[CV] linearsvc__C=1.0 ................................................
[CV] ....................... linearsvc__C=1.0, score=0.915592 -  32.5s
[CV] linearsvc__C=1.0 ................................................
[CV] ....................... linearsvc__C=1.0, score=0.925969 -  24.1s
[CV] linearsvc__C=1.0 ................................................
[CV] ....................... linearsvc__C=1.0, score=0.916421 -  23.7s

[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  3.8min
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  5.8min

[CV] linearsvc__C=1.0 ................................................
[CV] ....................... linearsvc__C=1.0, score=0.912478 -  22.4s
[CV] linearsvc__C=1.0 ................................................
[CV] ....................... linearsvc__C=1.0, score=0.911678 -  22.8s
[CV] linearsvc__C=10.0 ...............................................
[CV] ...................... linearsvc__C=10.0, score=0.913834 -  39.0s
[CV] linearsvc__C=10.0 ...............................................
[CV] ...................... linearsvc__C=10.0, score=0.918331 -  40.0s
[CV] linearsvc__C=10.0 ...............................................
[CV] ...................... linearsvc__C=10.0, score=0.908770 -  37.9s
[CV] linearsvc__C=10.0 ...............................................
[CV] ...................... linearsvc__C=10.0, score=0.912478 -  38.4s
[CV] linearsvc__C=10.0 ...............................................
[CV] ...................... linearsvc__C=10.0, score=0.912270 -  39.5s
[CV] linearsvc__C=100.0 ..............................................
[CV] ..................... linearsvc__C=100.0, score=0.913247 - 1.3min
[CV] linearsvc__C=100.0 ..............................................
[CV] ..................... linearsvc__C=100.0, score=0.917744 - 1.4min
[CV] linearsvc__C=100.0 ..............................................
[CV] ..................... linearsvc__C=100.0, score=0.908770 - 1.3min
[CV] linearsvc__C=100.0 ..............................................
[CV] ..................... linearsvc__C=100.0, score=0.910704 - 1.3min
[CV] linearsvc__C=100.0 ..............................................
[CV] ..................... linearsvc__C=100.0, score=0.912863 - 1.4min

[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  9.8min
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 16.5min finished

Out[7]:

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('tfidfvectorizer-1', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=...2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'linearsvc__C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=10)

In [8]:

# Candidate n-gram ranges for the character-level vectorizer (first union step).
char_ngram_ranges = [(1, 3), (1, 5), (2, 5)]
# Candidate n-gram ranges for the word-level vectorizer (second union step).
word_ngram_ranges = [(1, 1), (1, 2), (2, 2)]
# Regularization strengths for the linear SVM: 10^-3 ... 10^2.
svc_c_values = 10. ** np.arange(-3, 3)

# Keys follow the <step>__<substep>__<param> convention of the nested pipeline.
param_grid = {
    'featureunion__tfidfvectorizer-1__ngram_range': char_ngram_ranges,
    'featureunion__tfidfvectorizer-2__ngram_range': word_ngram_ranges,
    'linearsvc__C': svc_c_values,
}