Use multiple feature extractors on the same data and concatenate their results.
# Imports are grouped at the top. GridSearchCV and train_test_split are taken
# from sklearn.model_selection: the former sklearn.grid_search and
# sklearn.cross_validation modules were removed in scikit-learn 0.20, so
# importing from them raises ImportError on current releases.
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC

# Load the 20 newsgroups corpus: raw documents and their integer class labels.
news = fetch_20newsgroups()
data, y = news.data, news.target

# Hold out part of the data; the grid search below only sees the training part.
data_train, data_test, y_train, y_test = train_test_split(data, y)

# Two TF-IDF extractors run on the same raw text (one over characters, one
# over words); make_union concatenates their feature matrices column-wise.
char_and_word = make_union(
    TfidfVectorizer(analyzer="char"),
    TfidfVectorizer(analyzer="word"),
)

# Feature union followed by a linear SVM. make_pipeline names each step after
# its lowercased class name, hence the "linearsvc__" prefix in the grid.
text_pipe = make_pipeline(char_and_word, LinearSVC(dual=False))

# Search the regularisation strength C over 1e-3 ... 1e2 (six values).
param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3)}
grid = GridSearchCV(text_pipe, param_grid=param_grid, cv=5, verbose=10)
grid.fit(data_train, y_train)
Fitting 5 folds for each of 6 candidates, totalling 30 fits [CV] linearsvc__C=0.001 .............................................. [CV] ..................... linearsvc__C=0.001, score=0.650645 - 15.3s [CV] linearsvc__C=0.001 .............................................. [CV] ..................... linearsvc__C=0.001, score=0.638660 - 15.5s
[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 15.3s [Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 30.8s
[CV] linearsvc__C=0.001 .............................................. [CV] ..................... linearsvc__C=0.001, score=0.643320 - 15.4s [CV] linearsvc__C=0.001 .............................................. [CV] ..................... linearsvc__C=0.001, score=0.623891 - 15.7s [CV] linearsvc__C=0.001 .............................................. [CV] ..................... linearsvc__C=0.001, score=0.627149 - 14.9s [CV] linearsvc__C=0.01 ............................................... [CV] ...................... linearsvc__C=0.01, score=0.811254 - 17.0s [CV] linearsvc__C=0.01 ............................................... [CV] ...................... linearsvc__C=0.01, score=0.801998 - 16.9s [CV] linearsvc__C=0.01 ............................................... [CV] ...................... linearsvc__C=0.01, score=0.805180 - 17.3s
[Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 1.3min [Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 2.1min
[CV] linearsvc__C=0.01 ............................................... [CV] ...................... linearsvc__C=0.01, score=0.781786 - 19.9s [CV] linearsvc__C=0.01 ............................................... [CV] ...................... linearsvc__C=0.01, score=0.802015 - 17.0s [CV] linearsvc__C=0.1 ................................................ [CV] ....................... linearsvc__C=0.1, score=0.893318 - 20.8s [CV] linearsvc__C=0.1 ................................................ [CV] ....................... linearsvc__C=0.1, score=0.901293 - 20.6s [CV] linearsvc__C=0.1 ................................................ [CV] ....................... linearsvc__C=0.1, score=0.885815 - 19.6s [CV] linearsvc__C=0.1 ................................................ [CV] ....................... linearsvc__C=0.1, score=0.884092 - 20.2s [CV] linearsvc__C=0.1 ................................................ [CV] ....................... linearsvc__C=0.1, score=0.887374 - 21.2s [CV] linearsvc__C=1.0 ................................................ [CV] ....................... linearsvc__C=1.0, score=0.915592 - 32.5s [CV] linearsvc__C=1.0 ................................................ [CV] ....................... linearsvc__C=1.0, score=0.925969 - 24.1s [CV] linearsvc__C=1.0 ................................................ [CV] ....................... linearsvc__C=1.0, score=0.916421 - 23.7s
[Parallel(n_jobs=1)]: Done 13 jobs | elapsed: 3.8min [Parallel(n_jobs=1)]: Done 18 jobs | elapsed: 5.8min
[CV] linearsvc__C=1.0 ................................................ [CV] ....................... linearsvc__C=1.0, score=0.912478 - 22.4s [CV] linearsvc__C=1.0 ................................................ [CV] ....................... linearsvc__C=1.0, score=0.911678 - 22.8s [CV] linearsvc__C=10.0 ............................................... [CV] ...................... linearsvc__C=10.0, score=0.913834 - 39.0s [CV] linearsvc__C=10.0 ............................................... [CV] ...................... linearsvc__C=10.0, score=0.918331 - 40.0s [CV] linearsvc__C=10.0 ............................................... [CV] ...................... linearsvc__C=10.0, score=0.908770 - 37.9s [CV] linearsvc__C=10.0 ............................................... [CV] ...................... linearsvc__C=10.0, score=0.912478 - 38.4s [CV] linearsvc__C=10.0 ............................................... [CV] ...................... linearsvc__C=10.0, score=0.912270 - 39.5s [CV] linearsvc__C=100.0 .............................................. [CV] ..................... linearsvc__C=100.0, score=0.913247 - 1.3min [CV] linearsvc__C=100.0 .............................................. [CV] ..................... linearsvc__C=100.0, score=0.917744 - 1.4min [CV] linearsvc__C=100.0 .............................................. [CV] ..................... linearsvc__C=100.0, score=0.908770 - 1.3min [CV] linearsvc__C=100.0 .............................................. [CV] ..................... linearsvc__C=100.0, score=0.910704 - 1.3min [CV] linearsvc__C=100.0 .............................................. [CV] ..................... linearsvc__C=100.0, score=0.912863 - 1.4min
[Parallel(n_jobs=1)]: Done 25 jobs | elapsed: 9.8min [Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 16.5min finished
GridSearchCV(cv=5, error_score='raise', estimator=Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1, transformer_list=[('tfidfvectorizer-1', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=...2', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001, verbose=0))]), fit_params={}, iid=True, loss_func=None, n_jobs=1, param_grid={'linearsvc__C': array([ 1.00000e-03, 1.00000e-02, 1.00000e-01, 1.00000e+00, 1.00000e+01, 1.00000e+02])}, pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None, verbose=10)
# Extended search space: n-gram ranges for both vectorizers plus the SVM's C.
# Keys follow the "<step>__<sub-step>__<parameter>" convention; the
# "tfidfvectorizer-1" / "tfidfvectorizer-2" names are the ones generated for
# the char-level and word-level extractors inside the feature union.
param_grid = {
    # character-level TF-IDF
    'featureunion__tfidfvectorizer-1__ngram_range': [(1, 3), (1, 5), (2, 5)],
    # word-level TF-IDF
    'featureunion__tfidfvectorizer-2__ngram_range': [(1, 1), (1, 2), (2, 2)],
    # regularisation strength: 1e-3, 1e-2, ..., 1e2
    'linearsvc__C': np.power(10., np.arange(-3, 3)),
}