#!/usr/bin/env python
# coding: utf-8

# # Feature selection
#
# Demo of feature selection in scikit-learn and how things can go terribly wrong.

# In[2]:

# first a few imports...
import numpy as np
from sklearn import feature_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC, LinearSVC

import warnings
warnings.filterwarnings('ignore')

# In[23]:

# let's read in the yeast gene expression data that you used in an earlier assignment:
data = np.genfromtxt("data/yeast2.csv", delimiter=",")
X = data[:, 1:]
y = data[:, 0]
print(X.shape)

# Let's establish a baseline of how well we do with an SVM with a linear kernel:

# In[4]:

cv_generator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results_orig = cross_validate(LinearSVC(), X, y, cv=cv_generator,
                              scoring='roc_auc', return_train_score=False)
print(np.mean(results_orig['test_score']))

# Now let's add **many** noisy features and see how that affects classifier performance:

# In[5]:

X = np.hstack((X, np.random.randn(len(y), 1000)))
results_noisy = cross_validate(LinearSVC(), X, y, cv=cv_generator,
                               scoring='roc_auc', return_train_score=False)
print(np.mean(results_noisy['test_score']))

# Let's create an instance of `RFE` that uses an SVM to define weights for the features
# (any linear classifier will work). RFE repeatedly fits the classifier and discards the
# lowest-weighted features (here 10% of the remaining features per iteration, via `step=0.1`)
# until `n_features_to_select=20` features remain:

# In[8]:

selector = RFE(LinearSVC(), step=0.1, n_features_to_select=20)
# run feature selection:
selector = selector.fit(X, y)
# check how many selected features come from the original data vs. the appended noise columns:
print(sum(selector.support_[:79]), sum(selector.support_[79:]))

# The `fit` method did not change the data. To obtain the reduced feature matrix,
# use `transform` (or `fit_transform`):

# In[9]:

Xt = selector.fit_transform(X, y)

# Correctly evaluating a classifier that uses feature selection is not trivial. To demonstrate
# a potential pitfall, let's use a gene expression (colon cancer) data set available from the
# [libsvm repository](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html).

# In[16]:

from sklearn.datasets import load_svmlight_file
X, y = load_svmlight_file("data/colon-cancer.data")
X.shape

# In[17]:

results = cross_validate(LinearSVC(), X, y, cv=cv_generator,
                         scoring='roc_auc', return_train_score=False)
print(np.mean(results['test_score']))

# Now let's perform feature selection using RFE on the full data set and then evaluate
# the resulting cross-validated AUC:

# In[20]:

selector = RFE(LinearSVC(), step=0.1, n_features_to_select=30)
Xt = selector.fit_transform(X, y)
results_wrong = cross_validate(LinearSVC(), Xt, y, cv=cv_generator,
                               scoring='roc_auc', return_train_score=False)
print(np.mean(results_wrong['test_score']))

# Whenever we get such a fabulous result we should be concerned.
# Where did we go wrong? We ran RFE on the *entire* data set, so the features were chosen
# using the labels of the very examples that later serve as test folds; the cross-validation
# that follows is therefore optimistically biased. The remedy is to make feature selection
# part of the model, so it is refit on the training portion of each fold only.
# Here's the correct way to do this, using a pipeline:

# In[21]:

selector = RFE(LinearSVC(), step=0.1, n_features_to_select=30)
rfe_svm = make_pipeline(selector, LinearSVC())
results_nested = cross_validate(rfe_svm, X, y, cv=cv_generator,
                                scoring='roc_auc', return_train_score=False)
print(np.mean(results_nested['test_score']))

# This issue is described in the literature in a [2002 paper](https://doi.org/10.1073/pnas.102102699).
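
# As a follow-up sketch (not part of the original demo): `RFECV`, which was imported above
# but not used, chooses the number of features by its own internal cross-validation rather
# than requiring `n_features_to_select` up front. The cell below is a minimal example,
# assuming `X` and `y` still hold the colon-cancer data loaded above; wrapping the selector
# in a pipeline keeps the selection inside each training fold, so the estimate stays honest.

# In[ ]:

# RFECV: recursive feature elimination with the number of features chosen by internal CV.
# The pipeline ensures the selector is refit on the training portion of every outer fold.
selector_cv = RFECV(LinearSVC(), step=0.1, cv=3, scoring='roc_auc')
rfecv_svm = make_pipeline(selector_cv, LinearSVC())
results_rfecv = cross_validate(rfecv_svm, X, y, cv=cv_generator,
                               scoring='roc_auc', return_train_score=False)
print(np.mean(results_rfecv['test_score']))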