#!/usr/bin/env python
# coding: utf-8

# # Feature selection
#
# Demo of feature selection in scikit-learn and how things can go terribly wrong.

# In[2]:

# first a few imports...
import numpy as np
from sklearn import feature_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC, LinearSVC

import warnings
warnings.filterwarnings('ignore')

# In[23]:

# let's read in the yeast gene expression data that you used in an earlier assignment:
data = np.genfromtxt("data/yeast2.csv", delimiter=",")
X = data[:, 1:]
y = data[:, 0]
print(X.shape)

# Let's establish a baseline of how well we do with an SVM with a linear kernel:

# In[4]:

cv_generator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results_orig = cross_validate(LinearSVC(), X, y, cv=cv_generator,
                              scoring='roc_auc', return_train_score=False)
print(np.mean(results_orig['test_score']))

# Now let's add **many** noisy features and see how that affects classifier performance:

# In[5]:

X = np.hstack((X, np.random.randn(len(y), 1000)))
results_noisy = cross_validate(LinearSVC(), X, y, cv=cv_generator,
                               scoring='roc_auc', return_train_score=False)
print(np.mean(results_noisy['test_score']))

# Let's create an instance of `RFE` that uses an SVM to define weights for the features
# (any linear classifier will work). RFE repeatedly fits the classifier and discards the
# lowest-weighted features (here 10% of the remaining features per iteration, via `step=0.1`)
# until `n_features_to_select=20` features remain:

# In[8]:

selector = RFE(LinearSVC(), step=0.1, n_features_to_select=20)
# run feature selection:
selector = selector.fit(X, y)
# check how many selected features come from the original data vs. the appended noise columns:
print(sum(selector.support_[:79]), sum(selector.support_[79:]))

# The `fit` method did not change the data. To obtain the reduced feature matrix,
# use `transform` (or `fit_transform`):

# In[9]:

Xt = selector.fit_transform(X, y)

# Correctly evaluating a classifier that uses feature selection is not trivial. To demonstrate
# a potential pitfall, let's use a gene expression (colon cancer) data set available from the
# [libsvm repository](https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html).

# In[16]:

from sklearn.datasets import load_svmlight_file
X, y = load_svmlight_file("data/colon-cancer.data")
X.shape

# In[17]:

results = cross_validate(LinearSVC(), X, y, cv=cv_generator,
                         scoring='roc_auc', return_train_score=False)
print(np.mean(results['test_score']))

# Now let's perform feature selection using RFE on the full data set and then evaluate
# the resulting cross-validated AUC:

# In[20]:

selector = RFE(LinearSVC(), step=0.1, n_features_to_select=30)
Xt = selector.fit_transform(X, y)
results_wrong = cross_validate(LinearSVC(), Xt, y, cv=cv_generator,
                               scoring='roc_auc', return_train_score=False)
print(np.mean(results_wrong['test_score']))

# Whenever we get such a fabulous result we should be concerned.
# Where did we go wrong? We ran RFE on the *entire* data set, so the features were chosen
# using the labels of the very examples that later serve as test folds; the cross-validation
# that follows is therefore optimistically biased. The remedy is to make feature selection
# part of the model, so it is refit on the training portion of each fold only.
# Here's the correct way to do this, using a pipeline:

# In[21]:

selector = RFE(LinearSVC(), step=0.1, n_features_to_select=30)
rfe_svm = make_pipeline(selector, LinearSVC())
results_nested = cross_validate(rfe_svm, X, y, cv=cv_generator,
                                scoring='roc_auc', return_train_score=False)
print(np.mean(results_nested['test_score']))

# This issue is described in the literature in a [2002 paper](https://doi.org/10.1073/pnas.102102699).
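
# As a follow-up sketch (not part of the original demo): `RFECV`, which was imported above
# but not used, chooses the number of features by its own internal cross-validation rather
# than requiring `n_features_to_select` up front. The cell below is a minimal example,
# assuming `X` and `y` still hold the colon-cancer data loaded above; wrapping the selector
# in a pipeline keeps the selection inside each training fold, so the estimate stays honest.

# In[ ]:

# RFECV: recursive feature elimination with the number of features chosen by internal CV.
# The pipeline ensures the selector is refit on the training portion of every outer fold.
selector_cv = RFECV(LinearSVC(), step=0.1, cv=3, scoring='roc_auc')
rfecv_svm = make_pipeline(selector_cv, LinearSVC())
results_rfecv = cross_validate(rfecv_svm, X, y, cv=cv_generator,
                               scoring='roc_auc', return_train_score=False)
print(np.mean(results_rfecv['test_score']))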