%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit

# Load the digits dataset and keep a small stratified training set of
# 50 samples: with so few samples, noisy features are hard to tell apart
# from informative ones.
digits = load_digits()
X, y = digits.data, digits.target

cv = StratifiedShuffleSplit(n_splits=1, train_size=50, test_size=None,
                            random_state=0)
train, test = next(cv.split(X, y))
X_orig, X_test = X[train], X[test]
y_orig, y_test = y[train], y[test]

n_samples, n_features_orig = X_orig.shape
print(n_samples)
print(n_features_orig)

# Number of pixel features that are constant on this small training set:
np.sum(np.var(X_orig, axis=0) == 0)

# Build two noisy variants of the training set by appending Gaussian
# noise features: 50 for the first, 1000 for the second.
n_features_noise_1 = 50
n_features_noise_2 = 1000

rng = np.random.RandomState(42)
X_noise_1 = np.hstack([X_orig,
                       rng.normal(size=(n_samples, n_features_noise_1))])
X_noise_2 = np.hstack([X_orig,
                       rng.normal(size=(n_samples, n_features_noise_2))])

n_trees = 100
extra_trees = ExtraTreesClassifier(n_estimators=n_trees, n_jobs=-1,
                                   random_state=0)
random_forest = RandomForestClassifier(n_estimators=n_trees, n_jobs=-1,
                                       random_state=0)

extra_trees.fit(X_orig, y_orig).score(X_test, y_test)

random_forest.fit(X_orig, y_orig).score(X_test, y_test)


def compute_importances(ensemble, normalize=False):
    """Average the (optionally unnormalized) importances of all trees."""
    trees_importances = [
        base_model.tree_.compute_feature_importances(normalize=normalize)
        for base_model in ensemble.estimators_]
    return sum(trees_importances) / len(trees_importances)


def plot_feature_importances(importances, normalize=False, color=None,
                             alpha=0.5, label=None, chunk=None):
    if hasattr(importances, 'estimators_'):
        importances = compute_importances(importances, normalize=normalize)
    if chunk is not None:
        importances = importances[chunk]
    plt.bar(range(len(importances)), importances, color=color, alpha=alpha,
            label=label)

plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_orig, y_orig), label='extra')
_ = plt.legend()

plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_1, y_orig), label='extra')
_ = plt.legend()

plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_2, y_orig))
_ = plt.title("Variable importance on a data set with a majority of noisy variables")


def shuffle_columns(X, copy=True, seed=0):
    """Shuffle each column of X independently to break any link with y."""
    rng = np.random.RandomState(seed)
    if copy:
        X = X.copy()
    for i in range(X.shape[1]):
        rng.shuffle(X[:, i])
    return X


def model_importances(ensemble, normalize=False):
    """Per-tree importances as an array of shape (n_trees, n_features)."""
    return np.array([
        base_model.tree_.compute_feature_importances(normalize=normalize)
        for base_model in ensemble.estimators_])


def compute_adjusted_importances(ensemble, X, y, n_permutations=5,
                                 return_diff=True,
                                 aggregate_method='average',
                                 normalize=False):
    """Importances of the original features minus those of shuffled copies."""
    all_importances = []
    n_features = X.shape[1]
    for i in range(n_permutations):
        # Fit on the original features augmented with shuffled copies.
        ensemble.fit(np.hstack([X, shuffle_columns(X, seed=i)]), y)
        all_importances.append(model_importances(ensemble,
                                                 normalize=normalize))
    if aggregate_method == 'average':
        # Average importances of trees across permutations:
        # the shape is (n_trees, 2 * n_features)
        all_importances = np.mean(all_importances, axis=0)
    elif aggregate_method == 'stack':
        # Stack importances of trees for the various permutations:
        # the shape is (n_permutations * n_trees, 2 * n_features)
        all_importances = np.vstack(all_importances)
    if return_diff:
        return (all_importances[:, :n_features]
                - all_importances[:, n_features:])
    else:
        return all_importances[:, :n_features], all_importances[:, n_features:]
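# Quick sanity check (illustrative addition, not in the original notebook):
# on the mildly noisy dataset the mean adjusted importance should be
# clearly positive for the original pixels and near zero for the noise.
avi_check = compute_adjusted_importances(extra_trees, X_noise_1, y_orig,
                                         n_permutations=3).mean(axis=0)
print("mean AVI, original features: %f" % avi_check[:n_features_orig].mean())
print("mean AVI, noisy features:    %f" % avi_check[n_features_orig:].mean())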
def bootstrap_adjusted_importances(ensemble, X, y, normalize=False,
                                   n_permutations=10, n_bootstraps=1000,
                                   aggregate_method='average', seed=0):
    """Bootstrap the per-tree adjusted importances to estimate their spread."""
    rng = np.random.RandomState(seed)
    avi_trees = compute_adjusted_importances(
        ensemble, X, y, n_permutations=n_permutations,
        normalize=normalize, aggregate_method=aggregate_method)
    n_trees = len(ensemble.estimators_)
    bootstrapped_means = []
    for i in range(n_bootstraps):
        # Resample the trees with replacement and average their AVIs.
        idx = rng.randint(0, n_trees, size=n_trees)
        bootstrapped_means.append(avi_trees[idx].mean(axis=0))
    bootstrapped_means = np.array(bootstrapped_means)
    # Sort the bootstrap samples per feature so that quantiles can be
    # read off directly.
    bootstrapped_means.sort(axis=0)
    return (np.mean(bootstrapped_means, axis=0),
            np.std(bootstrapped_means, axis=0),
            bootstrapped_means)

avi_mean, avi_std, bootstrapped_avi = bootstrap_adjusted_importances(
    extra_trees, X_noise_2, y_orig, n_permutations=10,
    aggregate_method='average', seed=0)

plt.figure(figsize=(16, 3))
plt.bar(range(len(avi_mean)), avi_mean, color=None, alpha=0.5)
_ = plt.title('Mean Adjusted Variable Importances for all features')

avi_mean[:n_features_orig].mean(), avi_mean[:n_features_orig].std()

avi_mean[n_features_orig:].mean(), avi_mean[n_features_orig:].std()

plt.figure(figsize=(16, 3))
plt.bar(range(len(avi_mean) - n_features_orig), avi_mean[n_features_orig:],
        color=None, alpha=0.5)
_ = plt.title('Mean Adjusted Variable Importances for noisy features')

# Fraction of bootstrap replicates where each feature's AVI is <= 0.
neg_rates = np.mean(bootstrapped_avi <= 0, axis=0)

plt.figure(figsize=(16, 3))
plt.bar(range(len(neg_rates)), neg_rates, color=None, alpha=0.5)
_ = plt.title('Negative AVI rate')

# Number of non-constant original features that are never flagged:
np.sum(neg_rates[:n_features_orig][np.var(X_orig, axis=0) != 0] <= 0)

# Fraction of noisy features flagged at least once:
np.mean(neg_rates[n_features_orig:] != 0)

# Fraction of all features flagged in more than half of the replicates:
np.mean(neg_rates > 0.5)
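# Illustrative aside (not in the original notebook): because the bootstrap
# samples are sorted per feature, percentile-based confidence intervals can
# be read off by indexing; e.g. a 90% interval for an arbitrary feature j.
j = 10  # arbitrary feature index chosen for illustration
lo = bootstrapped_avi[int(0.05 * len(bootstrapped_avi)), j]
hi = bootstrapped_avi[int(0.95 * len(bootstrapped_avi)) - 1, j]
print("90%% bootstrap interval for feature %d: [%f, %f]" % (j, lo, hi))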
def select_important(ensemble, X, y, threshold=0.5, n_permutations=10,
                     n_bootstraps=1000, seed=0):
    """Iteratively drop features whose negative AVI rate exceeds threshold."""
    selected_features = np.arange(X.shape[1])
    X_selected = X.copy()
    while True:
        _, _, bootstrapped_avi = bootstrap_adjusted_importances(
            ensemble, X_selected, y, n_permutations=n_permutations,
            n_bootstraps=n_bootstraps, seed=seed)
        neg_rates = np.mean(bootstrapped_avi <= 0, axis=0)
        selection_mask = neg_rates <= threshold
        print("Keeping %d features out of %d."
              % (np.sum(selection_mask), X_selected.shape[1]))
        if np.all(selection_mask):
            # All remaining features are selected: fixed point reached.
            break
        elif not np.any(selection_mask):
            # All features were detected as noisy.
            return selected_features[selection_mask]
        # Otherwise apply the selection mask and iterate until reaching
        # a fixed point.
        X_selected = X_selected[:, selection_mask]
        selected_features = selected_features[selection_mask]
    return selected_features

selected = select_important(extra_trees, X_noise_2, y_orig,
                            n_permutations=20, threshold=0.0)

print("Number of selected features: %d" % len(selected))
print("Number of selected features among original features: %d"
      % np.sum(selected < n_features_orig))
print("Number of selected noisy features: %d"
      % np.sum(selected >= n_features_orig))

# Compare with univariate feature selection under FDR control.
from sklearn.feature_selection import SelectFdr, f_classif

alphas = np.logspace(-8, -2, 100)
total_selected = []
wrongly_selected = []
for alpha in alphas:
    univariate_selector = SelectFdr(f_classif, alpha=alpha).fit(
        X_noise_2, y_orig)
    selected = univariate_selector.get_support()
    total_selected.append(np.sum(selected))
    wrongly_selected.append(np.sum(selected[n_features_orig:]))

total_selected = np.array(total_selected)
wrongly_selected = np.array(wrongly_selected)

plt.semilogx(alphas, total_selected, label="# selected features")
plt.semilogx(alphas, wrongly_selected, label="# selected noisy features")
_ = plt.legend()

small_alphas = alphas < 1e-4
plt.semilogx(alphas[small_alphas], total_selected[small_alphas],
             label="# selected features")
plt.semilogx(alphas[small_alphas], wrongly_selected[small_alphas],
             label="# selected noisy features")
_ = plt.legend()

# Largest alpha that selects no noisy feature at all:
alphas[wrongly_selected == 0][-1]

# Number of features selected at that alpha:
total_selected[wrongly_selected == 0][-1]

plt.figure(figsize=(16, 3))
plt.hist(bootstrapped_avi[:, 10], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a relevant variable")
_ = plt.xlim(bootstrapped_avi.min(), bootstrapped_avi.max())

plt.figure(figsize=(16, 3))
plt.hist(bootstrapped_avi[:, 28], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a relevant variable")
_ = plt.xlim(bootstrapped_avi.min(), bootstrapped_avi.max())

plt.figure(figsize=(16, 3))
plt.hist(bootstrapped_avi[:, 145], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a noisy variable")
_ = plt.xlim(bootstrapped_avi.min(), bootstrapped_avi.max())

plt.figure(figsize=(16, 3))
plt.hist(bootstrapped_avi[:, 99], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a noisy variable")
_ = plt.xlim(bootstrapped_avi.min(), bootstrapped_avi.max())

plt.figure(figsize=(16, 3))
plt.hist(bootstrapped_avi[:, 400], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a noisy variable")
_ = plt.xlim(bootstrapped_avi.min(), bootstrapped_avi.max())
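# Illustrative follow-up (not part of the original notebook): the negative
# AVI rates of the variables whose histograms are plotted above summarize
# the same information numerically.
for j in [10, 28, 99, 145, 400]:
    kind = "relevant" if j < n_features_orig else "noisy"
    print("feature %3d (%s): negative AVI rate = %.3f"
          % (j, kind, neg_rates[j]))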