#!/usr/bin/env python
# coding: utf-8

# Copyright (c) 2015-2017 [Sebastian Raschka](sebastianraschka.com)
#
# https://github.com/rasbt/python-machine-learning-book
#
# [MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt)

# # Python Machine Learning - Code Examples

# # Chapter 4 - Building Good Training Sets – Data Pre-Processing

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).

# In[1]:

# Grab the running IPython shell once and issue both magics through it:
# first load the extension, then print author, date and package versions.
ipython_shell = get_ipython()
ipython_shell.run_line_magic('load_ext', 'watermark')
ipython_shell.run_line_magic(
    'watermark',
    "-a 'Sebastian Raschka' -u -d -p numpy,pandas,matplotlib,sklearn")

# *The use of `watermark` is optional. You can install this IPython extension via "`pip install watermark`". For more information, please see: https://github.com/rasbt/watermark.*
#
# ### Overview
#
# - [Dealing with missing data](#Dealing-with-missing-data)
#   - [Eliminating samples or features with missing values](#Eliminating-samples-or-features-with-missing-values)
#   - [Imputing missing values](#Imputing-missing-values)
#   - [Understanding the scikit-learn estimator API](#Understanding-the-scikit-learn-estimator-API)
# - [Handling categorical data](#Handling-categorical-data)
#   - [Mapping ordinal features](#Mapping-ordinal-features)
#   - [Encoding class labels](#Encoding-class-labels)
#   - [Performing one-hot encoding on nominal features](#Performing-one-hot-encoding-on-nominal-features)
# - [Partitioning a dataset in training and test sets](#Partitioning-a-dataset-in-training-and-test-sets)
# - [Bringing features onto the same scale](#Bringing-features-onto-the-same-scale)
# - [Selecting meaningful features](#Selecting-meaningful-features)
#   - [Sparse solutions with L1 regularization](#Sparse-solutions-with-L1-regularization)
#   - [Sequential feature selection algorithms](#Sequential-feature-selection-algorithms)
# - [Assessing feature importance with random forests](#Assessing-Feature-Importances-with-Random-Forests)
# - [Summary](#Summary)
#
#
# In[2]:

from IPython.display import Image
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

# Version helper for the scikit-learn 0.18 compatibility checks used
# further down (`Version(sklearn_version) < '0.18'`).
try:
    from distutils.version import LooseVersion as Version
except ImportError:
    # distutils was removed from the standard library in Python 3.12
    # (PEP 632).  Provide a small stand-in that, like LooseVersion, can be
    # compared directly against a plain version string.
    import functools
    import re

    @functools.total_ordering
    class Version(object):
        """Minimal replacement for ``distutils.version.LooseVersion``.

        The version string is reduced to the tuple of all integer runs it
        contains (``'0.18.1'`` -> ``(0, 18, 1)``) and compared as such.
        The right-hand operand may be another ``Version`` or a string.
        """

        def __init__(self, vstring):
            self.vstring = str(vstring)
            self.version = tuple(
                int(part) for part in re.findall(r'\d+', self.vstring))

        @staticmethod
        def _coerce(other):
            # Accept plain strings so `Version(x) < '0.18'` keeps working.
            return other if isinstance(other, Version) else Version(other)

        def __eq__(self, other):
            return self.version == self._coerce(other).version

        def __lt__(self, other):
            return self.version < self._coerce(other).version

        def __repr__(self):
            return "Version(%r)" % self.vstring

from sklearn import __version__ as sklearn_version


# # Dealing with missing data

# In[4]:

import pandas as pd
from io import StringIO

# Small CSV sample with two missing cells (row 2 / column C and
# row 3 / column D) that pandas will read as NaN.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# If you are using Python 2.7, you need
# to convert the string to unicode:
# csv_data = unicode(csv_data)

df = pd.read_csv(StringIO(csv_data))
df


# In[5]:

# Number of missing values per column
df.isnull().sum()

#
#
# ## Eliminating samples or features with missing values

# In[6]:

# drop every row that contains at least one NaN
df.dropna(axis='index')


# In[7]:

# drop every column that contains at least one NaN
df.dropna(axis='columns')


# In[8]:

# only drop rows where all columns are NaN
df.dropna(how='all')


# In[9]:

# drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4)


# In[10]:

# only drop rows where NaN appears in specific columns (here: 'C')
df.dropna(subset=['C'])

#
#
# ## Imputing missing values

# In[11]:

import numpy as np

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
# Prefer the new class and fall back to the old one on older installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    # axis=0: replace each NaN by the mean of its column
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
else:
    # SimpleImputer always imputes column-wise, so no `axis` is needed.
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit and transform on the same NumPy array so the estimator sees
# identical input types in both steps.
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data


# In[12]:

# The original values (still containing NaN) for comparison
df.values

#
#
# ## Understanding the scikit-learn estimator API

# In[13]:

# Figure 04_04, displayed 400 px wide
Image(width=400, filename='./images/04_04.png')


# In[14]:

# Figure 04_05, displayed 400 px wide
Image(width=400, filename='./images/04_05.png')

#
#
# # Handling categorical data

# In[15]:

import pandas as pd

# Toy data set with one nominal feature (color), one ordinal feature
# (size), one numeric feature (price) and a class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

#
#
# ## Mapping ordinal features

# In[16]:

# Sizes are ordinal: encode them as integers that preserve the order
# M < L < XL.
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1,
}

df['size'] = df['size'].map(size_mapping)
df


# In[17]:

# Reverse lookup (integer -> size label) to undo the encoding
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

#
#
# ## Encoding class labels

# In[18]:

import numpy as np

# Assign consecutive integers to the sorted unique class labels
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping


# In[19]:

# Replace the string labels by their integer codes
df['classlabel'] = df['classlabel'].map(class_mapping)
df


# In[20]:

# Swap keys and values to map the integer codes back to the strings
inv_class_mapping = dict(
    (idx, label) for label, idx in class_mapping.items())
df['classlabel'] = df['classlabel'].map(inv_class_mapping)
df


# In[21]:

from sklearn.preprocessing import LabelEncoder

# The same encoding done with scikit-learn's LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y


# In[22]:

# ... and back to the original string labels
class_le.inverse_transform(y)

#
#
# ## Performing one-hot encoding on nominal features

# In[23]:

# Integer-encode the nominal 'color' column (column 0) first
X = df[['color', 'size', 'price']].values

color_le = LabelEncoder()
X[:, 0] = color_le.fit_transform(X[:, 0])
X


# In[24]:

from sklearn.preprocessing import OneHotEncoder

# The `categorical_features` argument of OneHotEncoder was deprecated in
# scikit-learn 0.20 and removed in 0.22.  Selecting the columns to encode
# is now the job of ColumnTransformer; fall back to the old argument only
# when ColumnTransformer is not available.
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    ColumnTransformer = None

if ColumnTransformer is None:
    ohe = OneHotEncoder(categorical_features=[0])
    X_onehot = ohe.fit_transform(X).toarray()
else:
    # NOTE: ColumnTransformer fits a clone of `ohe`; the fitted encoder is
    # available as `col_transformer.named_transformers_['onehot']`.
    ohe = OneHotEncoder()
    col_transformer = ColumnTransformer(
        [('onehot', ohe, [0]),               # one-hot encode 'color'
         ('nothing', 'passthrough', [1, 2])],  # keep 'size' and 'price'
        sparse_threshold=0)                   # always return a dense array
    X_onehot = col_transformer.fit_transform(X).astype(float)
X_onehot


# In[25]:

# pandas alternative: only the string column ('color') is expanded
pd.get_dummies(df[['price', 'color', 'size']])

#
#
# # Partitioning a dataset in training and test sets

# In[26]:

# Wine data set from the UCI repository; the file has no header row.
wine_url = ('https://archive.ics.uci.edu/'
            'ml/machine-learning-databases/wine/wine.data')
df_wine = pd.read_csv(wine_url, header=None)

# First column is the class label, followed by the 13 features.
wine_columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df_wine.columns = wine_columns

print('Class labels', np.unique(df_wine['Class label']))

df_wine.head()

#
#
# ### Note:
#
#
# If the link to the Wine dataset provided above does not work for you, you can find a local copy in this repository at [./../datasets/wine/wine.data](./../datasets/wine/wine.data).
#
# Or you could fetch it via
#

# In[27]:

# Mirror of the same file hosted in the book's GitHub repository
wine_mirror_url = 'https://raw.githubusercontent.com/rasbt/python-machine-learning-book/master/code/datasets/wine/wine.data'
df_wine = pd.read_csv(wine_mirror_url, header=None)

# First column is the class label, followed by the 13 features.
wine_mirror_columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df_wine.columns = wine_mirror_columns

df_wine.head()

#
# In[28]:

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18
if Version(sklearn_version) >= '0.18':
    from sklearn.model_selection import train_test_split
else:
    from sklearn.cross_validation import train_test_split

# Column 0 holds the class label, the remaining columns the features
y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values

# Hold out 30% of the samples as test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

#
#
# # Bringing features onto the same scale

# In[29]:

from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: learn the per-feature range on the training set only,
# then apply it to both training and test data.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)


# In[30]:

from sklearn.preprocessing import StandardScaler

# Standardization: learn mean and standard deviation on the training set
# only, then apply them to both training and test data.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)


# A visual example:

# In[31]:

ex = pd.DataFrame([0, 1, 2, 3, 4, 5])
ex_input = ex[0]

# standardize
# Please note that pandas uses ddof=1 (sample standard deviation)
# by default, whereas NumPy's std method and the StandardScaler
# use ddof=0 (population standard deviation)
ex[1] = (ex_input - ex_input.mean()) / ex_input.std(ddof=0)

# normalize
ex_range = ex_input.max() - ex_input.min()
ex[2] = (ex_input - ex_input.min()) / ex_range

ex.columns = ['input', 'standardized', 'normalized']
ex

#
#
# # Selecting meaningful features

# ...

# ## Sparse solutions with L1-regularization

# In[32]:

Image(filename='./images/04_12.png', width=500)


# In[33]:

Image(filename='./images/04_13.png', width=500)


# In[34]:

from sklearn.linear_model import LogisticRegression

# An L1 penalty requires a solver that supports it.  'liblinear' was the
# default solver up to scikit-learn 0.21; since 0.22 the default is
# 'lbfgs', which rejects penalty='l1', so the solver is named explicitly.
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))


# In[35]:

lr.intercept_


# In[36]:

lr.coef_


# In[37]:

import matplotlib.pyplot as plt

fig = plt.figure()
ax = plt.subplot(111)

# One color per feature (13 features in the Wine data set)
colors = ['blue', 'green', 'red', 'cyan',
          'magenta', 'yellow', 'black',
          'pink', 'lightgreen', 'lightblue',
          'gray', 'indigo', 'orange']

# Fit one model per regularization strength C = 10^-4 ... 10^5 and record
# the weight vector of the second class (row 1 of coef_).
weights, params = [], []
for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l1', C=10.**c,
                            solver='liblinear', random_state=0)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])
    params.append(10.**c)

weights = np.array(weights)

# Plot each feature's weight as a function of C
for column, color in zip(range(weights.shape[1]), colors):
    plt.plot(params, weights[:, column],
             label=df_wine.columns[column + 1],
             color=color)
plt.axhline(0, color='black', linestyle='--', linewidth=3)
plt.xlim([10**(-5), 10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc='upper center',
          bbox_to_anchor=(1.38, 1.03),
          ncol=1, fancybox=True)
# plt.savefig('./figures/l1_path.png', dpi=300)
plt.show()

#
#
# ## Sequential feature selection algorithms

# In[38]:

from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split


class SBS():
    """Sequential Backward Selection.

    Starting from all features, repeatedly removes the single feature
    whose removal gives the best score on an internal validation split,
    until only `k_features` features remain.

    Parameters
    ----------
    estimator : estimator object
        Cloned on construction; the clone is refit for every candidate
        feature subset.
    k_features : int
        Number of features at which the elimination stops.
    scoring : callable, default accuracy_score
        Called as ``scoring(y_true, y_pred)``.
    test_size : float, default 0.25
        Passed to ``train_test_split`` for the internal validation split.
    random_state : int, default 1
        Passed to ``train_test_split``.

    Attributes set by `fit`
    -----------------------
    indices_ : tuple of the finally selected column indices
    subsets_ : list of the best index tuple at every step
    scores_ : list of the score belonging to each entry of `subsets_`
    k_score_ : score of the final subset
    """

    def __init__(self, estimator, k_features, scoring=accuracy_score,
                 test_size=0.25, random_state=1):
        self.scoring = scoring
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        """Run the backward elimination on (X, y) and return self."""
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

        # Baseline: score of the full feature set
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        self.scores_ = [
            self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        ]

        while dim > self.k_features:
            # Every subset obtained by dropping exactly one feature
            candidates = list(combinations(self.indices_, r=dim - 1))
            candidate_scores = [
                self._calc_score(X_train, y_train, X_test, y_test, subset)
                for subset in candidates
            ]

            # Keep the best-scoring subset (first one on ties)
            best = np.argmax(candidate_scores)
            self.indices_ = candidates[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(candidate_scores[best])
            dim -= 1

        self.k_score_ = self.scores_[-1]
        return self

    def transform(self, X):
        """Return the columns of X selected by `fit`."""
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit on the given columns and score on the validation split."""
        self.estimator.fit(X_train[:, indices], y_train)
        predictions = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, predictions)


# In[39]:

import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)

# selecting features
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)

# plotting performance of feature subsets
k_feat = list(map(len, sbs.subsets_))

plt.plot(k_feat, sbs.scores_, marker='o')
plt.ylim([0.7, 1.1])
plt.ylabel('Accuracy')
plt.xlabel('Number of features')
plt.grid()
plt.tight_layout()
# plt.savefig('./sbs.png', dpi=300)
plt.show()


# In[40]:

# Entry 8 of subsets_ is the subset after eight eliminations
# (13 - 8 = 5 features for the Wine data).
k5 = list(sbs.subsets_[8])
print(df_wine.columns[1:][k5])


# In[41]:

# Baseline: KNN on all standardized features
knn.fit(X_train_std, y_train)
train_acc_all = knn.score(X_train_std, y_train)
print('Training accuracy:', train_acc_all)
test_acc_all = knn.score(X_test_std, y_test)
print('Test accuracy:', test_acc_all)


# In[42]:

# KNN restricted to the five selected features
X_train_k5 = X_train_std[:, k5]
X_test_k5 = X_test_std[:, k5]
knn.fit(X_train_k5, y_train)
print('Training accuracy:', knn.score(X_train_k5, y_train))
print('Test accuracy:', knn.score(X_test_k5, y_test))

#
#
# # Assessing Feature Importances with Random Forests

# In[43]:

from sklearn.ensemble import RandomForestClassifier

feat_labels = df_wine.columns[1:]

forest = RandomForestClassifier(n_estimators=10000,
                                random_state=0,
                                n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_

# Feature indices ordered from most to least important
indices = np.argsort(importances)[::-1]
n_features = X_train.shape[1]

for rank, feat_idx in enumerate(indices[:n_features], start=1):
    print("%2d) %-*s %f" % (rank, 30,
                            feat_labels[feat_idx],
                            importances[feat_idx]))

# Bar chart of the importances in descending order
bar_positions = range(n_features)
plt.title('Feature Importances')
plt.bar(bar_positions,
        importances[indices],
        color='lightblue',
        align='center')
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
#plt.savefig('./random_forest.png', dpi=300)
plt.show()


# In[44]:

# Keep only the features whose importance reaches the 0.15 threshold.
# Estimator.transform was replaced by SelectFromModel in scikit-learn 0.18.
if Version(sklearn_version) >= '0.18':
    from sklearn.feature_selection import SelectFromModel
    sfm = SelectFromModel(forest, threshold=0.15, prefit=True)
    X_selected = sfm.transform(X_train)
else:
    X_selected = forest.transform(X_train, threshold=0.15)

X_selected.shape


# Now, let's print the 3 features that met the threshold criterion for feature selection that we set earlier (note that this code snippet does not appear in the actual book but was added to this notebook later for illustrative purposes):

# In[45]:

# The selected features are the first entries of the descending ranking
n_selected = X_selected.shape[1]
for rank, feat_idx in enumerate(indices[:n_selected], start=1):
    print("%2d) %-*s %f" % (rank, 30,
                            feat_labels[feat_idx],
                            importances[feat_idx]))

#
#
# # Summary # ...