%load_ext watermark
%watermark -d -u -v -p scikit-learn,pandas,scipy,matplotlib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

clf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', PCA(n_components=2)),
    ('classifier', GaussianNB())
])

clf.fit(X_train, y_train)   # fitting on the training dataset
pred = clf.predict(X_test)  # classifying the test dataset

import pandas as pd

df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    sep=',',
)
df.dropna(how="all", inplace=True)  # drop the empty line at the end of the file

feature_dict = {i: label for i, label in zip(
    range(4),
    ('sepal length in cm',
     'sepal width in cm',
     'petal length in cm',
     'petal width in cm',))}

from sklearn.preprocessing import LabelEncoder

X = df[[0, 1, 2, 3]].values
y = df[4].values

enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y) + 1

label_dict = {1: 'Setosa', 2: 'Versicolor', 3: 'Virginica'}

%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import math

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 6))

for ax, cnt in zip(axes.ravel(), range(4)):

    # set bin sizes
    min_b = math.floor(np.min(X[:, cnt]))
    max_b = math.ceil(np.max(X[:, cnt]))
    bins = np.linspace(min_b, max_b, 25)

    # plotting the histograms
    for lab, col in zip(range(1, 4), ('blue', 'red', 'green')):
        ax.hist(X[y == lab, cnt],
                color=col,
                label='class %s' % label_dict[lab],
                bins=bins,
                alpha=0.5)
    ylims = ax.get_ylim()

    # plot annotation
    leg = ax.legend(loc='upper right', fancybox=True, fontsize=8)
    leg.get_frame().set_alpha(0.5)
    ax.set_ylim([0, max(ylims) + 2])
    ax.set_xlabel(feature_dict[cnt])
    ax.set_title('Iris histogram #%s' % str(cnt + 1))

    # hide axis ticks (boolean flags; the old "on"/"off" strings are deprecated)
    ax.tick_params(axis="both", which="both",
                   bottom=False, top=False, labelbottom=True,
                   left=False, right=False, labelleft=True)

    # remove axis spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.spines["left"].set_visible(False)

axes[0][0].set_ylabel('count')
axes[1][0].set_ylabel('count')

fig.tight_layout()
plt.show()

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.40,
                                                    random_state=12345)
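Since train_test_split samples at random, the class proportions in the 60/40 split can drift away from the balanced 1:1:1 distribution of the Iris dataset. A minimal sketch of a stratified variant, using the stratify parameter of train_test_split; the _s-suffixed variable names are illustrative additions, not part of the original analysis:

# Illustrative sketch (not part of the original analysis): a stratified
# split that keeps the class proportions of y identical in both subsets.
import numpy as np
from sklearn.model_selection import train_test_split

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.40, random_state=12345, stratify=y)

# verify that each subset keeps the 1:1:1 class ratio of the Iris dataset
print(np.bincount(y_train_s)[1:])  # class counts for labels 1, 2, 3
print(np.bincount(y_test_s)[1:])

With only 150 samples, stratification mainly guards against an unlucky draw; the unstratified split above is kept for the rest of the analysis.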
""" def __init__(self, cols): self.cols = cols def transform(self, X, y=None): return X[:, self.cols] def fit(self, X, y=None): return self from sklearn.cross_validation import cross_val_score, KFold from sklearn.pipeline import Pipeline from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import StandardScaler from sklearn.lda import LDA from sklearn.decomposition import PCA clf_all = Pipeline(steps=[ ('scaler', StandardScaler()), ('classifier', GaussianNB()) ]) clf_petal = Pipeline(steps=[ ('scaler', StandardScaler()), ('reduce_dim', ColumnSelector(cols=(2,3))), ('classifier', GaussianNB()) ]) clf_pca = Pipeline(steps=[ ('scaler', StandardScaler()), ('reduce_dim', PCA(n_components=2)), ('classifier', GaussianNB()) ]) clf_lda = Pipeline(steps=[ ('scaler', StandardScaler()), ('reduce_dim', LDA(n_components=2)), ('classifier', GaussianNB()) ]) # Constructing the k-fold cross validation iterator (k=5) cv = KFold(n=X_train.shape[0], # total number of samples n_folds=5, # number of folds the dataset is divided into random_state=12345) scores = [ cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy') for clf in [clf_all, clf_petal, clf_pca, clf_lda] ] for score,label in zip(scores, ['all attributes', 'Petal dimensions (column 3 & 4)', 'PCA dim. red. (n=2)', 'LDA dim. red. (n=2)', ] ): print("Accuracy: {:.2%} (+/- {:.2%}), {:}".format(score.mean(), score.std(), label)) from sklearn import metrics clf_lda.fit(X_train, y_train) pred_test = clf_lda.predict(X_test) print('Prediction accuracy for the test dataset') print('{:.2%}'.format(metrics.accuracy_score(y_test, pred_test))) print('\nConfusion Matrix of the Naive Bayes classifier') print(metrics.confusion_matrix(y_test, clf_lda.predict(X_test)))