%load_ext watermark

%watermark -d -u -v -p scikit-learn,pandas,scipy,matplotlib

clf = Pipeline(steps=[
    ('scaler', StandardScaler()),    
    ('reduce_dim', PCA(n_components=2)),
    ('classifier', GaussianNB())   
    ])

clf.fit(X_train, y_train)      # fitting on the training dataset
pred = clf_lda.predict(X_test) # classifying the test dataset

import pandas as pd

df = pd.io.parsers.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', 
    header=None, 
    sep=',', 
    )
df.dropna(how="all", inplace=True) # to drop the empty line at file-end

feature_dict = {i:label for i,label in zip(
            range(4),
              ('sepal length in cm', 
              'sepal width in cm', 
              'petal length in cm', 
              'petal width in cm', ))}

from sklearn.preprocessing import LabelEncoder

X = df[[0,1,2,3]].values 
y = df[4].values

enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y) + 1

label_dict = {1: 'Setosa', 2: 'Versicolor', 3:'Virginica'}

%matplotlib inline

from matplotlib import pyplot as plt
import numpy as np
import math

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,6))

for ax,cnt in zip(axes.ravel(), range(4)):

    # set bin sizes
    min_b = math.floor(np.min(X[:,cnt]))
    max_b = math.ceil(np.max(X[:,cnt]))
    bins = np.linspace(min_b, max_b, 25)

    # plottling the histograms
    for lab,col in zip(range(1,4), ('blue', 'red', 'green')):
        ax.hist(X[y==lab, cnt],
                   color=col, 
                   label='class %s' %label_dict[lab], 
                   bins=bins,
                   alpha=0.5,)
    ylims = ax.get_ylim()

    # plot annotation
    leg = ax.legend(loc='upper right', fancybox=True, fontsize=8)
    leg.get_frame().set_alpha(0.5)
    ax.set_ylim([0, max(ylims)+2])
    ax.set_xlabel(feature_dict[cnt])
    ax.set_title('Iris histogram #%s' %str(cnt+1))

    # hide axis ticks
    ax.tick_params(axis="both", which="both", bottom="off", top="off",  
            labelbottom="on", left="off", right="off", labelleft="on")

    # remove axis spines
    ax.spines["top"].set_visible(False)  
    ax.spines["right"].set_visible(False) 
    ax.spines["bottom"].set_visible(False) 
    ax.spines["left"].set_visible(False)

axes[0][0].set_ylabel('count')
axes[1][0].set_ylabel('count')

fig.tight_layout()

plt.show()

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.40, random_state=12345)

class ColumnSelector(object):
    """ A feature selector for scikit-learn's Pipeline class that returns
        specified columns from a numpy array.
    
    """
    
    def __init__(self, cols):
        self.cols = cols
        
    def transform(self, X, y=None):
        return X[:, self.cols]

    def fit(self, X, y=None):
        return self

from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.lda import LDA
from sklearn.decomposition import PCA

clf_all = Pipeline(steps=[
    ('scaler', StandardScaler()),           
    ('classifier', GaussianNB())   
    ])

clf_petal = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('reduce_dim', ColumnSelector(cols=(2,3))),    
    ('classifier', GaussianNB())   
    ]) 

clf_pca = Pipeline(steps=[
    ('scaler', StandardScaler()),    
    ('reduce_dim', PCA(n_components=2)),
    ('classifier', GaussianNB())   
    ])

clf_lda = Pipeline(steps=[
    ('scaler', StandardScaler()), 
    ('reduce_dim', LDA(n_components=2)),
    ('classifier', GaussianNB())   
    ])

# Constructing the k-fold cross validation iterator (k=5)  

cv = KFold(n=X_train.shape[0],  # total number of samples
           n_folds=5,           # number of folds the dataset is divided into
           random_state=12345)

scores = [
    cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
            for clf in [clf_all, clf_petal, clf_pca, clf_lda]
    ]

for score,label in zip(scores, 
                       ['all attributes', 
                        'Petal dimensions (column 3 & 4)',
                        'PCA dim. red. (n=2)', 
                        'LDA dim. red. (n=2)', 
                        ]
                       ):
    print("Accuracy: {:.2%} (+/- {:.2%}), {:}".format(score.mean(), score.std(), label))

from sklearn import metrics

clf_lda.fit(X_train, y_train)
pred_test = clf_lda.predict(X_test)

print('Prediction accuracy for the test dataset')
print('{:.2%}'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nConfusion Matrix of the Naive Bayes classifier')
print(metrics.confusion_matrix(y_test, clf_lda.predict(X_test)))