#!/usr/bin/env python
# coding: utf-8

# # Introduction to Machine Learning using `scikit-learn`

# ### This workshop uses the dataset from the Kaggle Otto Group Product Classification Challenge.
#
# ### We will learn how to:
# * Import data using `pandas`
# * Build an initial linear model
# * Understand how to evaluate the model
# * Build on the linear model by adding cross-validation and regularization
# * Build logistic regression models - with cross-validation and regularization
# * Build decision tree models
# * Build random forest models

# In[103]:

import numpy as np
import pandas as pd
import matplotlib


# ### Read in the datasets.
# [source](https://www.kaggle.com/c/otto-group-product-classification-challenge/data)

# In[104]:

train_pd = pd.read_csv("data/train.csv")
test_pd = pd.read_csv("data/test.csv")


# ### Let's see how many rows and columns the train and test datasets have

# In[105]:

train_pd.shape


# In[106]:

test_pd.shape


# ### Taking a peek at the training dataset

# In[107]:

train_pd.head()


# ### What are the types of the columns in the training dataset?

# In[108]:

train_pd.info()


# ### Summary statistics of the training dataset

# In[109]:

train_pd.describe()


# ### How many classes are there in the target?

# In[110]:

train_pd.target.unique()


# ### What is the distribution of the target?

# In[111]:

train_pd.target.value_counts()


# ### For creating models, the target has to be a numeric value.
# #### Using the `preprocessing` module to create a binary column for each of the classes.

# In[112]:

from sklearn import preprocessing


# In[113]:

labels = train_pd.target.values
enc = preprocessing.LabelBinarizer()
binarized_labels = enc.fit_transform(labels)


# In[114]:

binarized_labels[0:10]


# In[115]:

train_pd[train_pd.target == "Class_2"]


# In[116]:

binarized_labels[1920:1940]


# ### Creating the train and target datasets
# ### The models are built for *Class_2*, which is column 1 of the binarized labels.

# In[117]:

target = binarized_labels[:, 1]
train_pd = train_pd.drop("id", axis=1)
train_pd = train_pd.drop("target", axis=1)


# ## Model 1: Linear Regression

# In[118]:

from sklearn import linear_model


# In[119]:

ols = linear_model.LinearRegression(normalize=True, fit_intercept=True)


# In[120]:

get_ipython().run_line_magic('timeit', '-n 1 ols.fit(train_pd, target, n_jobs=-1)')


# In[121]:

ols.coef_


# In[122]:

ols_predict = ols.predict(train_pd)


# In[123]:

ols_predict
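# #### Aside (an illustrative check, not part of the original workshop flow): a fitted `LinearRegression` prediction is just the linear combination of the inputs with the learned coefficients plus the intercept. `manual_predict` is a name introduced here for illustration.

# In[ ]:

# Illustrative sketch: rebuild the predictions by hand from coef_ and intercept_;
# this should agree with ols.predict(train_pd).
manual_predict = train_pd.values.dot(ols.coef_) + ols.intercept_
np.allclose(manual_predict, ols_predict)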
# #### Setting 0.5 as the decision boundary. If a predicted value is above 0.5, the prediction is set to 1, meaning the record belongs to Class_2; otherwise the prediction is set to 0.

# In[124]:

ols_predict_raw = ols_predict.copy()
ols_predict[ols_predict > 0.5] = 1
ols_predict[ols_predict <= 0.5] = 0


# In[125]:

ols_predict


# In[126]:

ols_model_analysis = pd.concat([pd.Series(target), pd.Series(ols_predict)], axis=1)


# In[127]:

ols_model_analysis.columns = ['actual', 'prediction']


# In[128]:

ols_model_analysis[ols_model_analysis.actual == 1]


# ### Model Evaluation

# In[129]:

# Each confusion-matrix cell is the *count* of rows that fall into it.
true_positives = ols_model_analysis[(ols_model_analysis.actual == 1) &
                                    (ols_model_analysis.prediction == 1)].shape[0]
true_positives


# In[130]:

true_negatives = ols_model_analysis[(ols_model_analysis.actual == 0) &
                                    (ols_model_analysis.prediction == 0)].shape[0]
true_negatives


# In[131]:

false_positives = ols_model_analysis[(ols_model_analysis.actual == 0) &
                                     (ols_model_analysis.prediction == 1)].shape[0]
false_positives


# In[132]:

false_negatives = ols_model_analysis[(ols_model_analysis.actual == 1) &
                                     (ols_model_analysis.prediction == 0)].shape[0]
false_negatives


# In[133]:

precision = float(true_positives) / (true_positives + false_positives)
precision


# In[134]:

recall = float(true_positives) / (true_positives + false_negatives)
recall


# In[135]:

from sklearn import metrics


# In[136]:

get_ipython().run_line_magic('pylab', 'inline')


# In[137]:

# ROC curve built from the raw (un-thresholded) predictions.
ols_auc = metrics.roc_auc_score(ols_model_analysis.actual, ols_predict_raw)
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_predict_raw)
pyplot.plot(fpr, tpr)
pyplot.plot([0, 1], [0, 1])


# In[138]:

# The "curve" from the thresholded 0/1 predictions has only a single operating point.
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_model_analysis.prediction)
pyplot.plot(fpr, tpr)
pyplot.plot([0, 1], [0, 1])


# In[139]:

ols_auc


# In[140]:

ols_f1 = float(2 * true_positives) / (2 * true_positives + false_positives + false_negatives)


# In[141]:

ols_f1
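# #### Aside (an illustrative cross-check): `sklearn.metrics` also provides `confusion_matrix`, `precision_score`, `recall_score`, and `f1_score`, which should reproduce the hand-computed values above.

# In[ ]:

# Cross-check the manual confusion-matrix counts against the built-in metrics.
print metrics.confusion_matrix(ols_model_analysis.actual, ols_model_analysis.prediction)
print metrics.precision_score(ols_model_analysis.actual, ols_model_analysis.prediction)
print metrics.recall_score(ols_model_analysis.actual, ols_model_analysis.prediction)
print metrics.f1_score(ols_model_analysis.actual, ols_model_analysis.prediction)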
# ## Train and Validate Datasets. Cross-validation
#
# * Split the training data 80:20
# * Train on the 80%, predict on the 20%
# * Cross-validation

# In[142]:

from sklearn.cross_validation import train_test_split


# In[143]:

sample_train, sample_validate, sample_train_target, sample_validate_target = train_test_split(
    train_pd, target, test_size=0.2, random_state=123)


# In[144]:

sample_train.shape


# In[145]:

sample_validate.shape


# In[146]:

sample_train_target.shape


# In[147]:

sample_validate_target.shape


# In[148]:

ols = linear_model.LinearRegression(normalize=True)
get_ipython().run_line_magic('timeit', '-n 1 ols.fit(sample_train, sample_train_target, n_jobs=-1)')
ols_sample_predict = ols.predict(sample_validate)
ols_predict_raw = ols_sample_predict.copy()
ols_sample_predict[ols_sample_predict > 0.5] = 1
ols_sample_predict[ols_sample_predict <= 0.5] = 0
ols_model_analysis = pd.concat([pd.Series(sample_validate_target), pd.Series(ols_sample_predict)], axis=1)
ols_model_analysis.columns = ['actual', 'prediction']
true_positives = ols_model_analysis[(ols_model_analysis.actual == 1) &
                                    (ols_model_analysis.prediction == 1)].shape[0]
true_negatives = ols_model_analysis[(ols_model_analysis.actual == 0) &
                                    (ols_model_analysis.prediction == 0)].shape[0]
false_positives = ols_model_analysis[(ols_model_analysis.actual == 0) &
                                     (ols_model_analysis.prediction == 1)].shape[0]
false_negatives = ols_model_analysis[(ols_model_analysis.actual == 1) &
                                     (ols_model_analysis.prediction == 0)].shape[0]
precision = float(true_positives) / (true_positives + false_positives)
recall = float(true_positives) / (true_positives + false_negatives)
ols_auc = metrics.roc_auc_score(ols_model_analysis.actual, ols_model_analysis.prediction)
ols_f1 = float(2 * true_positives) / (2 * true_positives + false_positives + false_negatives)


# In[149]:

print precision
print recall
print ols_auc
print ols_f1


# In[150]:

fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_predict_raw)
pyplot.plot(fpr, tpr)
pyplot.plot([0, 1], [0, 1])
fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_model_analysis.prediction)
pyplot.plot(fpr, tpr)
pyplot.plot([0, 1], [0, 1])


# ### `sklearn` has a `cross_validation` module to create the k-fold datasets

# In[151]:

from sklearn.cross_validation import KFold


# In[152]:

ols_kf = KFold(n=train_pd.shape[0], n_folds=5, shuffle=True)


# In[153]:

ols_kf
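# #### Aside (illustrative): iterating over a `KFold` object yields a pair of index arrays per fold - one for the training folds and one for the held-out fold. A tiny example with 10 rows and 5 folds; `demo_kf` is a name introduced here.

# In[ ]:

# Every row index appears exactly once across the five validation arrays.
demo_kf = KFold(n=10, n_folds=5, shuffle=True)
for demo_train_index, demo_validate_index in demo_kf:
    print demo_train_index, demo_validate_index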
# In[154]:

ols_cv_metrics = []
for train_index, validate_index in ols_kf:
    sample_train, sample_validate = train_pd.loc[train_index], train_pd.loc[validate_index]
    sample_train_target, sample_validate_target = target[train_index], target[validate_index]
    ols = linear_model.LinearRegression(normalize=True)
    get_ipython().run_line_magic('timeit', '-n 1 ols.fit(sample_train, sample_train_target, n_jobs=-1)')
    ols_sample_predict = ols.predict(sample_validate)
    ols_predict_raw = ols_sample_predict.copy()
    ols_sample_predict[ols_sample_predict > 0.5] = 1
    ols_sample_predict[ols_sample_predict <= 0.5] = 0
    ols_model_analysis = pd.concat([pd.Series(sample_validate_target), pd.Series(ols_sample_predict)], axis=1)
    ols_model_analysis.columns = ['actual', 'prediction']
    true_positives = ols_model_analysis[(ols_model_analysis.actual == 1) &
                                        (ols_model_analysis.prediction == 1)].shape[0]
    true_negatives = ols_model_analysis[(ols_model_analysis.actual == 0) &
                                        (ols_model_analysis.prediction == 0)].shape[0]
    false_positives = ols_model_analysis[(ols_model_analysis.actual == 0) &
                                         (ols_model_analysis.prediction == 1)].shape[0]
    false_negatives = ols_model_analysis[(ols_model_analysis.actual == 1) &
                                         (ols_model_analysis.prediction == 0)].shape[0]
    precision = float(true_positives) / (true_positives + false_positives)
    recall = float(true_positives) / (true_positives + false_negatives)
    ols_auc = metrics.roc_auc_score(ols_model_analysis.actual, ols_model_analysis.prediction)
    ols_f1 = float(2 * true_positives) / (2 * true_positives + false_positives + false_negatives)
    ols_cv_metrics.append((precision, recall, ols_auc, ols_f1))


# In[155]:

ols_metric_pd = pd.DataFrame(ols_cv_metrics).mean()


# In[156]:

ols_metrics_pd = pd.DataFrame(ols_metric_pd).T
ols_metrics_pd.columns = ['precision', 'recall', 'auc', 'f1']
print ols_metrics_pd


# In[157]:

fpr, tpr, thresholds = metrics.roc_curve(ols_model_analysis.actual, ols_predict_raw)
pyplot.plot(fpr, tpr)
pyplot.plot([0, 1], [0, 1])


# ## Regularization
# ### Regularization is used to avoid overfitting by penalizing large coefficients.
# * Ridge (L2 penalty)
# * Lasso / LARS (L1 penalty)

# ## Ridge Regression

# In[158]:

from sklearn.linear_model import Ridge


# In[159]:

ridge = Ridge(normalize=True)


# In[160]:

get_ipython().run_line_magic('timeit', '-n 1 ridge.fit(sample_train, sample_train_target)')


# In[161]:

ridge_sample_predict = ridge.predict(sample_validate)


# In[162]:

ridge_sample_predict


# In[163]:

ridge_sample_predict[ridge_sample_predict > 0.5] = 1
ridge_sample_predict[ridge_sample_predict <= 0.5] = 0


# In[164]:

ridge_sample_predict


# In[165]:

ridge_model_analysis = pd.concat([pd.Series(sample_validate_target), pd.Series(ridge_sample_predict)], axis=1)


# In[166]:

ridge_model_analysis.columns = ['actual', 'prediction']


# In[167]:

ridge_model_analysis.head()


# In[168]:

ridge_model_analysis[ridge_model_analysis.prediction == 1]


# In[169]:

true_positives = ridge_model_analysis[(ridge_model_analysis.actual == 1) &
                                      (ridge_model_analysis.prediction == 1)].shape[0]
true_negatives = ridge_model_analysis[(ridge_model_analysis.actual == 0) &
                                      (ridge_model_analysis.prediction == 0)].shape[0]
false_positives = ridge_model_analysis[(ridge_model_analysis.actual == 0) &
                                       (ridge_model_analysis.prediction == 1)].shape[0]
false_negatives = ridge_model_analysis[(ridge_model_analysis.actual == 1) &
                                       (ridge_model_analysis.prediction == 0)].shape[0]


# In[170]:

precision = float(true_positives) / (true_positives + false_positives)
recall = float(true_positives) / (true_positives + false_negatives)
ridge_auc = metrics.roc_auc_score(ridge_model_analysis.actual, ridge_model_analysis.prediction)
ridge_f1 = float(2 * true_positives) / (2 * true_positives + false_positives + false_negatives)


# In[171]:

print precision
print recall
print ridge_auc
print ridge_f1
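# #### Aside (illustrative): the strength of the L2 penalty is controlled by `alpha`; larger values shrink the coefficients harder. A minimal sketch - `demo_alpha` and `demo_ridge` are names introduced here.

# In[ ]:

# The total coefficient magnitude should drop as alpha grows.
for demo_alpha in [0.01, 1.0, 100.0]:
    demo_ridge = Ridge(alpha=demo_alpha, normalize=True)
    demo_ridge.fit(sample_train, sample_train_target)
    print demo_alpha, np.abs(demo_ridge.coef_).sum()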
# In[172]:

# Creating a helper function for the cross-validation process.
def cross_validate(clf, train_pd, target):
    clf_cv_metrics = []
    kf = KFold(n=train_pd.shape[0], n_folds=10, shuffle=True)
    for train_index, validate_index in kf:
        sample_train, sample_validate = train_pd.loc[train_index], train_pd.loc[validate_index]
        sample_train_target, sample_validate_target = target[train_index], target[validate_index]
        # Fit on the training folds and predict on the held-out fold.
        clf.fit(sample_train, sample_train_target)
        predict = clf.predict(sample_validate)
        predict_raw = predict.copy()
        # Thresholding only matters for regressors; classifiers already return 0/1.
        predict[predict > 0.5] = 1
        predict[predict <= 0.5] = 0
        clf_model_analysis = pd.concat([pd.Series(sample_validate_target), pd.Series(predict)], axis=1)
        clf_model_analysis.columns = ['actual', 'prediction']
        true_positives = clf_model_analysis[(clf_model_analysis.actual == 1) &
                                            (clf_model_analysis.prediction == 1)].shape[0]
        true_negatives = clf_model_analysis[(clf_model_analysis.actual == 0) &
                                            (clf_model_analysis.prediction == 0)].shape[0]
        false_positives = clf_model_analysis[(clf_model_analysis.actual == 0) &
                                             (clf_model_analysis.prediction == 1)].shape[0]
        false_negatives = clf_model_analysis[(clf_model_analysis.actual == 1) &
                                             (clf_model_analysis.prediction == 0)].shape[0]
        precision = float(true_positives) / float(true_positives + false_positives)
        recall = float(true_positives) / float(true_positives + false_negatives)
        clf_auc = metrics.roc_auc_score(clf_model_analysis.actual, clf_model_analysis.prediction)
        clf_f1 = float(2 * true_positives) / float(2 * true_positives + false_positives + false_negatives)
        clf_cv_metrics.append((precision, recall, clf_auc, clf_f1))
    # Plot the ROC curve for the last fold's raw predictions.
    fpr, tpr, thresholds = metrics.roc_curve(clf_model_analysis.actual, predict_raw)
    pyplot.plot(fpr, tpr)
    pyplot.plot([0, 1], [0, 1])
    return clf_cv_metrics


# In[173]:

ridge = Ridge(normalize=True)
ridge_cv_metrics = cross_validate(ridge, train_pd, target)


# In[174]:

ridge_metrics = pd.DataFrame(ridge_cv_metrics).mean()
ridge_metrics_pd = pd.DataFrame(ridge_metrics).T
ridge_metrics_pd.columns = ['precision', 'recall', 'auc', 'f1']
print ridge_metrics_pd


# In[175]:

ols_metrics_pd


# ## LASSO

# In[176]:

lasso = linear_model.Lasso(alpha=0.1, selection='random')


# In[177]:

lasso_cv_metrics = cross_validate(lasso, train_pd, target)


# In[178]:

lasso_cv_metrics = pd.DataFrame(lasso_cv_metrics).mean()
lasso_cv_metrics_pd = pd.DataFrame(lasso_cv_metrics).T
lasso_cv_metrics_pd.columns = ['precision', 'recall', 'auc', 'f1']
print lasso_cv_metrics_pd


# ## Model 2: Logistic Regression

# In[179]:

log_reg_l2 = linear_model.LogisticRegression(C=1, penalty='l2')


# In[180]:

log_reg_l2_cv = cross_validate(log_reg_l2, train_pd, target)
log_reg_l2_cv_metrics = pd.DataFrame(log_reg_l2_cv).mean()
log_reg_l2_metrics_pd = pd.DataFrame(log_reg_l2_cv_metrics).T
log_reg_l2_metrics_pd.columns = ['precision', 'recall', 'auc', 'f1']
print log_reg_l2_metrics_pd


# In[181]:

log_reg_l1 = linear_model.LogisticRegression(C=0.1, penalty='l1')
log_reg_l1_cv_metrics = cross_validate(log_reg_l1, train_pd, target)


# In[182]:

log_reg_l1_metrics = pd.DataFrame(log_reg_l1_cv_metrics).mean()
log_reg_l1_metrics_pd = pd.DataFrame(log_reg_l1_metrics).T
log_reg_l1_metrics_pd.columns = ['precision', 'recall', 'auc', 'f1']
print log_reg_l1_metrics_pd
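# #### Aside (illustrative): unlike linear regression, logistic regression models the class probability directly by passing the linear combination through the sigmoid 1 / (1 + exp(-z)), so its raw scores are already in [0, 1]. A minimal sketch - `demo_scores` and `demo_probs` are names introduced here, and the probabilities are recomputed by hand to compare against `predict_proba`.

# In[ ]:

# Refit on the 80% split and recompute P(Class_2 | x) manually; this should
# match the second column of predict_proba.
log_reg_l2.fit(sample_train, sample_train_target)
demo_scores = np.asarray(sample_validate).dot(log_reg_l2.coef_.ravel()) + log_reg_l2.intercept_[0]
demo_probs = 1.0 / (1.0 + np.exp(-demo_scores))
np.allclose(demo_probs, log_reg_l2.predict_proba(sample_validate)[:, 1])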
# ## Model 3: Decision Trees

# In[183]:

from sklearn.tree import DecisionTreeClassifier, export_graphviz


# In[184]:

dtc = DecisionTreeClassifier(max_depth=8)


# In[185]:

dtc.fit(train_pd, target)


# In[186]:

dtc_cv_metric = cross_validate(dtc, train_pd, target)


# In[187]:

dtc_metrics = pd.DataFrame(dtc_cv_metric).mean()
dtc_metrics_pd = pd.DataFrame(dtc_metrics).T
dtc_metrics_pd.columns = ['precision', 'recall', 'auc', 'f1']
print dtc_metrics_pd


# In[188]:

export_graphviz(dtc, out_file='tree.dot', feature_names=train_pd.columns)


# In[189]:

get_ipython().system('dot -Tpng tree.dot -o tree.png')


# In[190]:

from IPython.display import Image


# In[191]:

Image('tree.png', unconfined=True)


# ## Model 4: Random Forests

# In[192]:

from sklearn.ensemble import RandomForestClassifier


# In[193]:

rf_clf = RandomForestClassifier(n_jobs=-1)


# In[194]:

rf_clf_cv_metric = cross_validate(rf_clf, train_pd, target)


# In[195]:

rf_clf_cv_metrics = pd.DataFrame(rf_clf_cv_metric).mean()
rf_clf_cv_metrics_pd = pd.DataFrame(rf_clf_cv_metrics).T
rf_clf_cv_metrics_pd.columns = ['precision', 'recall', 'auc', 'f1']
print rf_clf_cv_metrics_pd


# ## Model Comparison

# In[196]:

frames = [ols_metrics_pd, ridge_metrics_pd, lasso_cv_metrics_pd,
          log_reg_l1_metrics_pd, log_reg_l2_metrics_pd,
          dtc_metrics_pd, rf_clf_cv_metrics_pd]
model_analysis = pd.concat(frames)
model_analysis['model_name'] = ['Linear Regression', 'L2 Linear Reg', 'L1 Linear Reg',
                                'L1 Logistic Reg', 'L2 Logistic Reg',
                                'Decision Tree', 'Random Forest']
model_analysis
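# #### Aside (illustrative): a fitted `RandomForestClassifier` exposes `feature_importances_`, which can hint at which of the 93 features drive the Class_2 predictions. `demo_importances` is a name introduced here.

# In[ ]:

# Refit on the full training data and list the ten most important features.
rf_clf.fit(train_pd, target)
demo_importances = pd.Series(rf_clf.feature_importances_, index=train_pd.columns)
print demo_importances.sort_values(ascending=False).head(10)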