%matplotlib inline
from IPython.display import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sklearn
from sklearn import cross_validation
from sklearn import tree
from sklearn import svm
from sklearn import ensemble
from sklearn import neighbors
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
import seaborn as sns

plt.style.use('fivethirtyeight')            # Good looking plots
pd.set_option('display.max_columns', None)  # Display any number of columns

_DATA_DIR = 'data'
_CHURN_DATA_PATH = os.path.join(_DATA_DIR, 'churn.csv')

df = pd.read_csv(_CHURN_DATA_PATH)
df.head()

# Discrete value integer encoder
label_encoder = preprocessing.LabelEncoder()

# Get the labels as integers
df['Churn'] = df['Churn?'] == 'True.'
y = df['Churn'].as_matrix().astype(np.int)

# State is a string and we want discrete integer values
df['State'] = label_encoder.fit_transform(df['State'])

# Drop the redundant columns from the dataframe
df.drop(['Area Code', 'Phone', 'Churn?', 'Churn'], axis=1, inplace=True)

# Get the features as integers, similar to what we did for the labels (targets)
df[["Int'l Plan", "VMail Plan"]] = df[["Int'l Plan", "VMail Plan"]] == 'yes'

print('There are {} instances for the churn class and {} instances for the not-churn class.'.format(y.sum(), y.shape[0] - y.sum()))
print('Ratio of churn class over all instances: {:.2f}'.format(float(y.sum()) / y.shape[0]))

df.head()

X = df.as_matrix().astype(np.float)
X
X.shape

scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X

cv_url = 'http://i.imgur.com/N9HZktu.png'
Image(url=cv_url)

def stratified_cv(X, y, clf_class, shuffle=True, n_folds=10, **kwargs):
    stratified_k_fold = cross_validation.StratifiedKFold(y, n_folds=n_folds, shuffle=shuffle)
    y_pred = y.copy()
    # Fit a fresh classifier on each training fold and predict the held-out fold
    for ii, jj in stratified_k_fold:
        X_train, X_test = X[ii], X[jj]
        y_train = y[ii]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[jj] = clf.predict(X_test)
    return y_pred

print('Passive Aggressive Classifier: {:.2f}'.format(metrics.accuracy_score(y, stratified_cv(X, y, linear_model.PassiveAggressiveClassifier))))
print('Gradient Boosting Classifier:  {:.2f}'.format(metrics.accuracy_score(y, stratified_cv(X, y, ensemble.GradientBoostingClassifier))))
print('Support Vector Machine (SVM):  {:.2f}'.format(metrics.accuracy_score(y, stratified_cv(X, y, svm.SVC))))
print('Random Forest Classifier:      {:.2f}'.format(metrics.accuracy_score(y, stratified_cv(X, y, ensemble.RandomForestClassifier))))
print('K Nearest Neighbor Classifier: {:.2f}'.format(metrics.accuracy_score(y, stratified_cv(X, y, neighbors.KNeighborsClassifier))))
print('Logistic Regression:           {:.2f}'.format(metrics.accuracy_score(y, stratified_cv(X, y, linear_model.LogisticRegression))))
print('Dumb Classifier:               {:.2f}'.format(metrics.accuracy_score(y, [0 for ii in y.tolist()])))
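# Note: sklearn.cross_validation is deprecated as of scikit-learn 0.18 and removed in 0.20.
# As a hedged aside (not part of the original notebook), a minimal sketch of the same helper
# against the newer sklearn.model_selection API, assuming scikit-learn >= 0.18, looks like this:
from sklearn import model_selection

def stratified_cv_modern(X, y, clf_class, shuffle=True, n_splits=10, **kwargs):
    # StratifiedKFold is now configured without y; the labels are passed to split() instead
    skf = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=shuffle)
    y_pred = y.copy()
    for train_idx, test_idx in skf.split(X, y):
        clf = clf_class(**kwargs)
        clf.fit(X[train_idx], y[train_idx])
        y_pred[test_idx] = clf.predict(X[test_idx])
    return y_pred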
pass_agg_conf_matrix      = metrics.confusion_matrix(y, stratified_cv(X, y, linear_model.PassiveAggressiveClassifier))
grad_ens_conf_matrix      = metrics.confusion_matrix(y, stratified_cv(X, y, ensemble.GradientBoostingClassifier))
decision_conf_matrix      = metrics.confusion_matrix(y, stratified_cv(X, y, tree.DecisionTreeClassifier))
ridge_clf_conf_matrix     = metrics.confusion_matrix(y, stratified_cv(X, y, linear_model.RidgeClassifier))
svm_svc_conf_matrix       = metrics.confusion_matrix(y, stratified_cv(X, y, svm.SVC))
random_forest_conf_matrix = metrics.confusion_matrix(y, stratified_cv(X, y, ensemble.RandomForestClassifier))
k_neighbors_conf_matrix   = metrics.confusion_matrix(y, stratified_cv(X, y, neighbors.KNeighborsClassifier))
logistic_reg_conf_matrix  = metrics.confusion_matrix(y, stratified_cv(X, y, linear_model.LogisticRegression))
dumb_conf_matrix          = metrics.confusion_matrix(y, [0 for ii in y.tolist()]);  # ignore the warning as they are all 0

conf_matrix = {
    1: {'matrix': pass_agg_conf_matrix,      'title': 'Passive Aggressive'},
    2: {'matrix': grad_ens_conf_matrix,      'title': 'Gradient Boosting'},
    3: {'matrix': decision_conf_matrix,      'title': 'Decision Tree'},
    4: {'matrix': ridge_clf_conf_matrix,     'title': 'Ridge'},
    5: {'matrix': svm_svc_conf_matrix,       'title': 'Support Vector Machine'},
    6: {'matrix': random_forest_conf_matrix, 'title': 'Random Forest'},
    7: {'matrix': k_neighbors_conf_matrix,   'title': 'K Nearest Neighbors'},
    8: {'matrix': logistic_reg_conf_matrix,  'title': 'Dumb' if False else 'Logistic Regression'},
    9: {'matrix': dumb_conf_matrix,          'title': 'Dumb'},
}

fig, ax = plt.subplots(figsize=(16, 12))
plt.suptitle('Confusion Matrix of Various Classifiers')
for ii, values in conf_matrix.items():
    matrix = values['matrix']
    title = values['title']
    plt.subplot(3, 3, ii)  # subplot indices start from 1
    plt.title(title)
    sns.heatmap(matrix, annot=True, fmt='')

print('Passive Aggressive Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, linear_model.PassiveAggressiveClassifier))))
print('Gradient Boosting Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, ensemble.GradientBoostingClassifier))))
print('Support Vector Machine (SVM):\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, svm.SVC))))
print('Random Forest Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, ensemble.RandomForestClassifier))))
print('K Nearest Neighbor Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, neighbors.KNeighborsClassifier))))
print('Logistic Regression:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, linear_model.LogisticRegression))))
print('Dumb Classifier:\n {}\n'.format(metrics.classification_report(y, [0 for ii in y.tolist()])));  # ignore the warning as they are all 0

gbc = ensemble.GradientBoostingClassifier()
gbc.fit(X, y)

# Get feature importance from the classifier
feature_importance = gbc.feature_importances_
# Normalize the features
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(16, 12))
plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
plt.yticks(pos, np.asanyarray(df.columns.tolist())[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
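# Note: the StandardScaler above (and the PolynomialFeatures below) is fit on the full data set
# before cross-validation, so a little information from the held-out folds leaks into the
# preprocessing. As a hedged aside (not the original author's approach), chaining the
# preprocessing with an estimator in a Pipeline keeps the preprocessing inside each fold:
from sklearn.pipeline import make_pipeline

# Hypothetical example: scale, expand, and classify as a single estimator object
churn_pipeline = make_pipeline(preprocessing.StandardScaler(),
                               preprocessing.PolynomialFeatures(),
                               linear_model.LogisticRegression())
# churn_pipeline.fit(...) / churn_pipeline.predict(...) then behave like any other classifier,
# so the scaler and the polynomial expansion are refit on each training fold.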
X = df.as_matrix().astype(np.float)
polynomial_features = preprocessing.PolynomialFeatures()
X = polynomial_features.fit_transform(X)
X.shape

print('Passive Aggressive Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, linear_model.PassiveAggressiveClassifier))))
print('Gradient Boosting Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, ensemble.GradientBoostingClassifier))))
print('Support Vector Machine (SVM):\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, svm.SVC))))
print('Random Forest Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, ensemble.RandomForestClassifier))))
print('K Nearest Neighbor Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, neighbors.KNeighborsClassifier))))
print('Logistic Regression:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, linear_model.LogisticRegression))))
print('Dumb Classifier:\n {}\n'.format(metrics.classification_report(y, [0 for ii in y.tolist()])));  # ignore the warning as they are all 0

X = df.as_matrix().astype(np.float)
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
polynomial_features = preprocessing.PolynomialFeatures()
X = polynomial_features.fit_transform(X)

print('Passive Aggressive Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, linear_model.PassiveAggressiveClassifier))))
print('Gradient Boosting Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, ensemble.GradientBoostingClassifier))))
print('Support Vector Machine (SVM):\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, svm.SVC))))
print('Random Forest Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, ensemble.RandomForestClassifier))))
print('K Nearest Neighbor Classifier:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, neighbors.KNeighborsClassifier))))
print('Logistic Regression:\n {}\n'.format(metrics.classification_report(y, stratified_cv(X, y, linear_model.LogisticRegression))))
print('Dumb Classifier:\n {}\n'.format(metrics.classification_report(y, [0 for ii in y.tolist()])));  # ignore the warning as they are all 0

from sklearn.datasets.california_housing import fetch_california_housing

california_housing = fetch_california_housing()
california_housing_data = california_housing['data']
california_housing_labels = california_housing['target']  # 'target' variables
california_housing_feature_names = california_housing['feature_names']
california_housing_feature_names

from sklearn import cross_validation

X_train, X_test, y_train, y_test = cross_validation.train_test_split(california_housing_data,
                                                                     california_housing_labels,
                                                                     test_size=0.2,
                                                                     random_state=0)
print(X_train.shape)
print(X_test.shape)
print('Training/Test Ratio: {}'.format(X_train.shape[0] / X_test.shape[0]))

parameters = {
    'n_estimators': 500,
    'max_depth': 4,
    'min_samples_split': 2,
    'learning_rate': 0.01,
    'loss': 'ls',
}

from sklearn import ensemble
from sklearn import metrics

classifier = ensemble.GradientBoostingRegressor(**parameters)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
mse = metrics.mean_squared_error(y_test, predictions)
print('Mean Squared Error: {:.3f}'.format(mse))

### This may take some time, but it will be worth it, I promise
parameters = {
    'n_estimators': 3000,
    'max_depth': 6,
    'learning_rate': 0.04,
    'loss': 'huber',
}

classifier = ensemble.GradientBoostingRegressor(**parameters)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
mse = metrics.mean_squared_error(y_test, predictions)
print('Mean Squared Error: {:.3f}'.format(mse))

plt.figure(figsize=(16, 12))
plt.scatter(range(predictions.shape[0]), predictions, label='predictions', c='#348ABD', alpha=0.4)
plt.scatter(range(y_test.shape[0]), y_test, label='actual values', c='#A60628', alpha=0.4)
plt.ylim([y_test.min(), predictions.max()])
plt.xlim([0, predictions.shape[0]])
plt.legend();
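# As a hedged aside (not in the original notebook), the same hold-out predictions can also be
# scored with other standard regression metrics from sklearn.metrics for easier interpretation:
mae = metrics.mean_absolute_error(y_test, predictions)
r2 = metrics.r2_score(y_test, predictions)
print('Mean Absolute Error: {:.3f}'.format(mae))
print('R^2 Score: {:.3f}'.format(r2))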
test_score = [classifier.loss_(y_test, y_pred) for y_pred in classifier.staged_decision_function(X_test)]

plt.figure(figsize=(16, 12))
plt.title('Deviance')
plt.plot(np.arange(parameters['n_estimators']) + 1, classifier.train_score_, c='#348ABD', label='Training Set Deviance')
plt.plot(np.arange(parameters['n_estimators']) + 1, test_score, c='#A60628', label='Test Set Deviance')
plt.annotate('Overfit Point', xy=(600, test_score[600]), xycoords='data',
             xytext=(420, 0.06), textcoords='data',
             arrowprops=dict(arrowstyle='->', connectionstyle='arc'),
             )
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

# Get feature importance from the classifier
feature_importance = classifier.feature_importances_
# Normalize the features
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(16, 12))
plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
plt.yticks(pos, np.asanyarray(california_housing_feature_names)[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

features = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms', ('AveOccup', 'HouseAge')]
fig, ax = ensemble.partial_dependence.plot_partial_dependence(classifier, X_train, features,
                                                              feature_names=california_housing_feature_names,
                                                              figsize=(16, 12));
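# Note: in newer scikit-learn releases the partial dependence utilities moved out of
# sklearn.ensemble. As a hedged aside, assuming scikit-learn >= 1.0 (which is incompatible with
# the sklearn.cross_validation imports used above), the equivalent call would look roughly like:
#
#     from sklearn.inspection import PartialDependenceDisplay
#     PartialDependenceDisplay.from_estimator(classifier, X_train, features,
#                                             feature_names=california_housing_feature_names)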