#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import matplotlib.pyplot as plt import warnings warnings.simplefilter(action = "ignore", category = FutureWarning) # ## Fetch the data and load it in pandas # In[2]: import os from urllib.request import urlretrieve url = ("https://archive.ics.uci.edu/ml/machine-learning-databases" "/adult/adult.data") local_filename = os.path.basename(url) if not os.path.exists(local_filename): print("Downloading Adult Census datasets from UCI") urlretrieve(url, local_filename) # In[3]: names = ("age, workclass, fnlwgt, education, education-num, " "marital-status, occupation, relationship, race, sex, " "capital-gain, capital-loss, hours-per-week, " "native-country, income").split(', ') data = pd.read_csv(local_filename, names=names) # In[4]: data.head() # In[5]: data.count() # In[6]: data.describe() # In[7]: data.groupby('occupation').size().nlargest(10) # In[8]: data.groupby('native-country').size().nlargest(10) # In[9]: data.hist(column='education-num', bins=15); # In[10]: data.hist(column='age', bins=15); # In[11]: data.hist('hours-per-week', bins=15); # In[12]: data.plot(x='age', y='hours-per-week', kind='scatter', alpha=0.02, s=50); # In[13]: data.groupby('income')['income'].count() # In[14]: np.mean(data['income'] == ' >50K') # In[15]: data['income'].unique() # In[16]: data = data.dropna() # In[17]: data['income'].unique() # In[18]: target_names = data['income'].unique() target_names # In[19]: low_income = data[data['income'] == ' <=50K'] high_income = data[data['income'] == ' >50K'] bins = np.linspace(10, 90, 20) plt.hist(low_income['age'].values, bins=bins, alpha=0.5, label='<=50K') plt.hist(high_income['age'].values, bins=bins, alpha=0.5, label='>50K') plt.legend(loc='best'); # In[20]: plt.scatter(low_income['age'], low_income['hours-per-week'], alpha=0.03, s=50, c='b', label='<=50K'); plt.scatter(high_income['age'], high_income['hours-per-week'], alpha=0.03, s=50, c='g', label='>50K'); plt.legend() plt.xlabel('age'); plt.ylabel('hours-per-week'); # ## Building predictive models # In[21]: target = data['income'] features_data = data.drop('income', axis=1) # In[22]: numeric_features = [c for c in features_data if features_data[c].dtype.kind in ('i', 'f')] numeric_features # In[23]: numeric_data = features_data[numeric_features] numeric_data.head(5) # In[24]: categorical_data = features_data.drop(numeric_features, 1) categorical_data.head(5) # In[25]: categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0]) categorical_data_encoded.head(5) # In[26]: features = pd.concat([numeric_data, categorical_data_encoded], axis=1) features.head() # In[27]: # Alternatively: use one-hot encoding for categorical features # features = pd.get_dummies(features_data) # features.head() # In[28]: X = features.values.astype(np.float32) y = (target.values == ' >50K').astype(np.int32) # In[29]: X.shape # In[30]: y # In[33]: from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0) # In[34]: from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import cross_val_score clf = DecisionTreeClassifier(max_depth=8) scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc') print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format( np.mean(scores), np.std(scores))) # ## Model error analysis # In[35]: from sklearn.model_selection import learning_curve def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=5, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5), scoring=None): plt.title("Learning curves for %s" % type(estimator).__name__) plt.ylim(*ylim); plt.grid() plt.xlabel("Training examples") plt.ylabel("Score") train_sizes, train_scores, validation_scores = learning_curve( estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring) train_scores_mean = np.mean(train_scores, axis=1) validation_scores_mean = np.mean(validation_scores, axis=1) plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score") plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score") plt.legend(loc="best") print("Best validation score: {:.4f}".format(validation_scores_mean[-1])) # In[36]: clf = DecisionTreeClassifier(max_depth=None) plot_learning_curve(clf, X_train, y_train, scoring='roc_auc') # In[37]: clf = DecisionTreeClassifier(max_depth=15) plot_learning_curve(clf, X_train, y_train, scoring='roc_auc') # In[38]: clf = DecisionTreeClassifier(max_depth=8) plot_learning_curve(clf, X_train, y_train, scoring='roc_auc') # In[39]: clf = DecisionTreeClassifier(max_depth=4) plot_learning_curve(clf, X_train, y_train, scoring='roc_auc') # In[40]: from sklearn.model_selection import validation_curve def plot_validation_curve(estimator, X, y, param_name, param_range, ylim=(0, 1.1), cv=5, n_jobs=-1, scoring=None): estimator_name = type(estimator).__name__ plt.title("Validation curves for %s on %s" % (param_name, estimator_name)) plt.ylim(*ylim); plt.grid() plt.xlim(min(param_range), max(param_range)) plt.xlabel(param_name) plt.ylabel("Score") train_scores, test_scores = validation_curve( estimator, X, y, param_name, param_range, cv=cv, n_jobs=n_jobs, scoring=scoring) train_scores_mean = np.mean(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) plt.semilogx(param_range, train_scores_mean, 'o-', color="r", label="Training score") plt.semilogx(param_range, test_scores_mean, 'o-', color="g", label="Cross-validation score") plt.legend(loc="best") print("Best test score: {:.4f}".format(test_scores_mean[-1])) # In[41]: clf = DecisionTreeClassifier(max_depth=8) param_name = 'max_depth' param_range = [1, 2, 4, 8, 16, 32] plot_validation_curve(clf, X_train, y_train, param_name, param_range, scoring='roc_auc') # ## Gradient Boosted Decision Trees # In[42]: from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=30, max_features=10, max_depth=10) scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1) print("ROC Random Forest: {:.4f} +/-{:.4f}".format( np.mean(scores), np.std(scores))) # In[43]: from sklearn.ensemble import GradientBoostingClassifier clf = GradientBoostingClassifier(max_leaf_nodes=5, learning_rate=0.1, n_estimators=100) scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1) print("ROC AUC Gradient Boosted Trees: {:.4f} +/-{:.4f}".format( np.mean(scores), np.std(scores))) # ## Evaluation of the best model # In[44]: get_ipython().run_cell_magic('time', '', '_ = clf.fit(X_train, y_train)\n') # In[45]: from sklearn.metrics import roc_auc_score y_pred_proba = clf.predict_proba(X_test)[:, 1] print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba)) # In[46]: from sklearn.metrics import classification_report y_pred = clf.predict(X_test) print(classification_report(y_test, y_pred, target_names=target_names)) # In[47]: from sklearn.metrics import precision_recall_curve precision_gb, recall_gb, _ = precision_recall_curve(y_test, y_pred_proba) plt.plot(precision_gb, recall_gb) plt.xlabel('precision') plt.ylabel('recall') plt.title('Precision / Recall curve'); # ## Variable importances # In[48]: plt.figure(figsize=(10, 5)) ordering = np.argsort(clf.feature_importances_)[::-1] importances = clf.feature_importances_[ordering] feature_names = features.columns[ordering] x = np.arange(len(feature_names)) plt.bar(x, importances) plt.xticks(x + 0.5, feature_names, rotation=90, fontsize=15); # ## Using the boosted trees to extract features for a Logistic Regression model # The following is an implementation of a trick found in: # # Practical Lessons from Predicting Clicks on Ads at Facebook # Junfeng Pan, He Xinran, Ou Jin, Tianbing XU, Bo Liu, Tao Xu, Yanxin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, Joaquin QuiƱonero Candela # International Workshop on Data Mining for Online Advertising (ADKDD) # # https://www.facebook.com/publications/329190253909587/ # In[49]: from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.preprocessing import LabelBinarizer from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegressionCV from sklearn.pipeline import make_pipeline from scipy.sparse import hstack class TreeTransform(BaseEstimator, TransformerMixin): """One-hot encode samples with an ensemble of trees This transformer first fits an ensemble of trees (e.g. gradient boosted trees or a random forest) on the training set. Then each leaf of each tree in the ensembles is assigned a fixed arbitrary feature index in a new feature space. If you have 100 trees in the ensemble and 2**3 leafs per tree, the new feature space has 100 * 2**3 == 800 dimensions. Each sample of the training set go through the decisions of each tree of the ensemble and ends up in one leaf per tree. The sample if encoded by setting features with those leafs to 1 and letting the other feature values to 0. The resulting transformer learn a supervised, sparse, high-dimensional categorical embedding of the data. This transformer is typically meant to be pipelined with a linear model such as logistic regression, linear support vector machines or elastic net regression. """ def __init__(self, estimator): self.estimator = estimator def fit(self, X, y): self.fit_transform(X, y) return self def fit_transform(self, X, y): self.estimator_ = clone(self.estimator) self.estimator_.fit(X, y) self.binarizers_ = [] sparse_applications = [] estimators = np.asarray(self.estimator_.estimators_).ravel() for t in estimators: lb = LabelBinarizer(sparse_output=True) sparse_applications.append(lb.fit_transform(t.tree_.apply(X))) self.binarizers_.append(lb) return hstack(sparse_applications) def transform(self, X, y=None): sparse_applications = [] estimators = np.asarray(self.estimator_.estimators_).ravel() for t, lb in zip(estimators, self.binarizers_): sparse_applications.append(lb.transform(t.tree_.apply(X))) return hstack(sparse_applications) boosted_trees = GradientBoostingClassifier( max_leaf_nodes=5, learning_rate=0.1, n_estimators=100, random_state=0, ) pipeline = make_pipeline( TreeTransform(boosted_trees), LogisticRegression(C=1) ) pipeline.fit(X_train, y_train); # In[50]: y_pred_proba = pipeline.predict_proba(X_test)[:, 1] print("ROC AUC: %0.4f" % roc_auc_score(y_test, y_pred_proba)) # In[51]: y_pred = pipeline.predict(X_test) print(classification_report(y_test, y_pred, target_names=target_names)) # In[52]: from sklearn.metrics import precision_recall_curve precision_gb_lr, recall_gb_lr, _ = precision_recall_curve( y_test, y_pred_proba) plt.plot(precision_gb, recall_gb, label='GBT') plt.plot(precision_gb_lr, recall_gb_lr, label='GBT + LR') plt.xlabel('precision') plt.ylabel('recall') plt.title('Precision / Recall curve') plt.legend(); # In[ ]: