Bagging trees on Titanic

sklearn.tree.DecisionTreeClassifier

  • Creer les train et test sets
  • comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set ?
  • maintenant prendre 20 arbres, en limitant la taille a 2 niveaux
  • pour chaque arbre, predire les probas des echantillons du test set
  • puis moyenner les proba et utiliser le resultat pour determiner la classe predite.
  • quel accuracy sur le test set ?
In [112]:
import pandas as pd
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
In [113]:
df = pd.read_csv('../data/classification/titanic.csv', sep = ';')
print(df.shape)
df.sample(frac = 1)
df.head()
(1309, 12)
Out[113]:
pclass survived name sex age sibsp parch ticket fare cabin embarked home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.00 0 0 24160 211.3375 B5 S St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.92 1 2 113781 151.5500 C22 C26 S Montreal, PQ / Chesterville, ON
2 1 0 Allison, Miss. Helen Loraine female 2.00 1 2 113781 151.5500 C22 C26 S Montreal, PQ / Chesterville, ON
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.00 1 2 113781 151.5500 C22 C26 S Montreal, PQ / Chesterville, ON
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.00 1 2 113781 151.5500 C22 C26 S Montreal, PQ / Chesterville, ON

Preprocessing

In [114]:
# Age
df.loc[df.age.isnull(), 'age'] = np.mean(df.age)

# Fare
df.loc[df.fare.isnull(), 'fare'] = np.mean(df.fare)


# Choix arbitraire
df.loc[df.embarked.isnull(), 'embarked'] = 'C'

# Choix arbitraire
df.loc[df['home.dest'].isnull(), 'home.dest'] = 'Autre'


# extract title
df['title'] = df.name.apply(lambda x: re.search(' ([A-Z][a-z]+)\.', x).group(1))

for col in ['sex', 'embarked', 'home.dest', 'title']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    

# drop useless columns    
df.drop(columns = ['name','cabin', 'ticket'], inplace = True)

X = df.drop(columns = ['survived'], inplace = False)
y = df.survived
In [ ]:
y
In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

Baseline

comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set On voit bien l'overfitting

In [118]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)



yhat_proba_test  = clf.predict_proba(X_test)[:,1]
yhat_proba_train = clf.predict_proba(X_train)[:,1]


print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))
 -- ROC AUC Score test 0.7463

 -- ROC AUC Score train 0.9991

N arbres max_depth = 2

maintenant prendre 20 arbres, en limitant la taille a 2 niveaux

A chaque iteration

In [152]:
N = 200

yhat_proba = [0] * len(y_test)

roc_auc_train, roc_auc_test = [], []

for n in range(N):
    idx = X_train.sample(frac = 0.2, replace = True).index

    clf = DecisionTreeClassifier(min_samples_leaf = 100)

    clf.fit(X_train.loc[idx], y_train[idx])
    
    yhat_proba_test += clf.predict_proba(X_test)[:,1]
    
    roc_auc_train.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:,1] ))
    roc_auc_test.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1] ))

yhat_proba_test = yhat_proba_test / N    


print("\n -- ROC AUC Score test Bagging {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}  {:.4f}  ".format(np.mean(roc_auc_train), np.std(roc_auc_train)))
print("\n -- ROC AUC Score test {:.4f}  {:.4f} ".format(np.mean(roc_auc_test), np.std(roc_auc_test)))
 -- ROC AUC Score test Bagging 0.8608

 -- ROC AUC Score train 0.5000  0.0000  

 -- ROC AUC Score test 0.5000  0.0000 
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(1,1)
plt.boxplot(roc_auc_train);
fig, ax = plt.subplots(1,1)
plt.boxplot(roc_auc_test);

Random Forest

In [174]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth = 3,n_estimators=100, bootstrap=True )

clf.fit(X_train, y_train)
print(clf)
yhat_proba_test = clf.predict_proba(X_test)[:,1]
yhat_proba_train = clf.predict_proba(X_train)[:,1]

print("\n -- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba_test)))
print("\n -- ROC AUC Score train {:.4f}".format(roc_auc_score(y_train, yhat_proba_train)))
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

 -- ROC AUC Score test 0.8725

 -- ROC AUC Score train 0.8504
In [175]:
clf.feature_importances_
Out[175]:
array([ 0.13569291,  0.43813153,  0.03770966,  0.02240128,  0.01700829,
        0.10011347,  0.01994288,  0.0378396 ,  0.19116038])
In [176]:
X_train.columns
Out[176]:
Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',
       'home.dest', 'title'],
      dtype='object')
In [ ]: