# TASK 1: read the data from titanic.csv into a DataFrame
import pandas as pd
url = 'https://raw.githubusercontent.com/justmarkham/DAT7/master/data/titanic.csv'
titanic = pd.read_csv(url, index_col='PassengerId')

# TASK 2: define Pclass/Parch as the features and Survived as the response
feature_cols = ['Pclass', 'Parch']
X = titanic[feature_cols]
y = titanic.Survived

# TASK 3: split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# TASK 4: fit a logistic regression model (C=1e9 effectively disables regularization)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)

# TASK 5: make predictions on the testing set and calculate accuracy
y_pred_class = logreg.predict(X_test)
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

# compute null accuracy manually (the accuracy from always predicting the majority class)
print(y_test.mean())
print(1 - y_test.mean())

# equivalent approach in scikit-learn
from sklearn.dummy import DummyClassifier
dumb = DummyClassifier(strategy='most_frequent')
dumb.fit(X_train, y_train)
y_dumb_class = dumb.predict(X_test)
print(metrics.accuracy_score(y_test, y_dumb_class))

# check for missing values
titanic.isnull().sum()

# drop rows with any missing values
titanic.dropna().shape

# drop rows where Age is missing
titanic[titanic.Age.notnull()].shape

# fill missing values for Age with the mean age
titanic['Age'] = titanic.Age.fillna(titanic.Age.mean())

# equivalent transformer in scikit-learn, supports mean/median/most_frequent
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='mean')
titanic['Age'] = imp.fit_transform(titanic[['Age']]).ravel()

# include Age as a feature
feature_cols = ['Pclass', 'Parch', 'Age']
X = titanic[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))

# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

# calculate the sensitivity (counts read off the confusion matrix above)
43 / (52 + 43)

# calculate the specificity
107 / (107 + 21)

# store the predicted probabilities of class 1
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# plot the predicted probabilities
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(y_pred_prob)
plt.xlabel('Predicted probability of survival')
plt.ylabel('Frequency')

# lower the threshold for predicting survived to increase sensitivity
import numpy as np
y_pred_class = np.where(y_pred_prob > 0.25, 1, 0)

# equivalent function in scikit-learn (binarize expects a 2-D array)
from sklearn.preprocessing import binarize
y_pred_class = binarize(y_pred_prob.reshape(-1, 1), threshold=0.25)[:, 0]

# new confusion matrix
print(metrics.confusion_matrix(y_test, y_pred_class))

# new sensitivity (higher than before)
print(68 / (27 + 68))

# new specificity (lower than before)
print(57 / (57 + 71))

# encode Sex as a binary Sex_Female feature
titanic['Sex_Female'] = titanic.Sex.map({'male': 0, 'female': 1})

# include Sex_Female in the model
feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female']
X = titanic[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)
list(zip(feature_cols, logreg.coef_[0]))

# convert log-odds to odds
list(zip(feature_cols, np.exp(logreg.coef_[0])))

# predicted probability of survival for Adam: 1st class, 29-year-old male
logreg.predict_proba([[1, 0, 29, 0]])[:, 1]

# predicted probability of survival for Bill: same as Adam, but 2nd class
logreg.predict_proba([[2, 0, 29, 0]])[:, 1]

# convert Adam's probability to odds
adamodds = 0.5 / (1 - 0.5)

# adjust odds for Bill due to lower class (using the Pclass odds ratio above)
billodds = adamodds * 0.295

# convert Bill's odds back to a probability
billodds / (1 + billodds)
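
# --- Added sketch (not part of the original walkthrough): the manual odds
# arithmetic above can be wrapped in two tiny helpers so the probability/odds
# conversions are explicit and reusable; the helper names are illustrative only.
def prob_to_odds(p):
    # odds = p / (1 - p)
    return p / (1 - p)

def odds_to_prob(odds):
    # p = odds / (1 + odds)
    return odds / (1 + odds)

# sanity check against the manual calculation for Bill:
# Adam's p=0.5 gives odds of 1, scaled by the ~0.295 Pclass odds ratio
assert abs(odds_to_prob(prob_to_odds(0.5) * 0.295) - billodds / (1 + billodds)) < 1e-9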
# predicted probability of survival for Susan: 1st class, 29-year-old female
logreg.predict_proba([[1, 0, 29, 1]])[:, 1]

# adjust odds for Susan due to her sex (using the Sex_Female odds ratio above)
susanodds = adamodds * 14.6

# convert Susan's odds to a probability
susanodds / (1 + susanodds)

# encode Sex as a binary Sex_Male feature
titanic['Sex_Male'] = titanic.Sex.map({'male': 1, 'female': 0})

# include Sex_Male in the model instead of Sex_Female
feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Male']
X = titanic[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logreg.fit(X_train, y_train)
list(zip(feature_cols, logreg.coef_[0]))

# create 3 dummy variables from Embarked
pd.get_dummies(titanic.Embarked, prefix='Embarked').head(10)

# create 3 dummy variables, then exclude the first (it is redundant given the other two)
pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:].head(10)

# create a DataFrame with the two dummy variable columns
embarked_dummies = pd.get_dummies(titanic.Embarked, prefix='Embarked').iloc[:, 1:]

# concatenate the original DataFrame and the dummy DataFrame along columns (axis=1)
titanic = pd.concat([titanic, embarked_dummies], axis=1)
titanic.head()

# include Embarked_Q and Embarked_S in the model
feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S']
X = titanic[feature_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train, y_train)
list(zip(feature_cols, logreg.coef_[0]))

# predict the probability of survival
y_pred_prob = logreg.predict_proba(X_test)[:, 1]

# plot the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

# calculate AUC
print(metrics.roc_auc_score(y_test, y_pred_prob))

# calculating AUC from y_pred_class produces incorrect results:
# AUC needs predicted probabilities (or scores), not hard class labels
print(metrics.roc_auc_score(y_test, y_pred_class))

# histogram of predicted probabilities grouped by actual response value
df = pd.DataFrame({'probability': y_pred_prob, 'actual': y_test})
df.probability.hist(by=df.actual, sharex=True, sharey=True)
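
# --- Added sketch (an addition, not from the original walkthrough): rather than
# hard-coding counts read off the printed confusion matrix, sensitivity and
# specificity can be derived programmatically. This assumes scikit-learn's
# binary layout, where ravel() yields (tn, fp, fn, tp).
y_pred_class = logreg.predict(X_test)
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_class).ravel()
print(tp / (tp + fn))  # sensitivity: fraction of actual survivors predicted correctly
print(tn / (tn + fp))  # specificity: fraction of actual non-survivors predicted correctly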