# Set some ipython defaults
%pylab inline
rcParams['figure.figsize'] = 8, 6  # default image size for this interactive session
font = {'family': 'normal', 'weight': 'bold', 'size': 16}
rc('font', **font)

# First let's load up the data and look at it
import pandas as pd  # used to keep data looking pretty
import numpy as np

# Load the raw data first
raw_data = pd.read_csv('titanic.csv')
raw_columns = raw_data.columns

# We'll also re-index
raw_data.index = raw_data['PassengerId']
# and we'll take all of our training data and put it into the "real_train" index
real_train = raw_data.index

# Now let's load the test data set
test_data = pd.read_csv('titanic_test.csv')
# We'll make the survival NA for the test data set
test_data['Survived'] = np.NaN
test_data.index = test_data['PassengerId']
real_test = test_data.index

# Now let's merge the dataframes.
# I'm going to merge them together for convenience so that any transformations
# I make are applied to both data sets.
data = pd.concat([raw_data, test_data])

# Remove the raw training and testing data
del raw_data
del test_data

# Show the first 5 rows
print(data.head())

# If I want to look at just the training data set, I can do that too
print(data.ix[real_train].describe())

# Check that we've gotten rid of all the NaNs or other bad values
def is_bad(series):
    """Returns a Boolean series indicating whether there are any bad values
    (null, +/- inf, or '').

    input: series
    return: Boolean series
    """
    return pd.isnull(series) | series.apply(
        lambda x: (x == np.inf or x == -np.inf or x == ''))

print(data.apply(is_bad).sum())

# Now let's fix the missing embarked data.
# First we'll see what the most common embarkment types are:
print(data.groupby('Embarked').count()['PassengerId'])

# So let's fill the missing values with S, which is by far the most common
# embarkation point.
data['Embarked'] = data['Embarked'].fillna('S')
print(data.groupby('Embarked').count()['PassengerId'])

print(data[is_bad(data['Fare'])])

# And let's fix the missing fare value. While I'm at it, I'll also nudge any
# zero values so that I can later take the logarithm.
# Other than being old, the guy doesn't seem unusual, so I'll just use the
# median fare of his class and gender.
data['fFare'] = data['Fare'].fillna(
    data[(data['Sex'] == 'male') & (data['Pclass'] == 3)]['Fare'].dropna().median())
# Nudge the zero fares (reading from fFare so the median fill above survives)
data['fFare'] = data['fFare'].apply(lambda x: .01 if x == 0 else x)
data['zFare'] = data['Fare'].apply(lambda x: x == 0).astype(bool)  # I'm guessing these people were comp'd, may be important!
print(data['fFare'].describe())
print(data[data['zFare']]['fFare'])

# Fill in empty cells
data['fSibSp'] = data['SibSp'].fillna(data['SibSp'].dropna().median())
data['fParch'] = data['Parch'].fillna(data['Parch'].dropna().median())
print(data[['fSibSp', 'fParch']].describe())
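# A quick sanity check of the is_bad helper on a toy Series (a minimal,
# illustrative sketch; the toy values are not part of the Titanic data):
toy = pd.Series([1.0, np.nan, np.inf, -np.inf, 7.5])
print(is_bad(toy))  # expect True for the NaN and +/-inf entries, False otherwise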
# For now let's just use a simple class model to fill in missing Ages.
avg_age_dict = data.groupby(['Pclass', 'Sex']).mean()['Age'].to_dict()
data['avg_Age'] = [avg_age_dict[(c, s)] for c, s in zip(data['Pclass'], data['Sex'])]
data['fAge'] = data['Age'].fillna(data['avg_Age'])
# Also fix any zeros:
data['fAge'] = data['fAge'].apply(lambda x: 1 if x == 0 else x)
print(data.apply(is_bad).sum())

# First, we need to convert categorical variables to integer form
cat_map = {}
cat_cols = [col for col, t in data.dtypes.to_dict().items() if t == np.dtype('O')]

def factorize(data, cat_cols, cat_map=cat_map):
    """Convenience function for making categorical Series from string Series.

    inputs: dataframe, list of string categorical columns
    outputs: dataframe, cat_map
    """
    for col in cat_cols:
        temp, rev = pd.factorize(data[col])
        # temp = pd.Categorical.from_array(data[col])
        # print(temp)
        # print(rev)
        cat_map['_' + col] = pd.Factor(temp, rev)
        data['_' + col] = temp
    return data, cat_map

data, cat_map = factorize(data, cat_cols)
print(data.apply(is_bad).sum())
print([c == np.dtype('O') for c in data.dtypes])

# Now let's pick the columns to use
def is_too_many(df, col):
    "Checks if a column has too many distinct categorical values"
    return len(df[col].drop_duplicates()) > 120

def find_clean_cols(data):
    clean_features = set([col for col, t in data.dtypes.to_dict().items()
                          if (t != np.dtype('O')
                              and not any(is_bad(data[col]))
                              and not (is_too_many(data, col)
                                       and t == np.dtype('int64')))])
    clean_features = clean_features - set(
        ['avg_Age', 'Parch', 'SibSp', 'Age_guess', '_Age', 'fAge', 'sqrt_fare',
         'log_fare', 'emp prediction', 'Prediction', 'sqrt_age', 'log_age',
         'Prediction %', 'survival chance', 's_Age', 'Best LLRP', 'Best AIC',
         'Best BIC'])
    clean_features = clean_features - set(
        ['sttl_Miss', 'sttl_Mr', 'Tick_pref', 'sttl_Lady', 'sttl_Dr',
         'sttl_Rev', 'sttl_Mst', 'sttl_Mrs', 'sttl_Mlle'])
    clean_features = list(clean_features)
    return clean_features

clean_features = find_clean_cols(data)
print(clean_features)
# print(data.apply(is_bad).sum()[clean_features])

# Verify that our data is now clean
print(data.apply(is_bad).sum()[clean_features + ['_Age']])

# Now let's clean up the data for doing the regression.
# We'll start simple with just a few columns.

# Define some easy columns to use
simple_columns = []

# Create dummy variables.
# I use .ix[:, :-1] to avoid multicollinearity / the dummy variable trap.
sex_dummies = pd.get_dummies(data['Sex'], prefix='sex').ix[:, :-1]
class_dummies = pd.get_dummies(data['Pclass'], prefix='class').ix[:, :-1]
embark_dummies = pd.get_dummies(data['Embarked'], prefix='embarked').ix[:, :-1]
bin_cols = (list(sex_dummies.columns.values)
            + list(class_dummies.columns.values)
            + list(embark_dummies.columns.values))
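# Why drop the last dummy column? With an intercept in the model, a full set of
# dummies is perfectly collinear (the columns sum to the constant). A minimal
# sketch on toy data, not the Titanic frame:
toy_sex = pd.Series(['male', 'female', 'female', 'male'])
print(pd.get_dummies(toy_sex, prefix='sex'))             # two columns that always sum to 1
print(pd.get_dummies(toy_sex, prefix='sex').ix[:, :-1])  # one column carries the same information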
# Merge them together. Do it conditionally so that this cell is idempotent.
for col in bin_cols:
    if col in data.columns.values:
        data = data.drop(col, 1)
data = (data.join(sex_dummies, how='inner')
            .join(class_dummies, how='inner')
            .join(embark_dummies, how='inner'))

# Now we'll add an intercept
data['const'] = 1

# Check for bad data in the training set
print(data.ix[real_train, ['const', 'Survived'] + bin_cols].apply(is_bad).sum())

# Let's pick the columns we'll use for regression and quickly look at their values
simple_reg_columns = bin_cols + simple_columns
simple_reg_columns = ['const'] + simple_reg_columns
print(data.ix[real_train, simple_reg_columns].describe())

# We'll use statsmodels for the logistic regression
import statsmodels.api as sm

# Pick the columns we'll use for regression (re-defined so this cell can run on its own)
simple_reg_columns = bin_cols + simple_columns
simple_reg_columns = ['const'] + simple_reg_columns

log_reg_model = sm.Logit(data.ix[real_train, 'Survived'],
                         data.ix[real_train, simple_reg_columns])
log_reg_result = log_reg_model.fit()
print(log_reg_result.summary())

# Let's do it with regularization!
# Set the regularization penalty
alpha = 0.01 * len(data.ix[real_train]) * np.ones(data.ix[real_train, simple_reg_columns].shape[1])
alpha[0] = 0  # don't regularize the constant
# print(alpha)
log_reg_result_reg = log_reg_model.fit_regularized(method='l1', alpha=alpha)
print(log_reg_result_reg.summary())

def viz_sm_coefs(result_obj):
    params = result_obj.params.copy()
    params.sort()  # in-place sort (older pandas)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
    params.plot(kind='bar', ax=axes[0])
    axes[0].set_title('Regression Coefficients')
    params_abs = params.apply(abs)
    params_abs.sort()
    params_abs.plot(kind='bar', ax=axes[1])
    axes[1].set_title('Abs(Regression Coefficients)')
    return None

viz_sm_coefs(log_reg_result)
show()

# Predict results
# Now let's see if they would survive!
data['survival chance'] = 0  # can help to initialize for all values
data['survival chance'] = log_reg_result.predict(data[simple_reg_columns])
# print(data.ix[real_train, 'survival chance'])
print(pd.crosstab(data.ix[real_train, 'Sex'], data.ix[real_train, 'Pclass'],
                  values=data.ix[real_train, 'survival chance'], aggfunc=np.mean))

# Let's compare to the realized values
empirical_survival_rates = pd.crosstab(
    data.ix[real_train, 'Sex'], data.ix[real_train, 'Pclass'],
    values=data.ix[real_train, 'Survived'], aggfunc=np.mean)
print(empirical_survival_rates)

# And let's make sure we have enough data points for this to be meaningful
print(pd.crosstab(data.ix[real_train, 'Sex'], data.ix[real_train, 'Pclass']))

# Add more data!
more_cols = ['fAge', 'fFare', 'fSibSp', 'fParch']
print(data[simple_reg_columns + more_cols].describe())
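# Interpreting the fitted coefficients: statsmodels Logit reports log-odds, so
# exponentiating gives odds ratios. A small sketch (not in the original):
odds_ratios = np.exp(log_reg_result.params)
print(odds_ratios)  # e.g. a value of 0.5 halves the odds of survival, 2.0 doubles them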
# Let's even use some transformations of some covariates!
data['log_fare'] = data['fFare'].apply(np.log)
data['sqrt_fare'] = data['fFare'].apply(np.sqrt)
data['log_age'] = data['fAge'].apply(np.log)
data['child'] = data['fAge'].apply(lambda x: 1 if x < 18 else 0)
data['sqrt_age'] = data['fAge'].apply(np.sqrt)
data['fam_size'] = data['fSibSp'] + data['fParch']

more_cols = (['fAge', 'fFare', 'fSibSp', 'fParch']
             + ['log_fare', 'log_age', 'sqrt_fare', 'sqrt_age', 'fam_size', 'child'])
for col in more_cols:
    if col not in data.columns:
        data = data.join(data.ix[real_train, col])
print(data[more_cols].describe())
# print(data.head())
print(data[['log_fare', 'log_age', 'sqrt_fare', 'sqrt_age', 'fam_size', 'child']].dtypes)
print(data[['log_fare', 'log_age', 'sqrt_fare', 'sqrt_age', 'fam_size', 'child']].apply(is_bad).sum())

# Rescale (demean, unit variance) the quantitative columns
from sklearn.preprocessing import StandardScaler

scale_transforms = {}

def make_standard_scale(series, name):
    """Helper function to demean a series and transform it to unit variance.

    inputs: series, name for outputting the series
    outputs: transformed series
    """
    temp_scaler = StandardScaler().fit(series)
    scale_transforms[name] = temp_scaler
    result_series = temp_scaler.transform(series.copy())
    return result_series

scale_cols = []
for col in [col for col, t in data[more_cols].dtypes.to_dict().items()
            if t == np.dtype('float64')]:
    print(col)
    data['s' + col] = make_standard_scale(data[col], col)
    scale_cols.append('s' + col)
print(data[scale_cols].head())

# Run the logistic regression. First set the columns
alpha_cols = simple_reg_columns + scale_cols

# Now actually run the regression
more_model = sm.Logit(data.ix[real_train, 'Survived'], data.ix[real_train, alpha_cols])
alpha_more = 0.01 * len(data.ix[real_train]) * np.ones(data.ix[real_train, alpha_cols].shape[1])
alpha_more[0] = 0
more_results = more_model.fit_regularized(method='l1', alpha=alpha_more)
print(more_results.summary())

# Visualize coefficients
viz_sm_coefs(more_results)

# Let's do a likelihood ratio test:
llr = -2 * (log_reg_result.llf - more_results.llf)
df_diff = more_results.df_model - log_reg_result.df_model
from scipy.stats import chi2
print(1 - chi2.cdf(llr, df_diff))

# First let's pick a training set at random and repeat the crosstab approach
import random
random.seed(0)
train = random.sample(data.ix[real_train].index,
                      len(data.ix[real_train].index) // 2)  # integer division so sample() gets an int
test = data.index.drop(list(train) + list(real_test))
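# (Sketch, not in the original) the likelihood ratio test above, wrapped as a
# reusable helper for any pair of nested statsmodels results:
def lr_test(restricted, full):
    """Likelihood-ratio test for nested models; returns (statistic, p-value)."""
    stat = -2 * (restricted.llf - full.llf)
    df_diff = full.df_model - restricted.df_model
    return stat, 1 - chi2.cdf(stat, df_diff)

print(lr_test(log_reg_result, more_results))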
# Now let's calculate performance!
# I'll make a helper function that we can re-use later to compare the
# performance of multiple models.
from sklearn.metrics import confusion_matrix, classification_report

def perf_report(df, true_class, pred_class, string_title):
    """Prints the confusion matrix and the precision/recall of the classifier.

    inputs: df, true_class, pred_class, string_title
    outputs: confusion matrix, class_report
    """
    print(string_title + ' Confusion Matrix')
    conf_matrix = confusion_matrix(df[true_class], df[pred_class])
    print(conf_matrix)
    print(string_title + ' Performance Summary')
    class_report = classification_report(np.array(df[true_class]),
                                         np.array(df[pred_class]),
                                         target_names=['Died', 'Survived'])
    print(class_report)
    print(string_title + ' accuracy: ',
          float(np.diag(conf_matrix).sum()) / float(conf_matrix.sum().sum()))
    return conf_matrix, class_report

# A simple empirical benchmark: predict survival for women outside third class
data['emp prediction'] = 0
female = data['Sex'] == 'female'
high_class = data['Pclass'] != 3
data.ix[female & high_class, 'emp prediction'] = 1
perf_report(data.ix[real_train], 'Survived', 'emp prediction', 'emp prediction')
print()

# Let's look at the perf report for the best performing regressions:
def make_pred(result_model, input_df, output_df, ix, title, cols=None):
    """Makes a series prediction.

    inputs: result_model, input_df, output_df, ix, title
    outputs: output_df[title]
    """
    if title not in list(output_df.columns):
        output_df[title] = 0
    if cols is None:
        # falls back to a sig_cols lookup defined elsewhere in the notebook
        cols = list(sig_cols[result_model.ix])
    input_data = input_df.ix[ix, cols]
    output_df.ix[ix, title] = result_model.predict(input_data)
    output_df.ix[ix, title] = output_df.ix[ix, title].apply(lambda x: 1 if x > 0.5 else 0)
    # print(output_df[title])
    return output_df[title]

data['log simple'] = log_reg_result.predict(data[simple_reg_columns])
data['log simple'] = data['log simple'].apply(lambda x: 1 if x > 0.5 else 0)
data['log more'] = more_results.predict(data[alpha_cols])
data['log more'] = data['log more'].apply(lambda x: 1 if x > 0.5 else 0)
perf_report(data.ix[test], 'Survived', 'log simple', 'log simple')
perf_report(data.ix[test], 'Survived', 'log more', 'log more')
perf_report(data.ix[test], 'Survived', 'emp prediction', 'emp prediction')
print()

# Make a grid of 1 to max_features, 1 to max_depth; fit all possible variants
from sklearn.metrics import accuracy_score
scaled_cols = ['sfAge', 'fSibSp', 'fParch', 'slog_fare', 'slog_age',
               'ssqrt_age', 'ssqrt_fare', 'fam_size']
quant_cols = ['sfAge', 'fSibSp', 'fParch', 'fam_size', 'sfFare']
# Note: bin_cols is redefined here to the integer-coded categoricals, which
# tree models can consume directly
bin_cols = ['_Sex', 'Pclass', '_Embarked']
dummy_cols = (  # list(fm_id_dummies.columns.values) +
    list(sex_dummies.columns.values)
    + list(class_dummies.columns.values)
    + list(embark_dummies.columns.values))
print(data.ix[real_train, scaled_cols + dummy_cols].apply(is_bad).sum())
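# The 0.5 cutoff is applied by hand several times above; a tiny helper (a
# sketch, not in the original) keeps that decision in one place:
def to_class(probs, cutoff=0.5):
    """Turn a Series of predicted probabilities into 0/1 class labels."""
    return probs.apply(lambda p: 1 if p > cutoff else 0)

# e.g. data['log simple'] = to_class(log_reg_result.predict(data[simple_reg_columns]))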
# WARNING! THIS TAKES ABOUT 15 MINUTES TO RUN ON MY COMPUTER!!!
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

n_features = list(range(1, len(bin_cols + quant_cols) + 1))
depths = list(range(1, len(bin_cols + quant_cols) + 1))
param_grid = {'n_estimators': [50],
              'criterion': ['gini', 'entropy'],
              'max_features': n_features,
              'max_depth': depths,
              'oob_score': [True]}
rf_cv_clf = GridSearchCV(RandomForestClassifier(), param_grid,
                         scoring='accuracy', refit=True, verbose=1, cv=3, n_jobs=4)
rf_cv_clf.fit(data.ix[real_train, bin_cols + quant_cols],
              data.ix[real_train, 'Survived'])
rf_grid_val_scores = [tup.mean_validation_score for tup in rf_cv_clf.grid_scores_]
print(rf_cv_clf.best_params_)
print(rf_cv_clf.best_score_)
print(rf_cv_clf.best_estimator_.oob_score_)
grid_std_scores = [np.std(tup.cv_validation_scores) for tup in rf_cv_clf.grid_scores_]
print('grid std scores: \n%s' % grid_std_scores[0:10])

# Visualize performance across two dimensions.
# Use grid_scores_.parameters and loop through the results - may want to
# abstract the loop into a helper function.
def cv_grid_viz(cv_clf, default_params_dict, x_param, y_param, title,
                x_exp=False, y_exp=False, markersize=500):
    """Helper function to visualize a grid of CV results.

    inputs: cv_clf, default_params_dict, x_param, y_param, title
    keyword inputs: x_exp=False, y_exp=False, markersize=500
    outputs: x_list, y_list, acc_graph_list
    side-effects: makes a matplotlib graph
    """
    cv_params_list = [tup.parameters for tup in cv_clf.grid_scores_]
    cv_scores = [tup.mean_validation_score for tup in cv_clf.grid_scores_]
    x_list = []
    y_list = []
    acc_graph_list = []
    for params, score in zip(cv_params_list, cv_scores):
        x_temp = None
        y_temp = None
        fail = False
        for param_key in params:
            if param_key in default_params_dict:
                if default_params_dict[param_key] == params[param_key]:
                    continue
                else:
                    fail = True
            elif param_key == x_param:
                x_temp = params[param_key]
            elif param_key == y_param:
                y_temp = params[param_key]
            else:
                continue
        if x_temp is not None and y_temp is not None and not fail:
            x_list.append(x_temp)
            y_list.append(y_temp)
            acc_graph_list.append(score)
    if x_exp:
        x_list = [np.log(x) for x in x_list]
        x_param = 'log(' + x_param + ')'
    if y_exp:
        y_list = [np.log(y) for y in y_list]
        y_param = 'log(' + y_param + ')'
    figure(figsize=(10, 8))
    scatter(x_list, y_list, c=acc_graph_list, marker='o', s=markersize)
    xlabel(x_param)
    ylabel(y_param)
    suptitle(title)
    colorbar()
    show()
    return x_list, y_list, acc_graph_list

cv_grid_viz(rf_cv_clf,
            {'criterion': 'entropy', 'oob_score': True, 'n_estimators': 50},
            'max_features', 'max_depth', 'RF Parameter Tuning (CV Accuracy)')
print()

data['SK RF'] = rf_cv_clf.best_estimator_.predict(data[bin_cols + quant_cols])

# Save the file
import csv

def save_to_csv(fp, col):
    "Helper function to save a column of the dataframe in the preferred Kaggle format"
    with open(fp, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['PassengerId', 'Survived'])
        for pid, s in zip(data.ix[real_test, col].index.values,
                          data.ix[real_test, col].values):
            writer.writerow([pid, int(s)])

save_to_csv('sk_rf.csv', 'SK RF')
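# (Sketch, not in the original) the tuned forest can also tell us which
# features it leans on, via the fitted estimator's feature_importances_:
importances = pd.Series(rf_cv_clf.best_estimator_.feature_importances_,
                        index=bin_cols + quant_cols)
print(importances.order(ascending=False))  # use .sort_values(...) on newer pandas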
dummy_cols = (  # list(fm_id_dummies.columns.values) +
    list(sex_dummies.columns.values)
    + list(class_dummies.columns.values)
    # + list(aCab.columns.values)
)
print(dummy_cols)

# Train an SVM classifier
from sklearn.svm import SVC
rbf_param_grid = {'C': list(np.exp(range(-10, 10))),
                  'kernel': ['rbf'],
                  'gamma': list(np.exp(range(-10, 10)))}
svm_clf = GridSearchCV(SVC(), param_grid=rbf_param_grid, scoring='accuracy',
                       refit=True, verbose=1, cv=3, n_jobs=4)
lscale_cols = ['slog_age', 'slog_fare']
svm_clf.fit(data.ix[real_train, lscale_cols + dummy_cols],
            data.ix[real_train, 'Survived'])

# Visualize SVM performance across two dimensions
cv_grid_viz(svm_clf, {'kernel': 'rbf'}, 'C', 'gamma',
            'SVM Parameter Tuning (CV Accuracy)',
            x_exp=True, y_exp=True, markersize=175)
print(svm_clf.best_score_)
print(svm_clf.best_estimator_)

# Make predictions and save
data['SVM Pred'] = svm_clf.best_estimator_.predict(data[lscale_cols + dummy_cols])
print(data['SVM Pred'].head())
save_to_csv('svm.csv', 'SVM Pred')

## Logistic Regression revisited
from sklearn.linear_model import LogisticRegression
alpha_params = np.sqrt(np.exp(range(-25, 10, 1)))
C_list = [1 / a for a in alpha_params]
parameter_grid = {'C': C_list,
                  'penalty': ['l1', 'l2'],
                  'intercept_scaling': [1000.],
                  # 'class_weight': ['auto']
                  }
log_reg = LogisticRegression()
log_clf = GridSearchCV(log_reg, parameter_grid, scoring='accuracy',
                       refit=True, verbose=1, cv=10, n_jobs=4)
log_clf.fit(data.ix[real_train, lscale_cols + dummy_cols],
            data.ix[real_train, 'Survived'])
cv_grid_viz(log_clf, {'penalty': 'l1', 'class_weight': 'auto'},
            'C', 'intercept_scaling',
            'Logistic Regression Parameter Tuning (CV Accuracy)',
            x_exp=True, markersize=175)
print(log_clf.best_score_)
print(log_clf.best_params_)
data['SK SM'] = log_clf.predict(data.ix[:, lscale_cols + dummy_cols])
print(data['SK SM'].head())
perf_report(data.ix[real_train], 'Survived', 'SK SM', 'SK SM train')
save_to_csv('sk_logit.csv', 'SK SM')
print()

# Train an AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier
ada_param_grid = {'n_estimators': [x * 50 + 100 for x in range(1, 11)],
                  'learning_rate': [x / 100. for x in range(1, 11)]}
ada_clf = GridSearchCV(AdaBoostClassifier(), param_grid=ada_param_grid,
                       scoring='accuracy', refit=True, verbose=1, cv=3, n_jobs=4)
ada_clf.fit(data.ix[real_train, bin_cols + quant_cols],
            data.ix[real_train, 'Survived'])
print('Done')
print(ada_clf.best_score_)
cv_grid_viz(ada_clf, {}, 'n_estimators', 'learning_rate',
            'AdaBoost Parameter Tuning (CV Accuracy)')
print()

# Try doing a long, staged fit
ada_clf_staged = AdaBoostClassifier(n_estimators=1000, learning_rate=0.05)
ada_clf_staged.fit(data.ix[train, bin_cols + quant_cols], data.ix[train, 'Survived'])

# Visualize training and test set performance to check convergence
staged_scores_train = ada_clf_staged.staged_score(data.ix[train, bin_cols + quant_cols],
                                                  data.ix[train, 'Survived'])
staged_scores_test = ada_clf_staged.staged_score(data.ix[test, bin_cols + quant_cols],
                                                 data.ix[test, 'Survived'])
plt.plot(list(staged_scores_train), c='blue')
plt.plot(list(staged_scores_test), c='red')
plt.ylabel('Accuracy')
plt.xlabel('n_estimators')
plt.title('AdaBoost Convergence Test')
plt.show()
print(ada_clf_staged.score(data.ix[test, bin_cols + quant_cols], data.ix[test, 'Survived']))

# Save the results and compare all four models on the training set
data['ada'] = ada_clf.predict(data[bin_cols + quant_cols])
perf_report(data.ix[real_train], 'Survived', 'SK SM', 'SK SM train')
perf_report(data.ix[real_train], 'Survived', 'SVM Pred', 'SVM Pred train')
perf_report(data.ix[real_train], 'Survived', 'SK RF', 'SK RF train')
perf_report(data.ix[real_train], 'Survived', 'ada', 'AdaBoost train')
print()
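# (Sketch, not in the original notebook) with four prediction columns in hand,
# a simple majority vote is a cheap way to combine them; 'vote' is a
# hypothetical column name, and ties (2-2) are broken toward "died":
vote_cols = ['SK RF', 'SVM Pred', 'SK SM', 'ada']
data['vote'] = (data[vote_cols].sum(axis=1) >= 3).astype(int)
perf_report(data.ix[real_train], 'Survived', 'vote', 'Majority vote train')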