Fitting and evaluating a model
# Load the pre-split credit-default training and test sets from disk.
import pandas as pd
import numpy as np
import pylab as pl  # matplotlib's MATLAB-style interface, used for plotting below
train = pd.read_csv("./data/credit-data-trainingset.csv")
test = pd.read_csv("./data/credit-data-testset.csv")
# Peek at the first few rows of the held-out test set.
test.head()
serious_dlqin2yrs | revolving_utilization_of_unsecured_lines | age | number_of_time30-59_days_past_due_not_worse | debt_ratio | monthly_income | number_of_open_credit_lines_and_loans | number_of_times90_days_late | number_real_estate_loans_or_lines | number_of_time60-89_days_past_due_not_worse | number_of_dependents | monthly_income_imputed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300 | 5 | 0 | 0 | 0 | 0 | 6017 |
1 | 1 | 0.964673 | 40 | 3 | 0.382965 | 13700 | 9 | 3 | 1 | 1 | 2 | 2850 |
2 | 0 | 0.061086 | 78 | 0 | 2058.000000 | 2500 | 10 | 0 | 2 | 0 | 0 | 2500 |
3 | 0 | 0.075427 | 32 | 0 | 0.085512 | 7916 | 6 | 0 | 0 | 0 | 0 | 4145 |
4 | 0 | 0.046560 | 58 | 0 | 0.241622 | 2416 | 9 | 0 | 1 | 0 | 0 | 2850 |
# Classifiers we will compare on the credit-default task.
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
# Predictor columns used to fit the first model.
features = ['revolving_utilization_of_unsecured_lines', 'debt_ratio',
            'monthly_income', 'age', 'number_of_times90_days_late']
# NOTE: warn_on_equidistant was deprecated (and later removed) in scikit-learn,
# so it is no longer passed; omitting it leaves the classifier's behavior unchanged.
clf = KNeighborsClassifier(n_neighbors=13)
-c:4: DeprecationWarning: The warn_on_equidistant parameter is deprecated and will be removed in 0.16.
# Fit k-NN on the training predictors against the delinquency label.
clf.fit(train[features], train.serious_dlqin2yrs)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=13, p=2, weights='uniform')
# Predicted classes for the test set (returns an array of 0/1 labels).
clf.predict(test[features])
array([0, 0, 0, ..., 0, 0, 0])
# Predicted class probabilities (numpy array: one [P(0), P(1)] row per sample).
clf.predict_proba(test[features])
array([[ 1. , 0. ], [ 0.84615385, 0.15384615], [ 0.92307692, 0.07692308], ..., [ 0.84615385, 0.15384615], [ 0.92307692, 0.07692308], [ 1. , 0. ]])
probs = clf.predict_proba(test[features])
# Keep only column 1: P(serious_dlqin2yrs == 1) for each test row.
prob_true = probs[::,1]
# Histogram of the predicted delinquency probabilities.
pl.hist(prob_true)
(array([ 3.08820000e+04, 4.57100000e+03, 1.47800000e+03, 4.19000000e+02, 1.37000000e+02, 7.50000000e+01, 1.00000000e+01, 2.00000000e+00, 5.00000000e+00, 6.00000000e+00]), array([ 0. , 0.09230769, 0.18461538, 0.27692308, 0.36923077, 0.46153846, 0.55384615, 0.64615385, 0.73846154, 0.83076923, 0.92307692]), <a list of 10 Patch objects>)
# Evaluation utilities: ROC curve/AUC plus tabular classification metrics.
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
preds = clf.predict_proba(test[features])
# Echo the probability matrix.
preds
array([[ 1. , 0. ], [ 0.84615385, 0.15384615], [ 0.92307692, 0.07692308], ..., [ 0.84615385, 0.15384615], [ 0.92307692, 0.07692308], [ 1. , 0. ]])
# Confusion matrix: rows = actual class, columns = predicted class.
confusion_matrix(test['serious_dlqin2yrs'], clf.predict(test[features]))
array([[35051, 18], [ 2494, 22]])
# Per-class precision/recall/F1. The parenthesized print call is valid on both
# Python 2 (prints the single argument) and Python 3, unlike the bare statement.
print(classification_report(test['serious_dlqin2yrs'], clf.predict(test[features]), labels=[0, 1]))
precision recall f1-score support 0 0.93 1.00 0.97 35069 1 0.55 0.01 0.02 2516 avg / total 0.91 0.93 0.90 37585
Using pandas, create a confusion matrix. Be sure to label the rows/columns. HINT: use crosstab.
# Labeled confusion matrix via pandas: rows = actual, columns = predicted.
pd.crosstab(test['serious_dlqin2yrs'], clf.predict(test[features]), rownames=["Actual"], colnames=["Predicted"])
Predicted | 0 | 1 |
---|---|---|
Actual | ||
0 | 35051 | 18 |
1 | 2494 | 22 |
To evaluate our classifier, we're going to use an ROC Curve. ROC Curves are great for evaluating binary (0, 1) classification models. A ROC Curve plots the False Positive Rate (fpr) vs. the True Positive Rate (tpr) for a classifier.
See example in scikit-learn docs.
def plot_roc(name, probs, y_true=None):
    """
    Plot a ROC curve (FPR vs. TPR) for one classifier's scores.

    Parameters
    ----------
    name : str
        Title for the plot.
    probs : array-like
        Predicted scores/probabilities for the positive class.
    y_true : array-like, optional
        True binary labels. Defaults to the global test set's
        serious_dlqin2yrs column, preserving the original behavior
        while allowing the function to be reused on other data.
    """
    if y_true is None:
        y_true = test['serious_dlqin2yrs']
    fpr, tpr, thresholds = roc_curve(y_true, probs)
    roc_auc = auc(fpr, tpr)
    pl.clf()  # start from a clean figure
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')  # diagonal = random guessing baseline
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    pl.show()
# Sanity checks: true labels give a perfect ROC; random scores hug the diagonal.
plot_roc("Perfect Classifier", test['serious_dlqin2yrs'])
plot_roc("Guessing", np.random.uniform(0, 1, len(test['serious_dlqin2yrs'])))
# [::,1] selects the 2nd column (P(positive)) of the numpy array
plot_roc("KNN", preds[::,1])
# Random forest with default hyperparameters on the same feature set.
clf = RandomForestClassifier()
clf.fit(train[features], train.serious_dlqin2yrs)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0)
# P(delinquent) from the forest, then its ROC curve.
probs = clf.predict_proba(test[features])[::,1]
plot_roc("RandomForest", probs)
Pick a different algorithm and a new set of features. Can you beat the 0.74 AUC?
# Inspect the training data to choose a new feature set.
train.head()
serious_dlqin2yrs | revolving_utilization_of_unsecured_lines | age | number_of_time30-59_days_past_due_not_worse | debt_ratio | monthly_income | number_of_open_credit_lines_and_loans | number_of_times90_days_late | number_real_estate_loans_or_lines | number_of_time60-89_days_past_due_not_worse | number_of_dependents | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120 | 13 | 0 | 6 | 0 | 2 |
1 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600 | 4 | 0 | 0 | 0 | 1 |
2 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042 | 2 | 1 | 0 | 0 | 0 |
3 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588 | 7 | 0 | 1 | 0 | 0 |
4 | 0 | 0.213179 | 74 | 0 | 0.375607 | 3500 | 3 | 0 | 1 | 0 | 1 |
# A smaller feature set and a boosted-tree model for the beat-0.74-AUC challenge.
features = ['revolving_utilization_of_unsecured_lines', 'debt_ratio',
'number_of_times90_days_late', 'number_real_estate_loans_or_lines']
clf = GradientBoostingClassifier()
clf.fit(train[features], train.serious_dlqin2yrs)
probs = clf.predict_proba(test[features])[::,1]
plot_roc("Your Classifier", probs)
We're going to take the P(delinquent) output by the model and convert it to a FICO-style score. We calculate the log odds, which we then convert into 'points'. In this case, an increase/decrease of 40 points (arbitrary) means a person's riskiness has halved/doubled — 40/log(2). We're starting with a base score of 340 (arbitrary).
probs
# Convert P(delinquent) to FICO-style points: log-odds scaled so that a
# 40-point change halves/doubles the odds, anchored at a base score of 340.
odds = (1 - probs) / probs
score = np.log(odds)*(40/np.log(2)) + 340
# Distribution of the resulting scores.
pl.hist(score)
(array([ 2.00000000e+00, 1.20000000e+01, 1.98000000e+02, 6.51000000e+02, 1.08900000e+03, 9.92000000e+02, 6.31900000e+03, 4.44100000e+03, 8.81300000e+03, 1.50680000e+04]), array([ 175.37769656, 218.97334868, 262.5690008 , 306.16465292, 349.76030504, 393.35595716, 436.95160928, 480.5472614 , 524.14291352, 567.73856564, 611.33421776]), <a list of 10 Patch objects>)
def convert_prob_to_score(p, base_score=340, pdo=40):
    """
    Convert a probability of delinquency into a FICO-style credit score.

    The score is the log-odds of *not* being delinquent, scaled so that a
    change of `pdo` points halves/doubles the odds, anchored at `base_score`
    (the score where p == 0.5, i.e. even odds).

    Parameters
    ----------
    p : float or numpy array
        Probability (or probabilities) of delinquency, strictly in (0, 1);
        0 or 1 would divide by zero / take log of infinity.
    base_score : int, optional
        Score assigned at even odds. Default 340, matching the cell above.
    pdo : int, optional
        "Points to double the odds". Default 40, matching the cell above.

    Example:
        convert_prob_to_score(0.1)
        466
    """
    odds = (1 - p) / p
    scores = np.log(odds) * (pdo / np.log(2)) + base_score
    # np.int was removed in NumPy 1.24; builtin int gives the same truncation.
    return scores.astype(int)
# Scores for every test-set borrower.
convert_prob_to_score(probs)
array([537, 300, 582, ..., 445, 587, 597])