import numpy as np
import pandas as pd
import pylab as pl
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Load the pre-split training and test sets.
train = pd.read_csv("./data/credit-data-trainingset.csv")
test = pd.read_csv("./data/credit-data-testset.csv")
# Notebook-style inspection; a bare expression shows nothing when run as a script.
test.head()

features = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'monthly_income',
    'age',
    'number_of_times90_days_late',
]

# --- K-nearest neighbours -------------------------------------------------
# The `warn_on_equidistant` keyword is not accepted by current scikit-learn
# releases (it raises TypeError), so it is no longer passed.
clf = KNeighborsClassifier(n_neighbors=13)
clf.fit(train[features], train.serious_dlqin2yrs)

# Predict once and reuse the results below instead of re-running the
# classifier on the test set for every report.
# classes (returns an array)
knn_labels = clf.predict(test[features])
# probabilities (returns a numpy array, one column per class)
probs = clf.predict_proba(test[features])

# [::, 1] selects the 2nd column of the numpy array (class 1).
prob_true = probs[::, 1]
pl.hist(prob_true)

preds = probs
preds

confusion_matrix(test['serious_dlqin2yrs'], knn_labels)

# print(...) with a single argument behaves the same on Python 2 and 3.
print(classification_report(test['serious_dlqin2yrs'],
                            knn_labels,
                            labels=[0, 1]))

pd.crosstab(test['serious_dlqin2yrs'],
            knn_labels,
            rownames=["Actual"],
            colnames=["Predicted"])


def plot_roc(name, probs):
    """Plot the ROC curve of `probs` against the test-set labels and show it.

    The true labels are read from the module-level
    ``test['serious_dlqin2yrs']`` column; the current pylab figure is
    cleared before drawing.

    Args:
        name: Title displayed above the plot.
        probs: One score per row of ``test``; passed to ``roc_curve`` as
            the predicted scores.
    """
    fpr, tpr, thresholds = roc_curve(test['serious_dlqin2yrs'], probs)
    roc_auc = auc(fpr, tpr)
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    pl.show()


# Baselines: the true labels themselves, and uniform random scores.
plot_roc("Perfect Classifier", test['serious_dlqin2yrs'])
plot_roc("Guessing", np.random.uniform(0, 1, len(test['serious_dlqin2yrs'])))

# [::, 1] selects the 2nd column of the numpy array
plot_roc("KNN", preds[::, 1])

# --- Random forest --------------------------------------------------------
clf = RandomForestClassifier()
clf.fit(train[features], train.serious_dlqin2yrs)
probs = clf.predict_proba(test[features])[::, 1]
plot_roc("RandomForest", probs)

# Notebook-style inspection; a bare expression shows nothing when run as a script.
train.head()

# --- Gradient boosting on a different feature subset ----------------------
features = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'number_of_times90_days_late',
    'number_real_estate_loans_or_lines',
]
clf = GradientBoostingClassifier()
clf.fit(train[features], train.serious_dlqin2yrs)
# [::, 1] selects the 2nd column of the numpy array (class 1).
probs = clf.predict_proba(test[features])[::, 1]
plot_roc("Your Classifier", probs)
probs

# Turn probabilities into scores: 340 points at even odds (p = 0.5), plus
# 40 points for every doubling of the odds (1 - p) / p.
odds = (1 - probs) / probs
score = np.log(odds) * (40 / np.log(2)) + 340
pl.hist(score)


def convert_prob_to_score(p):
    """Convert a probability (or array of probabilities) to an integer score.

    The score is ``340 + 40 * log2((1 - p) / p)``, so lower probabilities
    give higher scores; the result is truncated toward zero when cast to
    an integer.

    Example:
        convert_prob_to_score(0.1) -> 466

    Args:
        p: A probability or numpy array of probabilities. Values of
            exactly 0 or 1 are not handled specially and do not produce
            a meaningful score.

    Returns:
        The score(s) as a numpy integer (array).
    """
    odds = (1 - p) / p
    scores = np.log(odds) * (40 / np.log(2)) + 340
    # `np.int` was only an alias for the builtin `int` and was removed in
    # NumPy 1.24; the builtin yields the same dtype on every NumPy version.
    return scores.astype(int)


convert_prob_to_score(probs)