Metis Data Science Bootcamp, New York City, Feb 2015
© Frederik Durant
A bank wants to run a targeted marketing campaign to sell a term deposit product.
Given a certain campaign budget:
Which customers should it target?
In what order?
Is it wise to spend the complete campaign budget?
The bank has a database of 41K past customer call records, indicating successful and failed sales of this product.
import pymysql
import credentials
import pandas.io.sql as psql
db = pymysql.connect(host=credentials.host,
user=credentials.user,
passwd=credentials.password,
db='bank')
query = "SELECT * FROM bank_full;"
bankDF = psql.read_sql(query, con=db)
db.close()
print "Number of database records read: %d" % len(bankDF)
#print bankDF.columns
bankDF.head()
Number of database records read: 41188
age | job | marital | education | bank_def | housing | loan | contact | contact_month | day_of_week | ... | campaign | pdays | previous | poutcome | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 56 | housemaid | married | basic.4y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
1 | 57 | services | married | high.school | unknown | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
2 | 37 | services | married | high.school | no | yes | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
3 | 40 | admin. | married | basic.6y | no | no | no | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
4 | 56 | services | married | high.school | no | no | yes | telephone | may | mon | ... | 1 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | no |
5 rows × 21 columns
Most of our models can't handle the categorical fields, so our next step is to convert them to dummy numerical variables.
import pandas as pd
import numpy as np

# Work on a copy so the raw table in bankDF stays untouched.
workDF = bankDF.copy()

# Binarize the target: 'yes' -> 1, 'no' -> 0.
workDF.y.replace(to_replace=['yes','no'], value=[1,0], inplace=True)

# Drop the field(s) whose value is unknown before the call is made;
# keeping them would let the model cheat.
for leakyField in ['duration']:
    workDF.drop(leakyField, axis=1, inplace=True)

# Key: categorical field name
# Value: [dummy-column prefix, redundant post-dummification column to drop]
categoricalFields = {'job':['job','unknown'],
                     'marital':['mar','unknown'],
                     'education':['ed','unknown'],
                     'bank_def':['def','unknown'],
                     'housing':['h','unknown'],
                     'loan':['loan','unknown'],
                     'contact':['cntct','telephone'],
                     'contact_month':['cntct',None], # Because 'jan' and 'feb' are missing in the data, despite being plausible
                     'day_of_week':['dow','fri'], # Even though 'sat' and 'sun' are missing, these are not plausible values
                     'poutcome':['pout','nonexistent'],
                     }

# Dummify every categorical field; where one dummy column is fully implied
# by the others, drop it to avoid carrying redundant information.
for fieldName, (prefix, redundantDummy) in categoricalFields.items():
    dummies = pd.get_dummies(workDF[fieldName], prefix=prefix)
    workDF = pd.concat([workDF, dummies], axis=1)
    if redundantDummy:
        workDF.drop("%s_%s" % (prefix, redundantDummy), axis=1, inplace=True)

# The original categorical columns are no longer needed either.
for fieldName in categoricalFields:
    workDF.drop(fieldName, axis=1, inplace=True)

bankDF2 = workDF.copy()
#print bankDF2.columns
bankDF2.head()
age | campaign | pdays | previous | emp_var_rate | cons_price_idx | cons_conf_idx | euribor3m | nr_employed | y | ... | ed_illiterate | ed_professional.course | ed_university.degree | loan_no | loan_yes | def_no | def_yes | mar_divorced | mar_married | mar_single | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 56 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 57 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 37 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
3 | 40 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
4 | 56 | 1 | 999 | 0 | 1.1 | 93.994 | -36.4 | 4.857 | 5191 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
5 rows × 54 columns
%matplotlib inline
import sys
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from matplotlib import pyplot as plt
from math import log10
from time import sleep
X_values = bankDF2.copy()
X_values.drop('y', axis=1, inplace=True)
y_values = bankDF2['y']
# Scale the X values
X_values_scaled = preprocessing.scale(X_values.astype(float), copy=False)
scorerType = 'roc_auc'
nrCrossValidationFolds=10
# We MUST shuffle, because the data seem to be somehow ordered by date,
# resulting in grouped rows of similar socio-economic value combinations!!!
cvGenerator = KFold(len(X_values), n_folds=nrCrossValidationFolds, shuffle=True)
bestLogCValue = None
bestKValue = None
bestNrEst = None
bestMaxFeat = None
print "%d columns:" % len(X_values.columns), [str(c) for c in X_values.columns]
print "%d values:" % len(X_values_scaled[0]), X_values_scaled[0]
53 columns: ['age', 'campaign', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'cntct_cellular', 'pout_failure', 'pout_success', 'dow_mon', 'dow_thu', 'dow_tue', 'dow_wed', 'cntct_apr', 'cntct_aug', 'cntct_dec', 'cntct_jul', 'cntct_jun', 'cntct_mar', 'cntct_may', 'cntct_nov', 'cntct_oct', 'cntct_sep', 'h_no', 'h_yes', 'ed_basic.4y', 'ed_basic.6y', 'ed_basic.9y', 'ed_high.school', 'ed_illiterate', 'ed_professional.course', 'ed_university.degree', 'loan_no', 'loan_yes', 'def_no', 'def_yes', 'mar_divorced', 'mar_married', 'mar_single'] 53 values: [ 1.53303429 -0.56592197 0.1954139 -0.34949428 0.64809227 0.72272247 0.88644656 0.71245988 0.33167991 -0.58202282 -0.53831699 -0.19143021 6.15277204 -0.2764353 -0.2087573 -0.18903213 -0.3265564 -0.1473267 -0.44244927 -0.15887166 -1.31826996 -0.3392905 -0.1857 1.95899952 -0.51458089 -0.49439422 -0.4960667 -0.26127446 -0.42007603 -0.06662113 -0.45925282 -0.38504233 -0.11590677 1.41115463 -0.33253245 -0.13319736 -0.11846175 1.10081447 -1.04887691 2.97708361 -0.24274754 -0.41474269 -0.54809999 -0.0209096 -0.38191849 -0.64753149 0.46173139 -0.42287213 0.51371278 -0.00853476 -0.35509663 0.80763764 -0.62493754]
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier

# After-the-fact hardcoded optimizations, in case we want to skip the
# experimentation phase (e.g. during a presentation).
# Use 'is None' (identity) rather than '== None': equality can be overloaded
# (numpy/pandas do), identity cannot.
if bestLogCValue is None:
    bestLogCValue = -1
if bestKValue is None:
    bestKValue = 30
if bestNrEst is None or bestMaxFeat is None:
    bestNrEst = 256
    bestMaxFeat = 8

# Model name -> unfitted estimator; the name doubles as the legend label.
models = {'logres with log10(C)=%d' % bestLogCValue : LogisticRegression(C=pow(10,bestLogCValue)), # Takes about 1 second elapsed time
          'knn with K=%d' % bestKValue : KNeighborsClassifier(n_neighbors=bestKValue), # Takes about 12 seconds elapsed time
          'gaussianNB': GaussianNB(),
          'baseline' : DummyClassifier(strategy='stratified')
#          'randomforest with nrEst=%d and maxFeat=%d' % (bestNrEst,bestMaxFeat): RandomForestClassifier(n_estimators=bestNrEst, max_features=bestMaxFeat) # Takes about 5 seconds elapsed time
          }

fig = plt.figure(1,(9,6))
plt.title("%d-fold cross-validation %s scores for various model types" % (nrCrossValidationFolds, scorerType))
plt.xlabel("Fold #")
plt.ylabel(scorerType)
plt.grid()
for modelName, model in models.items():
    print >> sys.stderr, "Building %s model ..." % modelName,
    print >> sys.stderr, "applying it ...",
    # NOTE(review): this scores the *unscaled* X_values although a scaled
    # copy (X_values_scaled) was computed earlier -- confirm which is intended.
    scores = cross_val_score(model, X_values, y=y_values, scoring=scorerType, cv=cvGenerator, n_jobs=1, verbose=10)
    print >> sys.stderr, "done"
    plt.plot(range(1,nrCrossValidationFolds+1), scores, 'o-', label="%s (%2.2f%% +/- %2.2f%%)" % (modelName, scores.mean() * 100, scores.std() * 100))
plt.legend(loc='best')
plt.show()
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.748927 - 3.7s
Building knn with K=30 model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 3.7s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.758574 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.769673 - 3.6s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.771936 - 3.6s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.806344 - 3.6s
[Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 7.4s [Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 18.2s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.759772 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.792484 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.773067 - 3.6s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.779205 - 3.7s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.782687 - 3.7s
[Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 29.2s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 36.6s finished
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.730469 - 0.1s
done Building gaussianNB model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.1s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.757479 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.769577 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.773234 - 0.0s [CV] no parameters to be set .........................................
[Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 0.1s [Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 0.3s
[CV] ................ no parameters to be set, score=0.790082 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.769992 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.782535 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.773197 - 0.1s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.756975 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.783625 - 0.0s
[Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 0.5s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 0.5s finished
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.748908 - 0.4s
done Building logres with log10(C)=-1 model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.4s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.779050 - 0.5s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.790301 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.786824 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.816228 - 0.4s
[Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 0.9s [Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 2.1s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.785703 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.801108 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.785443 - 0.3s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.777990 - 0.4s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.794478 - 0.4s
[Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 3.1s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 3.8s finished
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.508652 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.494918 - 0.0s
done Building baseline model ... applying it ...[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 0.0s [Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 0.0s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.485547 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.495306 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.499960 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.506277 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.494437 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.514219 - 0.0s
[Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 0.1s [Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 0.2s
[CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.503079 - 0.0s [CV] no parameters to be set ......................................... [CV] ................ no parameters to be set, score=0.503250 - 0.0s
[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 0.2s finished done
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
import mpld3
mpld3.enable_notebook()
import sys
from matplotlib import pyplot as plt
import re
from math import log
import json
import copy

# We don't use cross validation here, but generate a single random
# hold-out test set instead (10% of the data, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X_values, y_values, test_size=0.1, random_state=0)

# Model name -> {'train': ..., 'test': ...} class-probability arrays.
finalPredictions = {}
for modelName, model in models.items():
    # fit() returns the estimator itself; no separate variable needed.
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        print >> sys.stderr, "Predicting probabilities for %s model ..." % modelName,
        finalPredictions[modelName] = {'train': model.predict_proba(X_train),
                                       'test': model.predict_proba(X_test)}
        print >> sys.stderr, "done"
def plotROCCurve(figSize=7):
    ''' Plot interactive (mpld3) ROC curves on the hold-out test set for every
        model in finalPredictions; each plotted point carries an HTML tooltip
        with its FPR, TPR and decision threshold.

        figSize -- width and height of the (square) figure, in inches

        Returns a JSON-serializable dict of per-model curve data
        (fpr, tpr, thresholds, roc_auc) for export to D3.js or similar.
    '''
    # Define some CSS to control our custom tooltip labels
    css = """
    table {
      border-collapse: collapse;
    }
    th {
      color: #ffffff;
      background-color: #000000;
    }
    td {
      padding: 2px;
      background-color: #cccccc;
    }
    table, th, td {
      font-family:Arial, Helvetica, sans-serif;
      border: 1px solid black;
      text-align: right;
    }
    """
    jsonROCData = {}
    rocCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
    ax.grid(True, alpha=0.3)
    for modelName, probs in finalPredictions.items():
        # First whitespace-delimited token of the model name, e.g. 'knn'
        modelNameShort = re.split("\s+", modelName)[0]
        # Probability of the positive class for each test instance
        y_probs = [x[1] for x in probs['test']]
        fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                         y_score=y_probs, pos_label=1)
        roc_auc = roc_auc_score(y_true=y_test, y_score=y_probs)
        jsonROCData[modelNameShort] = {'fpr': list(fpr),
                                       'tpr': list(tpr),
                                       'thresholds': list(thresholds),
                                       'roc_auc': roc_auc}
        points = plt.plot(fpr, tpr, 'x-', label="%s (AUC = %1.2f%%)" % (modelName, roc_auc*100))
        labels = ["<table><th colspan='2'>%s</th><tr><td>FPR</td><td>%0.1f%%</td></tr><tr><td>TPR</td><td>%0.1f%%</td></tr><tr><td>threshold</td><td>%0.4f</td></tr></table>" % (modelNameShort, fpr[i]*100, tpr[i]*100, thresholds[i]) for i in range(len(fpr))]
        mpld3.plugins.connect(rocCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    ax.set_title("ROC curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
    ax.set_xlabel("False Positive Rate (FP/FP+TN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    ax.set_ylabel("True Positive Rate (TP/TP+FN)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    plt.legend(loc="best")
    plt.show()
    return jsonROCData
# Export the ROC-curve data to a JSON file for visualization in D3.js or similar
jsonROCData = plotROCCurve(figSize=9)
with open('d3/ROCCurve.json', 'w') as outfile:
    json.dump(jsonROCData, outfile)

from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotROCCurve(figSize=9), "plotROCCurveBanking.html", figid="ROC_Curve_Banking")
# Let the reader pick the figure size interactively
interact(plotROCCurve, figSize=(5,10))
<function __main__.plotROCCurve>
import numpy as np
import mpld3
mpld3.enable_notebook()
import re
from math import log
def sortTestSetByProbability(y_test, y_probs):
    ''' Pair every predicted probability with its true label and original
        position, ordered from most to least likely positive.

        y_test  -- true labels, positionally indexable
        y_probs -- predicted positive-class probabilities, same length

        Returns a list of (probability, true label, original index) tuples
        sorted by descending probability (stable for ties).
    '''
    decorated = [(y_probs[i], y_test[i], i) for i in range(len(y_probs))]
    decorated.sort(key=lambda triple: triple[0], reverse=True)
    return decorated
def calculateLift(sortedProbsAndTrueLabels, nrPercentiles=10, posLabel=1):
    ''' Compute cumulative and local lift per percentile of the scored test set.

        sortedProbsAndTrueLabels -- list of (probability, true label, ...) tuples,
                                    sorted in descending order of probability
        nrPercentiles -- number of equal-sized buckets to split the list into
        posLabel      -- label value that counts as a positive instance

        Assumes at least one positive instance is present (otherwise a
        ZeroDivisionError is raised, as before).

        Returns a list of nrPercentiles+1 tuples -- the first one is an
        all-zero origin point for plotting -- each of the form:
        (fractionOfTargetPopulationCovered,
         cumulativeFractionOfPositivesCovered,
         cumulativeLift,
         localLift)
    '''
    trueLabels = [x[1] for x in sortedProbsAndTrueLabels]
    numberOfPositiveInstances = len([lbl for lbl in trueLabels if lbl == posLabel])
    # np.array_split tolerates a length that is not divisible by nrPercentiles
    percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array(trueLabels), nrPercentiles)]
    result = [(0.0, 0.0, 0.0, 0.0)]
    cumulativePercentage = 0.0
    for c, percentile in enumerate(percentileSplitOfInstances):
        nrPositivesInPercentile = len([lbl for lbl in percentile if lbl == posLabel])
        # Share of all positives that fall into this one percentile
        fractionOfPositives = float(nrPositivesInPercentile)/float(numberOfPositiveInstances)
        p = float(c+1)/float(nrPercentiles)
        cumulativePercentage += fractionOfPositives
        cumulativeLift = cumulativePercentage/p
        # Lift within this percentile alone, vs. random targeting
        localLift = fractionOfPositives/(1.0/float(nrPercentiles))
        result.append((p, cumulativePercentage, cumulativeLift, localLift))
    return result
def plotLiftCurve(numberOfPercentiles=10, figSize=5):
    ''' Draw interactive (mpld3) lift curves on the hold-out test set for every
        model in finalPredictions, one point per percentile, each carrying an
        HTML tooltip with coverage, yield and lift figures.

        numberOfPercentiles -- number of buckets the ranked test set is split into
        figSize             -- width and height of the (square) figure, in inches

        Returns a JSON-serializable dict of per-model curve data.
    '''
    # CSS used by the custom HTML tooltip labels
    css = """
    table {
      border-collapse: collapse;
    }
    th {
      color: #ffffff;
      background-color: #000000;
    }
    td {
      padding: 2px;
      background-color: #cccccc;
    }
    table, th, td {
      font-family:Arial, Helvetica, sans-serif;
      border: 1px solid black;
      text-align: right;
    }
    """
    jsonLiftData = {}
    liftCurveFigure, ax = plt.subplots(figsize=(figSize,figSize))
    ax.grid(True, alpha=0.3)
    # Pin the y axis to 0..100% so curves of different models are comparable
    ax.set_autoscaley_on(False)
    ax.set_ylim([0,100])
    for modelName, modelProbs in finalPredictions.items():
        shortName = re.split("\s+", modelName)[0]
        # Positive-class probability per test instance
        positiveProbs = [pair[1] for pair in modelProbs['test']]
        rankedTestSet = sortTestSetByProbability(y_test, positiveProbs)
        lifts = calculateLift(rankedTestSet, nrPercentiles=numberOfPercentiles, posLabel=1)
        xPerc = [row[0]*100 for row in lifts]
        yPerc = [row[1]*100 for row in lifts]
        jsonLiftData[shortName] = {'xPerc': list(xPerc),
                                   'yPerc': list(yPerc),
                                   'cumLift': [row[2] for row in lifts],
                                   'localLift': [row[3] for row in lifts]}
        points = ax.plot(xPerc, yPerc, 'o-', label="%s" % shortName)
        labels = ["<table><th colspan='2'>%s</th><tr><td>top-scored</td><td>%2.1f%%</td></tr><tr><td>yields</td><td>%2.1f%%</td></tr><tr><td>cum. lift</td><td>%1.1f</td></tr><tr><td>local lift</td><td>%1.1f</td></tr></table>" % (shortName, xPerc[i],yPerc[i], lifts[i][2], lifts[i][3]) for i in range(len(yPerc))]
        mpld3.plugins.connect(liftCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    ax.set_title("Lift curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(figSize))
    ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    ax.set_ylabel("Percentage of positive instances targeted", labelpad=15 + log(figSize), fontsize=12 + log(figSize))
    plt.legend(loc=4)
    plt.show()
    return jsonLiftData
# Export the lift-curve data to a JSON file for visualization in D3.js or similar
jsonLiftData = plotLiftCurve(numberOfPercentiles=100, figSize=9)
with open('d3/LiftCurve.json', 'w') as outfile:
    json.dump(jsonLiftData, outfile)

from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotLiftCurve(numberOfPercentiles=10, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
# Let the reader explore percentile granularity and figure size interactively
interact(plotLiftCurve, numberOfPercentiles=(5, 20), figSize=(5,10))
<function __main__.plotLiftCurve>
From the given case description, we don't exactly know how much a single contact costs, and how much a successful one would earn in terms of revenue. However, for each combination of these two, we can draw a profit curve.
def calculateProfit(sortedProbsAndTrueLabels,
                    nrPercentiles=20,
                    posLabel=1,
                    avgCostPerContact=10,
                    avgRevenuePerContact=50
                    ):
    ''' Compute cumulative and per-percentile campaign profit over the scored
        test set: everyone in a percentile is contacted (and costs money),
        only positives generate revenue.

        sortedProbsAndTrueLabels -- list of (probability, true label, ...) tuples,
                                    sorted in descending order of probability
        nrPercentiles        -- number of equal-sized buckets to split the list into
        posLabel             -- label value that counts as a successful sale
        avgCostPerContact    -- cost of contacting one customer (non-negative)
        avgRevenuePerContact -- revenue from one successful contact (non-negative)

        Returns a list of nrPercentiles+1 tuples -- the first one is an
        all-zero origin point for plotting -- each of the form:
        (fractionOfTargetPopulationCovered,
         cumulative revenue,
         cumulative cost,
         cumulative profit,
         revenue for this percentile,
         cost for this percentile,
         profit for this percentile)
    '''
    assert(avgCostPerContact >= 0.0), "avgCostPerContact is expressed as a positive float"
    assert(avgRevenuePerContact >= 0.0), "avgRevenuePerContact is expressed as a positive float"
    trueLabels = [x[1] for x in sortedProbsAndTrueLabels]
    # np.array_split tolerates a length that is not divisible by nrPercentiles
    percentileSplitOfInstances = [np_array.tolist() for np_array in np.array_split(np.array(trueLabels), nrPercentiles)]
    result = [(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
    cumulativeCost = 0.0
    cumulativeRevenue = 0.0
    cumulativeProfit = 0.0
    for c, percentile in enumerate(percentileSplitOfInstances):
        nrPositivesInPercentile = len([lbl for lbl in percentile if lbl == posLabel])
        p = float(c+1)/float(nrPercentiles)
        # Every instance in the percentile is contacted; only positives pay off
        costThisPercentile = avgCostPerContact * len(percentile)
        revenueThisPercentile = avgRevenuePerContact * nrPositivesInPercentile
        profitThisPercentile = revenueThisPercentile - costThisPercentile
        cumulativeRevenue += revenueThisPercentile
        cumulativeCost += costThisPercentile
        cumulativeProfit += profitThisPercentile
        result.append((p, cumulativeRevenue, cumulativeCost, cumulativeProfit, revenueThisPercentile, costThisPercentile, profitThisPercentile))
    return result
def plotProfitCurve(numberOfPercentiles=10, figSize=7, avgCostPerContact=15, avgRevenuePerContact=75, yUnit=1000.0):
    ''' Draw interactive (mpld3) profit curves on the hold-out test set for
        every model in finalPredictions, one point per percentile, each
        carrying an HTML tooltip with cumulative and interval profit.

        numberOfPercentiles  -- number of buckets the ranked test set is split into
        figSize              -- width and height of the (square) figure, in inches
        avgCostPerContact    -- assumed cost of contacting one customer
        avgRevenuePerContact -- assumed revenue from one successful contact
        yUnit                -- divisor for the y axis (e.g. 1000.0 = kUSD)

        Returns a JSON-serializable dict of per-model curve data.
    '''
    # CSS used by the custom HTML tooltip labels
    css = """
    table {
      border-collapse: collapse;
    }
    th {
      color: #ffffff;
      background-color: #000000;
    }
    td {
      padding: 2px;
      background-color: #cccccc;
    }
    table, th, td {
      font-family:Arial, Helvetica, sans-serif;
      border: 1px solid black;
      text-align: right;
    }
    """
    jsonProfitData = {}
    profitCurveFigure, ax = plt.subplots(1, figsize=(int(figSize),int(figSize)))
    ax.grid(True, alpha=0.3)
    ax.set_autoscaley_on(True)
    for modelName, modelProbs in finalPredictions.items():
        shortName = re.split("\s+", modelName)[0]
        # Positive-class probability per test instance
        positiveProbs = [pair[1] for pair in modelProbs['test']]
        rankedTestSet = sortTestSetByProbability(y_test, positiveProbs)
        profits = calculateProfit(rankedTestSet,
                                  avgCostPerContact=avgCostPerContact,
                                  avgRevenuePerContact=avgRevenuePerContact,
                                  nrPercentiles=int(numberOfPercentiles),
                                  posLabel=1)
        xPerc = [row[0]*100 for row in profits]
        yProfit = [row[3]/yUnit for row in profits]
        jsonProfitData[shortName] = {'sortedProbsAndTrueLabels': rankedTestSet,
                                     'xPerc': list(xPerc),
                                     'yProfit': list(yProfit),
                                     'cumProfit': [row[3] for row in profits],
                                     'intervalProfit': [row[6] for row in profits]}
        points = ax.plot(xPerc, yProfit, 'o-', label="%s" % shortName)
        labels = ["<table><th colspan='2'>%s</th><tr><td>top-scored</td><td>%2.1f%%</td></tr><tr><td>cum. profit (USD)</td><td>%.0f</td></tr><tr><td>interval profit (USD)</td><td>%.0f</td></tr></table>" % (shortName, xPerc[i],profits[i][3], profits[i][6]) for i in range(len(yProfit))]
        mpld3.plugins.connect(profitCurveFigure, mpld3.plugins.PointHTMLTooltip(points[0],labels=labels, css=css))
    ax.set_title("Profit curve for prediction of successful term deposit sale", y=1.06, fontsize=14 + log(int(figSize)))
    ax.set_xlabel("Percentage of test instances (decreasing by probability)", labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
    ax.set_ylabel("Profit (in %d USD)" % int(yUnit), labelpad=15 + log(int(figSize)), fontsize=12 + log(int(figSize)))
    plt.legend(loc=1)
    plt.show()
    return jsonProfitData
# Export the profit-curve data to a JSON file for visualization in D3.js or similar
jsonProfitData = plotProfitCurve(avgCostPerContact=15, avgRevenuePerContact=50, numberOfPercentiles=100, figSize=9, yUnit=1)
with open('d3/ProfitCurve.json', 'w') as outfile:
    json.dump(jsonProfitData, outfile)

from IPython.html.widgets import interact, fixed
#mpld3.save_html(plotProfitCurve(avgCostPerContact=10, avgRevenuePerContact=50, numberOfPercentiles=20, figSize=9), "plotLiftCurveBanking.html", figid="Lift_Curve_Banking")
# Let the reader explore cost/revenue assumptions interactively
interact(plotProfitCurve, numberOfPercentiles=(10,50), figSize=(5,10), avgCostPerContact=(10,50), avgRevenuePerContact=(20,100))
<function __main__.plotProfitCurve>