`bank.csv` is already in our repo, so there is no need to download the data from the UCI website.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
/Users/sinanozdemir/anaconda/envs/sfdat26-env/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment. warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
# Load the UCI bank-marketing sample; the file is semicolon-delimited, not comma-delimited.
bank = pd.read_csv('../data/bank.csv', sep=';')
bank.head()
age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | campaign | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | blue-collar | married | basic.9y | no | yes | no | cellular | may | fri | ... | 2 | 999 | 0 | nonexistent | -1.8 | 92.893 | -46.2 | 1.313 | 5099.1 | no |
1 | 39 | services | single | high.school | no | no | no | telephone | may | fri | ... | 4 | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.855 | 5191.0 | no |
2 | 25 | services | married | high.school | no | yes | no | telephone | jun | wed | ... | 1 | 999 | 0 | nonexistent | 1.4 | 94.465 | -41.8 | 4.962 | 5228.1 | no |
3 | 38 | services | married | basic.9y | no | unknown | unknown | telephone | jun | fri | ... | 3 | 999 | 0 | nonexistent | 1.4 | 94.465 | -41.8 | 4.959 | 5228.1 | no |
4 | 47 | admin. | married | university.degree | no | yes | no | cellular | nov | mon | ... | 1 | 999 | 0 | nonexistent | -0.1 | 93.200 | -42.0 | 4.191 | 5195.8 | no |
5 rows × 21 columns
bank.shape
(4119, 21)
# Do we have any null values?
# (count of missing entries per column — all zero for this dataset)
bank.isnull().sum()
age 0 job 0 marital 0 education 0 default 0 housing 0 loan 0 contact 0 month 0 day_of_week 0 duration 0 campaign 0 pdays 0 previous 0 poutcome 0 emp.var.rate 0 cons.price.idx 0 cons.conf.idx 0 euribor3m 0 nr.employed 0 y 0 dtype: int64
# list all columns (for reference)
bank.columns
Index([u'age', u'job', u'marital', u'education', u'default', u'housing', u'loan', u'contact', u'month', u'day_of_week', u'duration', u'campaign', u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx', u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y'], dtype='object')
# Encode the target label as an integer column named "outcome":
# "no" -> 0, "yes" -> 1 (any other label would become NaN).
label_map = {'no': 0, 'yes': 1}
bank['outcome'] = bank['y'].map(label_map)
bank['outcome'].head()
0 0 1 0 2 0 3 0 4 0 Name: outcome, dtype: int64
Let's explore a few columns manually and visually
# Exercise!
# create a boxplot of the age data, grouped by outcome
# TODO
print("\n" * 20)  # parenthesized print works in both Python 2 and 3
# Solution
bank.boxplot(column='age', by='outcome')
# probably not a great feature, why?
<matplotlib.axes._subplots.AxesSubplot at 0x114dd9310>
# looks like a useful feature
# mean conversion rate per job category
bank.groupby('job')['outcome'].mean().plot(kind='bar')
# Ask yourself, WHY is this going to be useful?
<matplotlib.axes._subplots.AxesSubplot at 0x117c38fd0>
# Build dummy variables for the job category (added to the bank DataFrame later).
job_dummies = pd.get_dummies(bank['job'], prefix='job')
# Keep k-1 of the k job columns: the dropped first category is implied
# when every remaining dummy is zero.
job_dummies = job_dummies.iloc[:, 1:]
job_dummies.head()
job_blue-collar | job_entrepreneur | job_housemaid | job_management | job_retired | job_self-employed | job_services | job_student | job_technician | job_unemployed | job_unknown | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
# looks like a useful feature
# mean conversion rate split by credit-default status
bank.groupby('default')['outcome'].mean().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x1180e21d0>
# Exercise!
# How many people have a value of no, yes, and unknown?
# TODO
print("\n" * 20)  # parenthesized print works in both Python 2 and 3
# Solution!
bank.default.value_counts()
# only one person in the dataset has a status of yes
no 3315 unknown 803 yes 1 Name: default, dtype: int64
# Collapse "default" to two classes instead of three:
# "no" -> 0, while "unknown" and "yes" both -> 1 (only a single "yes" exists).
default_codes = {'no': 0, 'unknown': 1, 'yes': 1}
bank['default'] = bank['default'].map(default_codes)
bank.head()
age | job | marital | education | default | housing | loan | contact | month | day_of_week | ... | pdays | previous | poutcome | emp.var.rate | cons.price.idx | cons.conf.idx | euribor3m | nr.employed | y | outcome | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 30 | blue-collar | married | basic.9y | 0 | yes | no | cellular | may | fri | ... | 999 | 0 | nonexistent | -1.8 | 92.893 | -46.2 | 1.313 | 5099.1 | no | 0 |
1 | 39 | services | single | high.school | 0 | no | no | telephone | may | fri | ... | 999 | 0 | nonexistent | 1.1 | 93.994 | -36.4 | 4.855 | 5191.0 | no | 0 |
2 | 25 | services | married | high.school | 0 | yes | no | telephone | jun | wed | ... | 999 | 0 | nonexistent | 1.4 | 94.465 | -41.8 | 4.962 | 5228.1 | no | 0 |
3 | 38 | services | married | basic.9y | 0 | unknown | unknown | telephone | jun | fri | ... | 999 | 0 | nonexistent | 1.4 | 94.465 | -41.8 | 4.959 | 5228.1 | no | 0 |
4 | 47 | admin. | married | university.degree | 0 | yes | no | cellular | nov | mon | ... | 999 | 0 | nonexistent | -0.1 | 93.200 | -42.0 | 4.191 | 5195.8 | no | 0 |
5 rows × 22 columns
# looks like a useful feature
# mean conversion rate by contact method
bank.groupby('contact')['outcome'].mean()
contact cellular 0.141403 telephone 0.051806 Name: outcome, dtype: float64
# Exercise!
# Convert the feature to numeric values, 0 for cellular, 1 for telephone
# TODO
print("\n" * 20)  # parenthesized print works in both Python 2 and 3
# Solution!
bank['contact'] = bank.contact.map({'cellular':0, 'telephone':1})
# looks like a useful feature at first glance
# mean conversion rate by month of the last contact
bank.groupby('month')['outcome'].mean()
month apr 0.167442 aug 0.100629 dec 0.545455 jul 0.082982 jun 0.128302 mar 0.583333 may 0.065312 nov 0.096413 oct 0.362319 sep 0.406250 Name: outcome, dtype: float64
# but, it looks like their success rate is actually just correlated with number of calls
# thus, the month feature is unlikely to generalize
# sort_values replaces the deprecated DataFrame.sort (see the FutureWarning below)
bank.groupby('month').outcome.agg(['count', 'mean']).sort_values('count')
/Users/sinanozdemir/anaconda/envs/sfdat26-env/lib/python2.7/site-packages/ipykernel/__main__.py:3: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) app.launch_new_instance()
count | mean | |
---|---|---|
month | ||
dec | 22 | 0.545455 |
mar | 48 | 0.583333 |
sep | 64 | 0.406250 |
oct | 69 | 0.362319 |
apr | 215 | 0.167442 |
nov | 446 | 0.096413 |
jun | 530 | 0.128302 |
aug | 636 | 0.100629 |
jul | 711 | 0.082982 |
may | 1378 | 0.065312 |
bank.groupby('month').outcome.agg(['count', 'mean']).sort('count').corr()
/Users/sinanozdemir/anaconda/envs/sfdat26-env/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....) if __name__ == '__main__':
count | mean | |
---|---|---|
count | 1.000000 | -0.766364 |
mean | -0.766364 | 1.000000 |
# looks like an excellent feature, can you think of a reason why duration isn't helpful in practice?
# boxplot of call duration split by outcome
bank.boxplot(column='duration', by='outcome')
<matplotlib.axes._subplots.AxesSubplot at 0x11847a610>
# Answer: You can't know the duration of a call beforehand, thus it can't be used in your model
# looks like a useful feature
# mean conversion rate by number of previous contacts
bank.groupby('previous')['outcome'].mean()
previous 0 0.082884 1 0.208421 2 0.410256 3 0.600000 4 0.714286 5 1.000000 6 0.500000 Name: outcome, dtype: float64
# looks like a useful feature
# mean conversion rate by outcome of the previous campaign
bank.groupby('poutcome')['outcome'].mean()
poutcome failure 0.147577 nonexistent 0.082884 success 0.647887 Name: outcome, dtype: float64
# Build dummy variables for poutcome, again keeping k-1 of the k columns.
poutcome_dummies = pd.get_dummies(bank['poutcome'], prefix='poutcome')
poutcome_dummies = poutcome_dummies.iloc[:, 1:]
# Attach the job and poutcome dummy columns to the bank DataFrame.
bank = pd.concat([bank, job_dummies, poutcome_dummies], axis=1)
# looks like an excellent feature
# boxplot of the 3-month Euribor rate split by outcome
bank.boxplot(column='euribor3m', by='outcome')
<matplotlib.axes._subplots.AxesSubplot at 0x1186bf150>
# new list of columns (including dummy columns)
bank.columns
Index([u'age', u'job', u'marital', u'education', u'default', u'housing', u'loan', u'contact', u'month', u'day_of_week', u'duration', u'campaign', u'pdays', u'previous', u'poutcome', u'emp.var.rate', u'cons.price.idx', u'cons.conf.idx', u'euribor3m', u'nr.employed', u'y', u'outcome', u'job_blue-collar', u'job_entrepreneur', u'job_housemaid', u'job_management', u'job_retired', u'job_self-employed', u'job_services', u'job_student', u'job_technician', u'job_unemployed', u'job_unknown', u'poutcome_nonexistent', u'poutcome_success'], dtype='object')
# Assemble the feature matrix: four hand-picked columns plus the 13 dummy columns
# that were appended at the end of the DataFrame.
dummy_cols = list(bank.columns[-13:])
feature_cols = ['default', 'contact', 'previous', 'euribor3m'] + dummy_cols
X = bank[feature_cols]
# Response vector (0/1 encoded target).
y = bank.outcome
# Exercise!
# calculate cross-validated Accuracy for logistic regression, and knn with 5 neighbors
# TODO
print("\n" * 20)  # parenthesized print works in both Python 2 and 3
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
# Solution!
# 10-fold cross-validated accuracy for logistic regression
logreg = LogisticRegression()
logreg_accuracy = cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean()
logreg_accuracy
0.90215602496498271
# 10-fold cross-validated accuracy for k-nearest neighbors with k=5
knn = KNeighborsClassifier(n_neighbors=5)
cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean()
0.90046229940191136
# Exercise!
# use grid search to try k in range from 1-29 (odd only) for knn, and graph the accuracy from a cross val score vs the k
# TODO
print("\n" * 20)  # parenthesized print works in both Python 2 and 3
# Solution!
from sklearn.grid_search import GridSearchCV
# Try every odd k with 5-fold cross-validated accuracy.
# NOTE(review): the exercise says "1-29" but range(1, 29, 2) stops at 27 — confirm intent.
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, {'n_neighbors': range(1, 29, 2)}, cv=5, scoring='accuracy')
grid.fit(X, y)
GridSearchCV(cv=5, error_score='raise', estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform'), fit_params={}, iid=True, n_jobs=1, param_grid={'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27]}, pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)
# Each entry of grid_scores_ is a (parameters, mean_validation_score, cv_scores) tuple;
# pull out the mean CV accuracy for each candidate k.
grid_mean_scores = [mean_score for _, mean_score, _ in grid.grid_scores_]
plt.figure()
plt.plot(range(1, 29, 2), grid_mean_scores)
[<matplotlib.lines.Line2D at 0x11b643b10>]
grid.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=27, p=2, weights='uniform')
# looks like adding to k after 5 doesn't even help that much....
# Exercise!
# Create a confusion matrix from the best estimator in the gridsearch and find the accuracy, sensitivity, and specificity
# TODO
print("\n" * 20)  # parenthesized print works in both Python 2 and 3
# Solution!
from sklearn import metrics
# NOTE: predictions are made on the training data itself, so these numbers are optimistic
preds = grid.best_estimator_.predict(X)
print(metrics.confusion_matrix(y, preds))
# Accuracy = (3625 + 100) / 4119 == .904
# Sensitivity = 100 / (100 + 351) == .222
# Specificity = 3625 / (3625 + 43) == .988
[[3625 43] [ 351 100]]
# Exercise!
# Compare the accuracy to the null accuracy rate
# TODO
print("\n" * 20)  # parenthesized print works in both Python 2 and 3
# Solution!
# proportion of the majority class (outcome == 0); float() guards against
# Python 2 integer division
null_accuracy_rate = bank['outcome'].value_counts()[0] / float(bank.shape[0])
null_accuracy_rate
# accuracy is greater than the null accuracy rate!
0.89050740470988099
# What can we do to make the model better?
# Thought Experiment
What if we want to predict (regress on) the duration of the call, so the rep can better plan their day?
bank.corr()
age | default | contact | duration | campaign | pdays | previous | emp.var.rate | cons.price.idx | cons.conf.idx | ... | job_management | job_retired | job_self-employed | job_services | job_student | job_technician | job_unemployed | job_unknown | poutcome_nonexistent | poutcome_success | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
age | 1.000000 | 0.161427 | 0.012700 | 0.041299 | -0.014169 | -0.043425 | 0.050931 | -0.019192 | -0.000482 | 0.098135 | ... | 0.065606 | 0.412542 | 0.010991 | -0.050379 | -0.185453 | -0.064930 | -0.009394 | 0.063831 | -0.031205 | 0.043341 |
default | 0.161427 | 1.000000 | 0.150494 | -0.020140 | 0.021239 | 0.089514 | -0.092754 | 0.184389 | 0.162098 | 0.021570 | ... | -0.043783 | 0.023665 | -0.019191 | 0.021453 | -0.017566 | -0.081766 | -0.010086 | 0.065701 | 0.091124 | -0.086343 |
contact | 0.012700 | 0.150494 | 1.000000 | -0.028185 | 0.085654 | 0.120670 | -0.215464 | 0.383405 | 0.574452 | 0.252035 | ... | -0.045940 | -0.038983 | 0.003609 | 0.024214 | -0.018891 | -0.021849 | 0.001462 | 0.021516 | 0.251147 | -0.109971 |
duration | 0.041299 | -0.020140 | -0.028185 | 1.000000 | -0.085348 | -0.046998 | 0.025724 | -0.028848 | 0.016672 | -0.034745 | ... | -0.011460 | 0.044257 | -0.001466 | -0.030936 | 0.016982 | -0.006173 | -0.004565 | -0.008591 | -0.021259 | 0.046154 |
campaign | -0.014169 | 0.021239 | 0.085654 | -0.085348 | 1.000000 | 0.058742 | -0.091490 | 0.176079 | 0.145021 | 0.007882 | ... | -0.011967 | -0.011147 | 0.021876 | 0.038893 | -0.032527 | -0.004871 | 0.005467 | -0.007765 | 0.098957 | -0.054564 |
pdays | -0.043425 | 0.089514 | 0.120670 | -0.046998 | 0.058742 | 1.000000 | -0.587941 | 0.270684 | 0.058472 | -0.092090 | ... | -0.011278 | -0.067405 | 0.007591 | 0.031058 | -0.043072 | 0.009654 | -0.028664 | -0.006368 | 0.488761 | -0.940565 |
previous | 0.050931 | -0.092754 | -0.215464 | 0.025724 | -0.091490 | -0.587941 | 1.000000 | -0.415238 | -0.164922 | -0.051420 | ... | 0.030516 | 0.051054 | 0.006366 | -0.007326 | 0.078260 | -0.017419 | 0.013485 | -0.001958 | -0.854241 | 0.510865 |
emp.var.rate | -0.019192 | 0.184389 | 0.383405 | -0.028848 | 0.176079 | 0.270684 | -0.415238 | 1.000000 | 0.755155 | 0.195022 | ... | -0.025176 | -0.087204 | 0.000153 | -0.003592 | -0.116507 | 0.047053 | -0.028425 | 0.013772 | 0.482106 | -0.256587 |
cons.price.idx | -0.000482 | 0.162098 | 0.574452 | 0.016672 | 0.145021 | 0.058472 | -0.164922 | 0.755155 | 1.000000 | 0.045835 | ... | -0.048750 | -0.032616 | -0.001529 | 0.025988 | -0.032322 | -0.002939 | -0.003210 | 0.016884 | 0.289799 | -0.060257 |
cons.conf.idx | 0.098135 | 0.021570 | 0.252035 | -0.034745 | 0.007882 | -0.092090 | -0.051420 | 0.195022 | 0.045835 | 1.000000 | ... | -0.005887 | 0.065338 | 0.007642 | -0.073024 | 0.025850 | 0.058946 | -0.000228 | 0.032504 | 0.100217 | 0.087382 |
euribor3m | -0.015033 | 0.176800 | 0.390313 | -0.032329 | 0.159435 | 0.301478 | -0.458851 | 0.970308 | 0.657159 | 0.276595 | ... | -0.011447 | -0.088148 | 0.005542 | -0.012849 | -0.119714 | 0.046468 | -0.023823 | 0.017662 | 0.502340 | -0.282272 |
nr.employed | -0.041936 | 0.167754 | 0.261496 | -0.044218 | 0.161037 | 0.381983 | -0.514853 | 0.897173 | 0.472560 | 0.107054 | ... | -0.002022 | -0.112450 | 0.001342 | -0.002975 | -0.144023 | 0.042848 | -0.030453 | 0.010605 | 0.508717 | -0.354030 |
outcome | 0.060374 | -0.076567 | -0.137401 | 0.418565 | -0.076091 | -0.332012 | 0.255697 | -0.283216 | -0.098326 | 0.054393 | ... | -0.015814 | 0.078373 | -0.017796 | -0.021253 | 0.055781 | 0.009032 | 0.032872 | -0.002169 | -0.207179 | 0.325804 |
job_blue-collar | -0.042976 | 0.167771 | 0.098988 | 0.010396 | -0.021404 | 0.062240 | -0.041762 | 0.050353 | 0.080969 | -0.122025 | ... | -0.152741 | -0.107122 | -0.104747 | -0.169771 | -0.074502 | -0.234697 | -0.086993 | -0.051108 | 0.026745 | -0.053400 |
job_entrepreneur | 0.039110 | 0.020113 | 0.028028 | -0.005750 | -0.024137 | 0.032033 | -0.017265 | 0.022054 | 0.011657 | -0.014320 | ... | -0.056409 | -0.039561 | -0.038684 | -0.062698 | -0.027514 | -0.086676 | -0.032128 | -0.018875 | 0.008954 | -0.029330 |
job_housemaid | 0.089297 | 0.036203 | 0.018311 | -0.017642 | -0.004750 | 0.017684 | -0.027623 | 0.036182 | 0.022542 | 0.042351 | ... | -0.048400 | -0.033944 | -0.033192 | -0.053796 | -0.023608 | -0.074370 | -0.027566 | -0.016195 | 0.042448 | -0.014792 |
job_management | 0.065606 | -0.043783 | -0.045940 | -0.011460 | -0.011967 | -0.011278 | 0.030516 | -0.025176 | -0.048750 | -0.005887 | ... | 1.000000 | -0.059877 | -0.058549 | -0.094895 | -0.041643 | -0.131185 | -0.048626 | -0.028567 | -0.007995 | 0.004104 |
job_retired | 0.412542 | 0.023665 | -0.038983 | 0.044257 | -0.011147 | -0.067405 | 0.051054 | -0.087204 | -0.032616 | 0.065338 | ... | -0.059877 | 1.000000 | -0.041062 | -0.066553 | -0.029206 | -0.092005 | -0.034103 | -0.020035 | -0.045550 | 0.056007 |
job_self-employed | 0.010991 | -0.019191 | 0.003609 | -0.001466 | 0.021876 | 0.007591 | 0.006366 | 0.000153 | -0.001529 | 0.007642 | ... | -0.058549 | -0.041062 | 1.000000 | -0.065077 | -0.028558 | -0.089964 | -0.033346 | -0.019591 | -0.014306 | -0.003325 |
job_services | -0.050379 | 0.021453 | 0.024214 | -0.030936 | 0.038893 | 0.031058 | -0.007326 | -0.003592 | 0.025988 | -0.073024 | ... | -0.094895 | -0.066553 | -0.065077 | 1.000000 | -0.046286 | -0.145812 | -0.054047 | -0.031752 | 0.002033 | -0.025132 |
job_student | -0.185453 | -0.017566 | -0.018891 | 0.016982 | -0.032527 | -0.043072 | 0.078260 | -0.116507 | -0.032322 | 0.025850 | ... | -0.041643 | -0.029206 | -0.028558 | -0.046286 | 1.000000 | -0.063988 | -0.023718 | -0.013934 | -0.084656 | 0.011176 |
job_technician | -0.064930 | -0.081766 | -0.021849 | -0.006173 | -0.004871 | 0.009654 | -0.017419 | 0.047053 | -0.002939 | 0.058946 | ... | -0.131185 | -0.092005 | -0.089964 | -0.145812 | -0.063988 | 1.000000 | -0.074716 | -0.043896 | 0.020288 | -0.013611 |
job_unemployed | -0.009394 | -0.010086 | 0.001462 | -0.004565 | 0.005467 | -0.028664 | 0.013485 | -0.028425 | -0.003210 | -0.000228 | ... | -0.048626 | -0.034103 | -0.033346 | -0.054047 | -0.023718 | -0.074716 | 1.000000 | -0.016270 | -0.012524 | 0.034295 |
job_unknown | 0.063831 | 0.065701 | 0.021516 | -0.008591 | -0.007765 | -0.006368 | -0.001958 | 0.013772 | 0.016884 | 0.032504 | ... | -0.028567 | -0.020035 | -0.019591 | -0.031752 | -0.013934 | -0.043896 | -0.016270 | 1.000000 | 0.004583 | 0.009007 |
poutcome_nonexistent | -0.031205 | 0.091124 | 0.251147 | -0.021259 | 0.098957 | 0.488761 | -0.854241 | 0.482106 | 0.289799 | 0.100217 | ... | -0.007995 | -0.045550 | -0.014306 | 0.002033 | -0.084656 | 0.020288 | -0.012524 | 0.004583 | 1.000000 | -0.459409 |
poutcome_success | 0.043341 | -0.086343 | -0.109971 | 0.046154 | -0.054564 | -0.940565 | 0.510865 | -0.256587 | -0.060257 | 0.087382 | ... | 0.004104 | 0.056007 | -0.003325 | -0.025132 | 0.011176 | -0.013611 | 0.034295 | 0.009007 | -0.459409 | 1.000000 |
26 rows × 26 columns
# correlation between age and call duration only
bank[['age', 'duration']].corr()
# age won't be useful
age | duration | |
---|---|---|
age | 1.000000 | 0.041299 |
duration | 0.041299 | 1.000000 |
# Mean call duration per education level.
bank.groupby('education')['duration'].mean().plot(kind='bar')
# or education, except that illiterate column is interesting..
<matplotlib.axes._subplots.AxesSubplot at 0x11c8b59d0>
# Mean call duration per marital status.
bank.groupby('marital')['duration'].mean().plot(kind='bar')
# or marital
<matplotlib.axes._subplots.AxesSubplot at 0x11ccd5490>
# Mean call duration per job category.
bank.groupby('job')['duration'].mean().plot(kind='bar')
# job maybe..
<matplotlib.axes._subplots.AxesSubplot at 0x11c75a150>
bank.groupby('day_of_week').duration.mean().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x11c8992d0>
bank.groupby('previous').duration.mean().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x11c908590>
# Regression target: the call duration.
duration_response = bank['duration']
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
# the MSE scorer's sign convention varies across sklearn versions; abs() normalizes it before sqrt
mse = cross_val_score(linreg, X, duration_response, cv=5, scoring='mean_squared_error').mean()
rmse = np.sqrt(abs(mse))
rmse
255.17246331494047
# Not that great..
# job field
# Mean call duration per job category (revisited).
bank.groupby('job')['duration'].mean().plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x11d87d510>
# Only with the retired field
linreg = LinearRegression()
retired_only = X[['job_retired']]
# abs() normalizes the scorer's sign convention before taking the square root
mse = cross_val_score(linreg, retired_only, duration_response, cv=5, scoring='mean_squared_error').mean()
rmse = np.sqrt(abs(mse))
rmse
# How can we make this a bit better?