In [54]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
%matplotlib inline
In [45]:
df = pd.read_csv('../data/classification/Caravan.csv', ';')
df = df.sample(frac = 1)
df.shape
Out[45]:
(5822, 86)
In [47]:
X = df[ df.columns[:-1]  ]
le = LabelEncoder()
y = le.fit_transform(df['Purchase'])
In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.3,
    shuffle = True,
    random_state=2)
In [57]:
C = 1
clf = SVC(kernel='sigmoid', C = C, probability = True)
clf.fit(X_train, y_train)
Out[57]:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [58]:
clf.score(X_test, y_test)
Out[58]:
0.93589009730967376
In [59]:
yhat = clf.predict(X_test)
confusion_matrix(yhat, y_test)
Out[59]:
array([[1635,  101],
       [  11,    0]])
In [60]:
# ROC AUC

yhat_proba = clf.predict_proba(X_test)[:,-1]
print("-- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba)  ))

print(classification_report(yhat, y_test))
-- ROC AUC Score test 0.4099
             precision    recall  f1-score   support

          0       0.99      0.94      0.97      1736
          1       0.00      0.00      0.00        11

avg / total       0.99      0.94      0.96      1747

Oversampling

In [66]:
df_maj = df[df.Purchase == 'No']
df_min = df[df.Purchase == 'Yes'].sample(frac = 4, replace = True )

odf = pd.concat([df_maj, df_min])


odf = odf.sample(frac = 1)
odf.Purchase.value_counts()
Out[66]:
No     5474
Yes    1392
Name: Purchase, dtype: int64
In [75]:
X = odf[ odf.columns[:-1]  ]
le = LabelEncoder()
y = le.fit_transform(odf['Purchase'])
X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.3,
    shuffle = True,
    random_state=2)
In [78]:
C = 1
clf = SVC(kernel='poly', C = C, probability = True, degree = 6)
clf.fit(X_train, y_train)
Out[78]:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=6, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [79]:
yhat = clf.predict(X_test)
print(confusion_matrix(yhat, y_test))

yhat_proba = clf.predict_proba(X_test)[:,-1]
print("-- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba)  ))
[[1465   35]
 [ 184  376]]
-- ROC AUC Score test 0.9127

Undersample

In [115]:
df_maj = df[df.Purchase == 'No'].sample(frac = 0.2)
df_min = df[df.Purchase == 'Yes']

udf = pd.concat([df_maj, df_min])


udf = udf.sample(frac = 1)
udf.Purchase.value_counts()
Out[115]:
No     1095
Yes     348
Name: Purchase, dtype: int64
In [116]:
X = udf[ udf.columns[:-1]  ]
le = LabelEncoder()
y = le.fit_transform(udf['Purchase'])
X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.3,
    shuffle = True,
    random_state=2)
In [117]:
C = 1
clf = SVC(kernel='rbf', C = C, probability = True)
clf.fit(X_train, y_train)
Out[117]:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [118]:
yhat = clf.predict(X_test)
print(confusion_matrix(yhat, y_test))

yhat_proba = clf.predict_proba(X_test)[:,-1]
print("-- ROC AUC Score test {:.4f}".format(roc_auc_score(y_test, yhat_proba)  ))
[[318  85]
 [ 14  16]]
-- ROC AUC Score test 0.6596
In [ ]: