%%html
<link rel="stylesheet" href="static/hyrule.css" type="text/css">
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from __future__ import division
# Relax pandas' display limits so long/wide frames are shown in full:
# up to 500 rows, 50 columns, and a 1000-character line width.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 50),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)
# Load the affairs DataFrame from a local pickle file and preview its first rows.
# NOTE(review): read_pickle unpickles arbitrary objects -- only load files from a trusted source.
df = pd.read_pickle('affair_dataframe.pkl')
df.head()
| rate_marriage | age | yrs_married | children | religious | educ | occupation | occupation_husb | affairs | affair |
---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 32 | 9.0 | 3 | 3 | 17 | 2 | 5 | 0.111111 | 1 |
1 | 3 | 27 | 13.0 | 3 | 1 | 14 | 3 | 4 | 3.230769 | 1 |
2 | 4 | 22 | 2.5 | 0 | 1 | 16 | 3 | 5 | 1.400000 | 1 |
3 | 4 | 37 | 16.5 | 4 | 3 | 16 | 5 | 5 | 0.727273 | 1 |
4 | 5 | 27 | 9.0 | 1 | 1 | 14 | 3 | 4 | 4.666666 | 1 |
from sklearn import linear_model as lm
from sklearn import cross_validation as cv
from sklearn import metrics
# The target is the 'affair' column; every other column of df is used as a predictor.
# NOTE(review): 'affairs' therefore stays in x_cols. If 'affair' is derived from
# 'affairs', this leaks the target into the features -- confirm this is intended.
y_col = 'affair'
x_cols = df.columns.tolist()
x_cols.remove(y_col)
# Hold out a 0.333 fraction of the rows for testing; the fixed random_state
# makes the split reproducible between runs.
x_train, x_test, y_train, y_test = cv.train_test_split(
    df[x_cols],
    df[y_col],
    random_state=1234,
    test_size=0.333,
)
# L1-penalised logistic regression. C is the inverse regularisation strength,
# so C=1e100 makes the penalty negligible (effectively an unregularised fit).
model = lm.LogisticRegression(C=1e100, penalty='l1')
model.fit(x_train, y_train)
# Show the fitted coefficients, one per column in x_cols.
model.coef_
array([[ -0.23708717, -0.01274157, 0.07038539, 0.08437953, -0.31884344, -0.02325092, 0.01232658, -0.09594941, 10.73613023]])
# Accuracy of the fitted model on the held-out test split.
metrics.accuracy_score(y_true=y_test, y_pred=model.predict(x_test))
0.96226415094339623
# K-fold validation: refit the same model on each of 10 shuffled folds and
# print the accuracy on the fold that was held out.
# NOTE: the loop rebinds the module-level `model`; after it finishes, `model`
# is the estimator fitted on the last fold.
kf = cv.KFold(n=len(df), n_folds=10, shuffle=True, random_state=1234)
for train, test in kf:
    # Slice each fold once instead of re-indexing df for every column selection.
    fold_train = df.iloc[train]
    fold_test = df.iloc[test]
    model = lm.LogisticRegression(penalty='l1', C=1e100).fit(fold_train[x_cols],
                                                             fold_train[y_col])
    # Score for most classifiers by default returns accuracy.
    # print(...) with a single argument behaves identically on Python 2 and 3.
    print(metrics.accuracy_score(fold_test[y_col],
                                 model.predict(fold_test[x_cols])))
0.956043956044 0.967032967033 0.967032967033 0.94191522763 0.967032967033 0.970172684458 0.968553459119 0.960691823899 0.954402515723 0.973270440252
import pickle

# Load the previously saved classifier from disk.
# Pickles are binary data, so the file is opened in 'rb' mode, and the context
# manager guarantees the handle is closed once loading finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('affairs.pkl', 'rb') as f:
    m = pickle.load(f)
m
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
# Predicted probability of the second class (column 1 of predict_proba) for a
# single sample with three feature values. The sample is wrapped in an outer
# list so the input is explicitly 2-D (one row, three features); a bare 1-D
# list is rejected by newer scikit-learn releases.
m.predict_proba([[1, 1, 1]])[:, 1]
array([ 0.33142288])
11.25