In [3]:

%%html
<link rel="stylesheet" href="static/hyrule.css" type="text/css">

In [4]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from __future__ import division
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)

In [5]:

df = pd.read_pickle('affair_dataframe.pkl')
df.head()

Out[5]:

	rate_marriage	age	yrs_married	children	religious	educ	occupation	occupation_husb	affairs	affair
0	3	32	9.0	3	3	17	2	5	0.111111	1
1	3	27	13.0	3	1	14	3	4	3.230769	1
2	4	22	2.5	0	1	16	3	5	1.400000	1
3	4	37	16.5	4	3	16	5	5	0.727273	1
4	5	27	9.0	1	1	14	3	4	4.666666	1

In [6]:

from sklearn import linear_model as lm
from sklearn import cross_validation as cv
from sklearn import metrics
x_cols = list(df.columns)
y_col = 'affair'
x_cols.remove(y_col)
x_train,x_test,y_train,y_test=cv.train_test_split(df[x_cols],
                                                  df[y_col],
                                                  test_size=0.333,
                                                  random_state=1234)
model = lm.LogisticRegression(penalty='l1', C=1e100).fit(x_train, y_train)
model.coef_

Out[6]:

array([[ -0.23708717,  -0.01274157,   0.07038539,   0.08437953,
         -0.31884344,  -0.02325092,   0.01232658,  -0.09594941,
         10.73613023]])

In [7]:

metrics.accuracy_score(y_test, model.predict(x_test))

Out[7]:

0.96226415094339623

In [8]:

# K-fold validation
kf = cv.KFold(n=len(df), n_folds=10, shuffle=True, random_state=1234)

for train, test in kf:
    model = lm.LogisticRegression(penalty='l1', C=1e100).fit(df.iloc[train][x_cols], 
                                                             df.iloc[train][y_col])
    # Score for most classifiers by default returns accuracy.
    print metrics.accuracy_score(df.iloc[test][y_col], 
                                 model.predict(df.iloc[test][x_cols]))

0.956043956044
0.967032967033
0.967032967033
0.94191522763
0.967032967033
0.970172684458
0.968553459119
0.960691823899
0.954402515723
0.973270440252

In [9]:

import pickle; m = pickle.load(open('affairs.pkl'))

In [11]:

Out[11]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [10]:

m.predict_proba([1,1,1])[:,1]

Out[10]:

array([ 0.33142288])

In [12]:

Out[12]:

11.25

In [ ]: