In [107]:

from sklearn import linear_model

In [108]:

import pandas as pd

In [109]:

# Load the training set from 'train.csv' (path is relative to the working
# directory). Later cells read the Age, Sex and Survived columns of this frame.
train = pd.read_csv('train.csv')

データの加工¶

In [110]:

# Impute missing ages with the mean of the ages that are present.
train['Age'] = train['Age'].fillna(train['Age'].mean())

# Encode Sex as an integer feature: 1 for 'male', 0 for every other value.
# One vectorized column assignment replaces the per-row chained assignment
# `train.Sex[i] = ...`, which raises SettingWithCopyWarning, may write to a
# temporary copy instead of `train`, and only works with a default 0..n-1 index.
# NOTE: re-running this cell on already-encoded data yields all zeros;
# reload `train` from the CSV before running it again.
train['Sex'] = (train['Sex'] == 'male').astype(int)

ロジスティック回帰¶

In [111]:

# Create an unfitted logistic-regression classifier; no arguments are passed,
# so scikit-learn's default hyperparameters are used.
logiReg = linear_model.LogisticRegression()

Seriesオブジェクト¶

In [112]:

# Target labels: selecting a single column by name yields a pandas Series.
y = train['Survived']
# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(type(y))

<class 'pandas.core.series.Series'>

DataFrameオブジェクト¶

In [113]:

# Feature matrix: selecting with a list of column names yields a DataFrame.
X = train[['Age', 'Sex']]
# Parenthesized single-argument print behaves the same on Python 2 and 3.
print(type(X))

<class 'pandas.core.frame.DataFrame'>

In [114]:

# Fit the classifier on the features X against the labels y. As the last
# expression of the cell, the fitted estimator's repr is displayed.
logiReg.fit(X, y)

Out[114]:

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [115]:

# Inspect the fitted model. Parenthesized single-argument print calls behave
# the same on Python 2 and 3.
print(logiReg.coef_)  # regression coefficients, one per column of X
print(logiReg.intercept_)  # intercept term
# For a classifier, score() is the mean accuracy on (X, y); it is NOT the
# coefficient of determination (R^2), which applies to regressors.
print(logiReg.score(X, y))

[[-0.0042936  -2.41865573]]
[ 1.11913633]
0.786756453423

In [120]:

py = logiReg.predict(X) # in-sample predictions: predicted class for each row of the training features

In [121]:

# Confusion table: rows are the actual labels (y), columns the predicted
# labels (py); each cell counts the rows with that combination.
table = pd.crosstab(index=y, columns=py)
table

Out[121]:

col_0	0	1
Survived
0	468	81
1	109	233

In [122]:

# Cross-check of the model's score: accuracy = share of rows whose prediction
# matches the actual label. Computed from y and py instead of hand-copied
# table counts, so it stays correct if the data or the model change.
# The mean of a boolean Series is a float, so no integer division occurs.
(y == py).mean()

Out[122]:

0.7867564534231201

In [ ]: