import pandas.rpy.common as com
# Load the 'SAheart' data set from the R package 'ElemStatLearn' through
# the pandas rpy bridge (imported above as `com`).
SAheart = com.load_data('SAheart', package='ElemStatLearn')
# Notebook cell result: preview the first rows of the loaded data.
SAheart.head()
| | sbp | tobacco | ldl | adiposity | famhist | typea | obesity | alcohol | age | chd |
---|---|---|---|---|---|---|---|---|---|---|
1 | 160 | 12.00 | 5.73 | 23.11 | Present | 49 | 25.30 | 97.20 | 52 | 1 |
2 | 144 | 0.01 | 4.41 | 28.61 | Absent | 55 | 28.87 | 2.06 | 63 | 1 |
3 | 118 | 0.08 | 3.48 | 32.28 | Present | 52 | 29.14 | 3.81 | 46 | 0 |
4 | 170 | 7.50 | 6.41 | 38.03 | Present | 51 | 31.99 | 24.26 | 58 | 1 |
5 | 134 | 13.60 | 3.50 | 27.78 | Present | 60 | 25.99 | 57.34 | 49 | 1 |
%load_ext rmagic
%%R -o train
# Fix the RNG seed so the random split below is reproducible.
set.seed(8484)
# Sample half of SAheart's row numbers, without replacement, as the
# training rows.
n_rows <- nrow(SAheart)
train <- sample(1:n_rows, size = n_rows / 2, replace = FALSE)
# Split SAheart into the sampled training rows and the remaining test rows.
# NOTE(review): `.ix` is deprecated in newer pandas; `.loc` is the
# label-based replacement if the index holds integer labels -- confirm
# before switching.
trainSA = SAheart.ix[train, :]
# Build the lookup set once: testing `x not in train` against the raw
# sample rescans the whole sample for every index label.
train_labels = set(train)
# A concrete list (rather than `filter`, which is lazy on Python 3) so the
# result can be used directly as an indexer on either Python version.
test = [label for label in SAheart.index if label not in train_labels]
testSA = SAheart.ix[test, :]
from statsmodels.formula.api import glm
from statsmodels.api import families as f
# Binomial-family GLM of chd on six predictors, specified on the training
# half only; the fitted results object is kept under the name `lm`.
binomial_glm = glm('chd ~ age + alcohol + obesity + tobacco + typea + ldl', trainSA, family=f.Binomial())
lm = binomial_glm.fit()
def missClass(values, prediction):
    """Return the fraction of observations that are misclassified.

    A prediction strictly greater than 0.5 is counted as class 1 and
    anything else as class 0; the resulting labels are compared
    element-wise with ``values``.

    Args:
        values: Array-like of true 0/1 labels (e.g. a pandas Series or a
            NumPy array).
        prediction: Array-like of predicted scores supporting vectorised
            comparison (``prediction > 0.5``), same length as ``values``.

    Returns:
        float: Number of mismatches divided by ``len(values)``.

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    # Threshold the scores at 0.5; multiplying by 1 turns booleans into 0/1.
    predicted_labels = (prediction > 0.5) * 1
    mismatches = predicted_labels != values
    return float(sum(mismatches)) / float(len(values))
# Report the misclassification rate on the training half and on the
# held-out half. print is called with one parenthesised argument so the
# same line is valid under both Python 2 and Python 3.
print('training set misclassification %.4f' % missClass(trainSA['chd'], lm.fittedvalues))
# .iloc[:, :-1] drops the last column by position before predicting;
# it replaces the deprecated .ix indexer.
print('test set misclassification %.4f' % missClass(testSA['chd'], lm.predict(testSA.iloc[:, :-1])))
training set misclassification 0.2597
test set misclassification 0.3203
# Load the 'olive' data set from the R package 'pgmm'.
olive = com.load_data('olive', package='pgmm')
# Drop the last column by position; .iloc replaces the deprecated .ix
# indexer and is purely positional.
olive = olive.iloc[:, :-1]
# Notebook cell result: preview the first rows.
olive.head()
| | Region | Area | Palmitic | Palmitoleic | Stearic | Oleic | Linoleic | Linolenic | Arachidic |
---|---|---|---|---|---|---|---|---|---|
1 | 1 | 1 | 1075 | 75 | 226 | 7823 | 672 | 36 | 60 |
2 | 1 | 1 | 1088 | 73 | 224 | 7709 | 781 | 31 | 61 |
3 | 1 | 1 | 911 | 54 | 246 | 8113 | 549 | 31 | 63 |
4 | 1 | 1 | 966 | 57 | 240 | 7952 | 619 | 50 | 78 |
5 | 1 | 1 | 1051 | 67 | 259 | 7771 | 672 | 50 | 80 |
from sklearn import tree
import patsy as pt
import pandas as pd
# Model Area from Region and the seven fatty-acid columns; the trailing
# "- 1" removes the intercept column from the design matrix.
olive_formula = ('Area ~ Region + Palmitic + Palmitoleic + Stearic + '
                 'Oleic + Linoleic + Linolenic + Arachidic - 1')
y, X = pt.dmatrices(olive_formula, olive)
# Fit a decision tree with default settings on the full data set.
clf = tree.DecisionTreeClassifier()
clf.fit(X, y)
import StringIO, pydot
from IPython.core.display import HTML
# Render the fitted tree: export it as Graphviz dot text into an in-memory
# buffer, parse that text with pydot and write the result to tree3.png.
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
parsed = pydot.graph_from_dot_data(dot_data.getvalue())
# Older pydot returns a single graph, pydot >= 1.2 returns a list of
# graphs; accept both so write_png is always called on a graph object.
graph = parsed[0] if isinstance(parsed, (list, tuple)) else parsed
graph.write_png('tree3.png')
#HTML('<img src="files/tree3.png" width=3000 height=300/>')
True
# Notebook cell result: the mean of every column of olive.
olive.mean()
Region 1.699301
Area 4.599650
Palmitic 1231.741259
Palmitoleic 126.094406
Stearic 228.865385
Oleic 7311.748252
Linoleic 980.527972
Linolenic 31.888112
Arachidic 58.097902
# Build one synthetic observation from the column means of the predictors,
# listed in the same order as in the formula used to build X.
predictor_columns = ['Region', 'Palmitic', 'Palmitoleic', 'Stearic',
                     'Oleic', 'Linoleic', 'Linolenic', 'Arachidic']
newdata = olive[predictor_columns].mean()
# Pass the sample as an explicit (1, n_features) array: newer scikit-learn
# rejects 1-D input, and older versions accept the 2-D form as well.
clf.predict_proba(newdata.values.reshape(1, -1))
array([[ 0., 1., 0., 0., 0., 0., 0., 0., 0.]])