In [1]:

import pandas.rpy.common as com

In [11]:

# Load the SAheart dataset from the R package ElemStatLearn through the
# pandas/rpy bridge, then display its first rows as the cell output.
SAheart = com.load_data('SAheart', package='ElemStatLearn')
SAheart.head()

Out[11]:

	sbp	tobacco	ldl	adiposity	famhist	typea	obesity	alcohol	age	chd
1	160	12.00	5.73	23.11	Present	49	25.30	97.20	52	1
2	144	0.01	4.41	28.61	Absent	55	28.87	2.06	63	1
3	118	0.08	3.48	32.28	Present	52	29.14	3.81	46	0
4	170	7.50	6.41	38.03	Present	51	31.99	24.26	58	1
5	134	13.60	3.50	27.78	Present	60	25.99	57.34	49	1

In [12]:

# Register the rmagic extension so the %%R cell magic used in the sampling
# cell below is available.
# NOTE(review): newer rpy2 releases appear to ship this extension as
# `rpy2.ipython` -- confirm against the installed version.
%load_ext rmagic

In [13]:

%%R -o train
# Draw half of the SAheart row numbers, without replacement, as the training
# sample; `-o train` hands the resulting vector back to Python.
set.seed(8484)
n_rows <- nrow(SAheart)
train <- sample(seq_len(n_rows), size = n_rows / 2, replace = FALSE)

In [37]:

# `train` holds the row numbers drawn by the R cell; every index entry that
# is not among them goes to the held-out half.
test = list(label for label in SAheart.index if label not in train)
trainSA = SAheart.ix[train, :]
testSA = SAheart.ix[test, :]

In [38]:

from statsmodels.formula.api import glm
from statsmodels.api import families as f

In [39]:

# Fit a binomial-family GLM for chd on the training half only.
lm = glm(
    'chd ~ age + alcohol + obesity + tobacco + typea + ldl',
    data=trainSA,
    family=f.Binomial()
).fit()

In [40]:

def missClass(values, prediction, threshold=0.5):
    """Return the fraction of observations whose thresholded prediction is wrong.

    Each entry of ``prediction`` is turned into a 0/1 class label (1 when it
    is strictly greater than ``threshold``) and compared with the matching
    entry of ``values``.

    Parameters
    ----------
    values : array-like
        Observed 0/1 outcomes.
    prediction : array-like
        Predicted scores/probabilities, in the same order as ``values``.
    threshold : float, optional
        Cut-off above which a prediction counts as class 1 (default 0.5,
        the value that was previously hard-coded).

    Returns
    -------
    float
        Misclassified observations divided by the number of observations.

    Raises
    ------
    ValueError
        If ``values`` and ``prediction`` differ in length.
    ZeroDivisionError
        If ``values`` is empty.
    """
    # Local import: the notebook has no top-level numpy import to rely on.
    import numpy as np

    # Work on flat arrays so the comparison is strictly positional: pandas
    # objects would otherwise be matched on their index labels, and plain
    # lists cannot be compared with a float element-wise at all.
    actual = np.asarray(values).ravel()
    predicted = (np.asarray(prediction).ravel() > threshold).astype(int)

    if actual.shape != predicted.shape:
        raise ValueError('values and prediction must have the same length '
                         '(%d != %d)' % (actual.size, predicted.size))

    mismatches = (predicted != actual).sum()
    return float(mismatches) / float(len(actual))

In [52]:

# Error on the rows the model was fitted to, then on the held-out rows
# (predictors only: the last column is sliced off before predicting).
print('training set misclassification %.4f'
      % missClass(trainSA['chd'], lm.fittedvalues))
print('test set misclassification %.4f'
      % missClass(testSA['chd'], lm.predict(testSA.ix[:, :-1])))

training set misclassification 0.2597
test set misclassification 0.3203

In [56]:

# Load the olive data from the R package pgmm, discard its final column,
# and display the first rows of what is kept.
olive = com.load_data('olive', package='pgmm').ix[:, :-1]
olive.head()

Out[56]:

	Region	Area	Palmitic	Palmitoleic	Stearic	Oleic	Linoleic	Linolenic	Arachidic
1	1	1	1075	75	226	7823	672	36	60
2	1	1	1088	73	224	7709	781	31	61
3	1	1	911	54	246	8113	549	31	63
4	1	1	966	57	240	7952	619	50	78
5	1	1	1051	67	259	7771	672	50	80

In [60]:

from sklearn import tree
import patsy as pt
import pandas as pd

In [96]:

# Build the response (Area) and predictor matrices from the formula; the
# trailing "- 1" keeps patsy from adding an intercept column.
y, X = pt.dmatrices('Area ~ Region + Palmitic + Palmitoleic + Stearic + \
                            Oleic + Linoleic + Linolenic + Arachidic - 1', olive)

# Fix the estimator's random_state: scikit-learn permutes the candidate
# features at every split, so an unseeded tree can differ from run to run.
# The value mirrors the seed used in the R sampling cell.
clf = tree.DecisionTreeClassifier(random_state=8484).fit(X, y)

In [97]:

# Render the fitted tree to an image file: export it as Graphviz dot source
# into an in-memory buffer, parse that with pydot, and write tree3.png.
import StringIO, pydot
from IPython.core.display import HTML

dot_data = StringIO.StringIO()  # in-memory text buffer that receives the dot source
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Last expression of the cell, so its return value is what the cell displays.
graph.write_png('tree3.png')
# Disabled inline preview of the image; HTML is imported above only for this line.
#HTML('<img src="files/tree3.png" width=3000 height=300/>')

Out[97]:

True

In [104]:

# Column-wise averages of the olive data (one value per column).
olive.mean(axis=0)

Out[104]:

Region            1.699301
Area              4.599650
Palmitic       1231.741259
Palmitoleic     126.094406
Stearic         228.865385
Oleic          7311.748252
Linoleic        980.527972
Linolenic        31.888112
Arachidic        58.097902

In [111]:

# One synthetic observation: the mean of each predictor column, listed in the
# same order as the terms of the model formula.
newdata = olive[[
    'Region', 'Palmitic', 'Palmitoleic', 'Stearic',
    'Oleic', 'Linoleic', 'Linolenic', 'Arachidic'
]].mean(axis=0)

In [112]:

# `newdata` is a 1-D Series holding a single observation. scikit-learn expects
# a 2-D (n_samples, n_features) input, so pass it explicitly as one row rather
# than relying on implicit promotion of 1-D input.
clf.predict_proba(newdata.values.reshape(1, -1))

Out[112]:

array([[ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [ ]: