# Decision-tree classification walk-through on the R datasets `iris` and
# `Cars93` (from package MASS), loaded through pandas' rpy bridge.
#
# The script fits a depth-limited tree on two iris features, renders it to
# 'tree.png', predicts on random points generated in R, then fits an
# unrestricted tree on Cars93 and renders it to 'tree2.png'.
#
# NOTE(review): `plot`, `legend`, `xlabel` and `ylabel` are never imported in
# this file -- presumably it is meant to run inside an IPython session with
# pylab enabled; confirm before running it as a plain script.
# NOTE(review): bare expressions such as `iris.columns` are kept from the
# notebook; outside a notebook cell they have no visible effect.

import StringIO

import numpy as np
import pandas as pd
import pandas.rpy.common as com
import patsy as pt
import pydot
from IPython.core.display import HTML
from sklearn import tree


def _snake_case(name):
    """Return *name* lower-cased with every '.' replaced by '_'."""
    return name.lower().replace('.', '_')


# --- iris: load and tidy the column names ---------------------------------
iris = com.load_data('iris')
iris.columns = [_snake_case(column) for column in iris.columns]
iris.columns
iris['species'].value_counts()

# --- iris: one scatter series per species ----------------------------------
spec = iris.groupby('species')
cols = ['k', 'r', 'g']
# cols[i] still raises IndexError if there are more groups than colours.
for i, (s, df) in enumerate(spec):
    plot(df['petal_width'], df['sepal_width'], 'o', color=cols[i], label=s)
legend()
xlabel('Petal Width')
ylabel('Sepal Width');

# --- iris: fit a depth-3 tree on the two plotted features ------------------
# The `- 1` term removes the intercept column from the design matrix.
y, X = pt.dmatrices('species ~ sepal_width + petal_width - 1', iris)
clf = tree.DecisionTreeClassifier(max_depth=3).fit(X, y)

# Export the fitted tree as Graphviz dot text and render it to a PNG file.
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('tree.png')
# NOTE(review): the HTML payload is empty in this copy -- presumably markup
# referencing 'tree.png' was lost when the notebook was exported; confirm.
HTML('')
clf.tree_.threshold

# --- iris: predict on random points generated in R -------------------------
# The notebook used the `%load_ext rmagic` line magic and a `%%R -o rnewdata`
# cell magic.  They are written as explicit shell calls here so the file is
# valid Python and the extent of the R cell is unambiguous.
get_ipython().run_line_magic('load_ext', 'rmagic')
get_ipython().run_cell_magic(
    'R',
    '-o rnewdata',
    'set.seed(32313)\n'
    'rnewdata <- data.frame(sepal_width = runif(20,2,4.5),'
    'petal_width = runif(20,0,2.5))\n'
)

newdata = pd.DataFrame(rnewdata.T, columns=['sepal_width', 'petal_width'])
pred1 = clf.predict_proba(newdata)
species = iris['species'].unique()
idx = ['0', '1']
# Column labels pair every species with '0' and '1', i.e. two columns per
# species once the per-species arrays in `pred1` are stacked side by side.
cols = [x + '_' + y for x in species for y in idx]
pd.DataFrame(np.hstack(pred1), columns=cols)
# Original author's note: "I don't understand what these probabilities mean".
# NOTE(review): presumably patsy expands `species` into one 0/1 indicator
# column per level, so the tree is fitted as a multi-output problem and each
# `<species>_0` / `<species>_1` pair is P(not that species) / P(that species)
# -- confirm against the scikit-learn multi-output `predict_proba` docs.

# --- Cars93: unrestricted tree predicting drivetrain -----------------------
Cars93 = com.load_data('Cars93', package='MASS')
Cars93.columns = [_snake_case(column) for column in Cars93.columns]
Cars93.ix[:6, :15]

y, X = pt.dmatrices('drivetrain ~ mpg_city + mpg_highway + airbags + \
                    enginesize + width + length + weight + price + \
                    cylinders + horsepower + wheelbase - 1', Cars93)
clf = tree.DecisionTreeClassifier().fit(X, y)

dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('tree2.png')
# NOTE(review): empty payload here as well -- presumably it referenced
# 'tree2.png'; confirm.
HTML('')
# pruning not currently supported