import pandas.rpy.common as com
# Pull the iris data set over from R and give its columns Python-friendly
# names: lower-case, with '.' replaced by '_'.
iris = com.load_data('iris')
iris.columns = [col.lower().replace('.', '_') for col in iris.columns]
iris.columns
# Count of observations for each species.
iris['species'].value_counts()
# Scatter sepal width against petal width, one colour per species group.
# NOTE(review): plot/legend/xlabel/ylabel are not imported anywhere in this
# file; presumably they come from the pylab namespace -- confirm.
spec = iris.groupby('species')
cols = ['k', 'r', 'g']
# enumerate supplies the colour index; a group beyond the palette length
# still raises IndexError rather than being silently skipped.
for i, (s, df) in enumerate(spec):
    plot(df['petal_width'], df['sepal_width'], 'o', color=cols[i], label=s)
legend()
xlabel('Petal Width')
ylabel('Sepal Width');
import patsy as pt
from sklearn import tree

# Build the response and predictor matrices from a formula: species is the
# response, the two widths are the predictors, and "- 1" drops the intercept.
y, X = pt.dmatrices('species ~ sepal_width + petal_width - 1', iris)
# Classification tree limited to a depth of three.
clf = tree.DecisionTreeClassifier(max_depth=3)
clf.fit(X, y)
import StringIO
import pydot
from IPython.core.display import HTML

# Write the fitted tree as Graphviz DOT text into an in-memory buffer, then
# turn that text into a PNG file on disk.
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
dot_text = dot_data.getvalue()
graph = pydot.graph_from_dot_data(dot_text)
graph.write_png('tree.png')
# NOTE(review): the markup passed to HTML is empty, so nothing is rendered;
# presumably an <img> tag pointing at tree.png was intended -- confirm.
HTML('')
# Show the threshold array stored on the fitted tree.
clf.tree_.threshold
# Load the rmagic IPython extension; the %%R cell magic below relies on it.
%load_ext rmagic
%%R -o rnewdata
# Seed R's random number generator so the simulated data are reproducible.
set.seed(32313)
# Twenty uniform draws per column: sepal_width in [2, 4.5] and petal_width in
# [0, 2.5]. The Python code that follows reads rnewdata, so "-o rnewdata"
# presumably exports it to the Python namespace -- confirm in the rmagic docs.
rnewdata <- data.frame(sepal_width = runif(20,2,4.5),petal_width = runif(20,0,2.5))
# numpy is imported explicitly so np.hstack does not depend on an interactive
# (pylab) namespace having injected `np`.
import numpy as np
import pandas as pd

# Score the simulated observations with the fitted tree.
newdata = pd.DataFrame(rnewdata.T, columns=['sepal_width', 'petal_width'])
pred1 = clf.predict_proba(newdata)
species = iris['species'].unique()
idx = ['0', '1']
# One label per (species, indicator value) pair. The loop variables are
# deliberately not called x / y: under Python 2 comprehension variables leak
# into the enclosing scope and would overwrite the response matrix `y`.
cols = [name + '_' + flag for name in species for flag in idx]
pd.DataFrame(np.hstack(pred1), columns=cols)
# The original author flagged these probabilities as unclear.
# NOTE(review): likely interpretation, to be confirmed -- the tree was fit on
# the matrix patsy built for the categorical `species` response (one 0/1
# column per species), so predict_proba returns one array per response column
# and np.hstack joins them side by side. Under that reading `<species>_1` is
# the predicted probability that a row belongs to that species and
# `<species>_0` is its complement. The labels assume the order of
# iris['species'].unique() matches the response-column order.
# Load the Cars93 data set from R's MASS package and normalise its column
# names: lower-case, with '.' replaced by '_'.
Cars93 = com.load_data('Cars93', package='MASS')
Cars93.columns = [col.lower().replace('.', '_') for col in Cars93.columns]
# Display a leading slice of rows and columns.
Cars93.ix[:6, :15]
# Model drivetrain from the listed car attributes; "- 1" drops the intercept.
# Adjacent string literals are joined by the parser, giving a single formula
# string without backslash continuations.
formula = ('drivetrain ~ mpg_city + mpg_highway + airbags + '
           'enginesize + width + length + weight + price + '
           'cylinders + horsepower + wheelbase - 1')
y, X = pt.dmatrices(formula, Cars93)
# Tree with default settings (no depth limit is passed here).
clf = tree.DecisionTreeClassifier()
clf.fit(X, y)
# Write the Cars93 tree as Graphviz DOT text into an in-memory buffer, then
# turn that text into a PNG file on disk.
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data)
dot_text = dot_data.getvalue()
graph = pydot.graph_from_dot_data(dot_text)
graph.write_png('tree2.png')
# NOTE(review): the markup passed to HTML is empty, so nothing is rendered;
# presumably an <img> tag pointing at tree2.png was intended -- confirm.
HTML('')
# pruning not currently supported