In [1]:

import pandas as pd
import numpy as np
import pandas.rpy.common as com

In [5]:

iris = com.load_data('iris')
iris.columns = map(lambda x: x.lower().replace('.', '_'), iris.columns)
iris.head()

Out[5]:

	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	setosa
2	4.9	3.0	1.4	0.2	setosa
3	4.7	3.2	1.3	0.2	setosa
4	4.6	3.1	1.5	0.2	setosa
5	5.0	3.6	1.4	0.2	setosa

In [30]:

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

species = pd.Categorical.from_array(iris['species'])

X = iris[['petal_width', 'petal_length']]
y = species.labels

clf = RandomForestClassifier(n_estimators=500, oob_score=True).fit(X, y)
# predict on training data just for testing
y_predict = clf.predict(X)

# OOB score (how to interpret?)
print 'OOB score: %.2f\n' % clf.oob_score_

# confusion matrix on training data
print 'Confusion matrix:'
print metrics.confusion_matrix(y, y)

OOB score: 0.97

Confusion matrix:
[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]

In [33]:

# to parallelize
clf = RandomForestClassifier(n_estimators=500, oob_score=True, n_jobs=3).fit(X, y)

In [42]:

sepal_length = np.random.normal(loc=iris['sepal_length'].mean(), scale=iris['sepal_length'].std(), size=1000)
sepal_width = np.random.normal(loc=iris['sepal_width'].mean(), scale=iris['sepal_width'].std(), size=1000)
petal_width = np.random.normal(loc=iris['petal_width'].mean(), scale=iris['petal_width'].std(), size=1000)
petal_length = np.random.normal(loc=iris['petal_length'].mean(), scale=iris['petal_length'].std(), size=1000)

newdata = pd.DataFrame(dict(zip(iris.columns, [sepal_length, sepal_width, petal_length, petal_width])))

In [51]:

X_test = newdata[['petal_width', 'petal_length']]
y_predict = clf.predict(X_test)

In [52]:

cols = ['red', 'blue', 'green']

for i in range(len(cols)):
    idx = y_predict == i
    plot(newdata.ix[idx, 'petal_length'], newdata.ix[idx, 'petal_width'], 'o', color=cols[i])

In [ ]: