import pandas as pd
import numpy as np
import pandas.rpy.common as com

# Load the "iris" data set through pandas' R bridge, then normalise the column
# names: lower-case them and replace every '.' with '_' so the rest of the
# script can refer to columns such as 'petal_width'.
iris = com.load_data('iris')
iris.columns = [name.lower().replace('.', '_') for name in iris.columns]
# Bare expression: only shows a preview when run in an interactive session.
iris.head()

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Encode the species column as a categorical; its integer labels are the
# classification target.
species = pd.Categorical.from_array(iris['species'])

# Features: the two petal measurements only.
X = iris[['petal_width', 'petal_length']]
y = species.labels

# oob_score=True makes the forest record an out-of-bag score while fitting.
clf = RandomForestClassifier(n_estimators=500, oob_score=True).fit(X, y)
# predict on training data just for testing
y_predict = clf.predict(X)

# OOB score: accuracy on the training set where each sample is scored only by
# the trees whose bootstrap sample did not contain it, so it acts as a
# built-in estimate of generalisation accuracy without a separate test set.
print('OOB score: %.2f\n' % clf.oob_score_)

# Confusion matrix on training data: true labels against the forest's
# predictions. (Comparing y with itself would always give a perfect diagonal.)
print('Confusion matrix:')
print(metrics.confusion_matrix(y, y_predict))

# Refit the classifier with the same settings, this time passing n_jobs=3 so
# scikit-learn can spread the work over three parallel jobs.
clf = RandomForestClassifier(n_estimators=500, n_jobs=3, oob_score=True)
clf.fit(X, y)

def _normal_like(column, size=1000):
    """Draw `size` values from a normal with the mean and std of iris[column]."""
    observed = iris[column]
    return np.random.normal(loc=observed.mean(), scale=observed.std(), size=size)


# Simulate 1000 new observations; every measurement is drawn independently.
sepal_length = _normal_like('sepal_length')
sepal_width = _normal_like('sepal_width')
petal_width = _normal_like('petal_width')
petal_length = _normal_like('petal_length')

# The arrays are paired with iris.columns by position; zip stops after the
# fourth entry, so any further iris column is left out of newdata.
simulated = [sepal_length, sepal_width, petal_length, petal_width]
newdata = pd.DataFrame(dict(zip(iris.columns, simulated)))

# Classify the simulated points using the same two features the forest was
# trained on.
X_test = newdata[['petal_width', 'petal_length']]
y_predict = clf.predict(X_test)

cols = ['red', 'blue', 'green']

# One scatter layer per predicted class label (0, 1, 2), each in its own colour.
# NOTE(review): `plot` is not imported in the visible code -- presumably it
# comes from a pylab-style interactive namespace; confirm.
for i, colour in enumerate(cols):
    idx = y_predict == i
    plot(newdata['petal_length'][idx], newdata['petal_width'][idx], 'o', color=colour)