"""Random-forest classification of the iris data set from petal measurements.

The script:
  1. loads R's ``iris`` data set through the pandas/rpy bridge,
  2. fits a random forest on petal width and petal length,
  3. reports the out-of-bag score and the training-set confusion matrix,
  4. refits the forest in parallel, and
  5. classifies 1000 synthetic flowers and plots them coloured by predicted
     class.
"""
import numpy as np
import pandas as pd
import pandas.rpy.common as com
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Columns used as predictors, in the order they are fed to the classifier.
FEATURES = ['petal_width', 'petal_length']
# Number of synthetic observations to generate for the prediction plot.
N_SIMULATED = 1000


def _simulate(column, size=N_SIMULATED):
    """Draw ``size`` normal samples using the mean and std of ``iris[column]``."""
    values = iris[column]
    return np.random.normal(loc=values.mean(), scale=values.std(), size=size)


# NOTE(review): pandas.rpy only exists in old pandas releases -- confirm the
# target environment still ships it before running.
iris = com.load_data('iris')
# Normalise R-style names ("Sepal.Length") to snake_case ("sepal_length").
# A list comprehension (rather than map) is a real list on Python 2 and 3.
iris.columns = [name.lower().replace('.', '_') for name in iris.columns]
iris.head()  # only displays something in an interactive session

species = pd.Categorical.from_array(iris['species'])
X = iris[FEATURES]
# Integer class codes, one per row; these are the values the forest predicts.
y = species.labels

clf = RandomForestClassifier(n_estimators=500, oob_score=True).fit(X, y)

# Predict on the training data just as a sanity check.
y_predict = clf.predict(X)

# OOB score: accuracy estimated from the samples each tree did not see in its
# bootstrap sample, so it does not need a separate hold-out set.
print('OOB score: %.2f\n' % clf.oob_score_)

# Confusion matrix on training data: true labels vs. the forest's predictions.
# (The original compared ``y`` with itself, which is always a perfect diagonal.)
print('Confusion matrix:')
print(metrics.confusion_matrix(y, y_predict))

# Same model, fitted in parallel across 3 jobs.
clf = RandomForestClassifier(n_estimators=500, oob_score=True,
                             n_jobs=3).fit(X, y)

# Synthetic measurements: each feature is drawn independently from a normal
# distribution matching that feature's sample mean and standard deviation.
# The draw order is kept as in the original so seeded runs stay reproducible.
sepal_length = _simulate('sepal_length')
sepal_width = _simulate('sepal_width')
petal_width = _simulate('petal_width')
petal_length = _simulate('petal_length')

# Name the columns explicitly instead of zipping against iris.columns, so the
# result does not depend on the column order of the loaded frame.
newdata = pd.DataFrame({
    'sepal_length': sepal_length,
    'sepal_width': sepal_width,
    'petal_length': petal_length,
    'petal_width': petal_width,
})

X_test = newdata[FEATURES]
y_predict = clf.predict(X_test)

# One colour per predicted class code (0, 1, 2).
cols = ['red', 'blue', 'green']
for code, colour in enumerate(cols):
    idx = y_predict == code
    # NOTE(review): ``plot`` is not imported in this file; presumably the
    # script is run in a pylab / IPython session that provides it -- confirm.
    plot(newdata.loc[idx, 'petal_length'], newdata.loc[idx, 'petal_width'],
         'o', color=colour)