import pandas as pd
import numpy as np
import pandas.rpy.common as com
# Load the 'iris' dataset through the pandas <-> R bridge imported above.
iris = com.load_data('iris')
# Normalise the column names to lower-case snake_case (dots become
# underscores, e.g. 'Sepal.Length' -> 'sepal_length').  A list comprehension
# always produces a concrete list, whereas map() is a lazy iterator on
# Python 3.
iris.columns = [name.lower().replace('.', '_') for name in iris.columns]
# Show the first rows as a sanity check (notebook-style display expression).
iris.head()
| | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
1 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
2 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
3 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
4 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
5 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Encode the species names as integer class codes for scikit-learn.
# NOTE(review): `Categorical.from_array` / `.labels` are older pandas
# spellings; newer releases expose `pd.Categorical(...)` / `.codes` instead --
# confirm against the installed pandas version.
species = pd.Categorical.from_array(iris['species'])
# Only the two petal measurements are used as features.
X = iris[['petal_width', 'petal_length']]
y = species.labels
# oob_score=True asks the forest to also compute an out-of-bag estimate.
clf = RandomForestClassifier(n_estimators=500, oob_score=True).fit(X, y)
# predict on training data just for testing
y_predict = clf.predict(X)
# OOB score: accuracy measured on the samples each tree did not see in its
# bootstrap sample, i.e. a built-in estimate of generalisation accuracy.
# (Single-argument print(...) behaves identically on Python 2 and 3.)
print('OOB score: %.2f\n' % clf.oob_score_)
# confusion matrix on training data: true labels (rows) against the
# predictions computed above (columns).  Comparing `y` with itself would
# always give a perfect diagonal regardless of the model.
print('Confusion matrix:')
print(metrics.confusion_matrix(y, y_predict))
OOB score: 0.97 Confusion matrix: [[50 0 0] [ 0 50 0] [ 0 0 50]]
# Refit the same 500-tree forest, this time spreading the work over
# three parallel jobs via n_jobs.
clf = RandomForestClassifier(
    n_jobs=3,
    oob_score=True,
    n_estimators=500,
).fit(X, y)
# Simulate new observations: every feature is drawn independently from a
# normal distribution whose mean and standard deviation are taken from the
# corresponding iris column.
n_samples = 1000
# Listed in the order the draws are made (petal_width before petal_length),
# so a seeded RNG produces the same stream of values for each feature.
simulated_features = ['sepal_length', 'sepal_width', 'petal_width', 'petal_length']
# Keying each array by its own column name avoids depending on the order of
# iris.columns or on zip() truncating the non-numeric 'species' column.
newdata = pd.DataFrame({
    name: np.random.normal(loc=iris[name].mean(),
                           scale=iris[name].std(),
                           size=n_samples)
    for name in simulated_features
})
# Same two features, in the same order, as the classifier was trained on.
X_test = newdata[['petal_width', 'petal_length']]
y_predict = clf.predict(X_test)
# Scatter the simulated observations in petal_length / petal_width space,
# one colour per predicted class code (position in `cols` == class code).
cols = ['red', 'blue', 'green']
for i, colour in enumerate(cols):
    # Boolean mask of the rows predicted to belong to class i.
    idx = y_predict == i
    # NOTE(review): `plot` is not imported in the visible source; presumably
    # it comes from a pylab-style namespace (e.g. IPython %pylab) -- confirm.
    # `.loc` is the label/boolean indexer; it replaces the deprecated `.ix`.
    plot(newdata.loc[idx, 'petal_length'], newdata.loc[idx, 'petal_width'], 'o', color=colour)