In [1]:
import multiprocessing
In [2]:
# Number of CPUs reported by the OS on this machine.
n_cpus = multiprocessing.cpu_count()
n_cpus
Out[2]:
16
In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
In [4]:
# Load the training set; the 'Id' column is used as the row index
# rather than being kept as a regular column.
train_df = pd.read_csv(
    'forest_train.csv',
    index_col='Id',
)
In [5]:
# Preview the first five rows (5 is also the default for head()).
train_df.head(5)
Out[5]:
Elevation Aspect Slope Horizontal_Distance_To_Hydrology Vertical_Distance_To_Hydrology Horizontal_Distance_To_Roadways Hillshade_9am Hillshade_Noon Hillshade_3pm Horizontal_Distance_To_Fire_Points ... Soil_Type32 Soil_Type33 Soil_Type34 Soil_Type35 Soil_Type36 Soil_Type37 Soil_Type38 Soil_Type39 Soil_Type40 Cover_Type
Id
1 2596 51 3 258 0 510 221 232 148 6279 ... 0 0 0 0 0 0 0 0 0 5
2 2590 56 2 212 -6 390 220 235 151 6225 ... 0 0 0 0 0 0 0 0 0 5
3 2804 139 9 268 65 3180 234 238 135 6121 ... 0 0 0 0 0 0 0 0 0 2
4 2785 155 18 242 118 3090 238 238 122 6211 ... 0 0 0 0 0 0 0 0 0 2
5 2595 45 2 153 -1 391 220 234 150 6172 ... 0 0 0 0 0 0 0 0 0 5

5 rows × 55 columns

In [6]:
# Random forest configuration, one setting per line for readability.
forest = RandomForestClassifier(
    n_estimators=1000,   # number of trees in the ensemble
    max_depth=15,        # cap on the depth of each tree
    oob_score=True,      # compute the out-of-bag score during fit
    n_jobs=-1,           # -1 asks scikit-learn to use all available cores
    random_state=42,     # fixed seed so repeated runs give the same forest
)
In [7]:
%%time
forest.fit(train_df.drop('Cover_Type', axis=1), train_df['Cover_Type'])
CPU times: user 29.2 s, sys: 386 ms, total: 29.6 s
Wall time: 5.48 s
Out[7]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=42, verbose=0, warm_start=False)
In [8]:
# Out-of-bag score of the fitted forest (populated because the estimator
# was created with oob_score=True).
oob_score = forest.oob_score_
oob_score
Out[8]:
0.84656084656084651
In [ ]: