#!/usr/bin/env python
# coding: utf-8
"""Train a random forest on ``train.csv`` and predict classes for ``test.csv``.

Both CSV files are read from the current working directory. They must
contain the feature columns listed in ``cols``; ``train.csv`` must also
contain the label column listed in ``colsRes``. The predictions are
appended to the ``test`` dataframe as a ``predictions`` column.
"""

# In[2]:

# First, import the dataset using pandas.
import pandas as pd

# Make sure you're in the right directory if using IPython!
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Ignore the first column; it is left over from how the data was split.
train.head()


# In[3]:

from sklearn.ensemble import RandomForestClassifier

# The random forest is fed NumPy arrays here, so pull the relevant columns
# out of the dataframes first.
cols = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']
colsRes = ['class']

# NOTE: DataFrame.as_matrix() was removed in pandas 1.0; selecting the
# columns and calling .to_numpy() is the supported replacement.
trainArr = train[cols].to_numpy()  # training features, one row per sample

# Selecting with a list of columns yields an (n, 1) column vector; flatten it
# to 1-D so scikit-learn does not emit its column-vector conversion warning.
trainRes = train[colsRes].to_numpy().ravel()  # training labels

## Training!
rf = RandomForestClassifier(n_estimators=100)  # 100 decision trees
rf.fit(trainArr, trainRes)  # fit the forest to the training data


# In[5]:

## Testing!
# Put the test features in the same format as the training features.
testArr = test[cols].to_numpy()
results = rf.predict(testArr)

# Add the predictions back to the dataframe so they can be compared
# side-by-side with the other columns.
test['predictions'] = results
test.head()