#!/usr/bin/env python
# coding: utf-8

# ## Results

# ### Training
#
# We now run our RandomForest modeling software on our training set, described earlier,
# and derive a model, along with some measures of how good that model is.

# In[17]:

get_ipython().run_line_magic('pylab', 'inline')

# We pull in the training, validation and test sets created according to the scheme
# described in the data exploration lesson.

import pandas as pd

samtrain = pd.read_csv('../datasets/samsung/samtrain.csv')
samval = pd.read_csv('../datasets/samsung/samval.csv')
samtest = pd.read_csv('../datasets/samsung/samtest.csv')

# We use the random forest implementation from the scikit-learn collection of
# algorithms: sklearn.ensemble.RandomForestClassifier.
# For this we need to convert the target column ('activity') to integer values,
# because the scikit-learn classifier requires a numeric target.
# In R the column would have been a "factor" type and R would have used that
# directly for classification.
# We map activity to an integer according to
# laying = 1, sitting = 2, standing = 3, walk = 4, walkup = 5, walkdown = 6.
# The code is in the supporting library randomforests.py.

import randomforests as rf
samtrain = rf.remap_col(samtrain, 'activity')
samval = rf.remap_col(samval, 'activity')
samtest = rf.remap_col(samtest, 'activity')


# In[3]:

import sklearn.ensemble as sk

# Note: feature importances are computed automatically in current scikit-learn;
# older versions needed compute_importances=True here.
rfc = sk.RandomForestClassifier(n_estimators=500, oob_score=True)
train_data = samtrain[samtrain.columns[1:-2]]
train_truth = samtrain['activity']
model = rfc.fit(train_data, train_truth)


# In[4]:

# use the OOB (out-of-bag) score, which is an estimate of the accuracy of our model.
rfc.oob_score_


# In[5]:

### TRY THIS
# use the "feature importance" scores to see what the ten most important features are
fi = enumerate(rfc.feature_importances_)
cols = samtrain.columns
[(value, cols[i]) for (i, value) in fi if value > 0.04]
## Change the threshold 0.04, which we picked empirically to give us ten variables.
## Try running this code after moving the threshold up and down so you get more or fewer variables.
## Do you see how this might be useful in refining the model?
## Note that fi is an iterator and is exhausted by the list comprehension,
## so re-run the whole cell each time you change the threshold.
## Here is the code in case you mess up the line above:
## [(value, cols[i]) for (i, value) in fi if value > 0.04]


# We use the predict() function of our model on our validation set and our test set,
# and get the following results from our analysis of errors in the predictions.

# In[6]:

# The pandas data frame has a spurious index column in position 0, hence we start at column 1.
# We do not use the subject column, and the target ('activity') is the last column,
# so the slice 1:-2 drops the last two columns as well.
val_data = samval[samval.columns[1:-2]]
val_truth = samval['activity']
val_pred = rfc.predict(val_data)

test_data = samtest[samtest.columns[1:-2]]
test_truth = samtest['activity']
test_pred = rfc.predict(test_data)


# #### Prediction Errors and Computed Error Measures

# In[7]:

print("mean accuracy score for validation set = %f" % (rfc.score(val_data, val_truth)))
print("mean accuracy score for test set = %f" % (rfc.score(test_data, test_truth)))


# In[8]:

# use the confusion matrix to see how observations were misclassified as other activities
# See [5]
import sklearn.metrics as skm
test_cm = skm.confusion_matrix(test_truth, test_pred)


# In[9]:

# visualize the confusion matrix

# In[10]:

import pylab as pl
pl.matshow(test_cm)
pl.title('Confusion matrix for test data')
pl.colorbar()
pl.show()
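# As an optional refinement (a minimal sketch, not part of the original analysis),
# we can label the confusion matrix axes with the activity names. This reuses test_cm
# from the cell above and assumes the integer labels 1..6 follow the
# laying/sitting/standing/walk/walkup/walkdown mapping given earlier, in which case
# confusion_matrix orders the rows and columns the same way.

# In[ ]:

import pylab as pl

activity_names = ['laying', 'sitting', 'standing', 'walk', 'walkup', 'walkdown']
pl.matshow(test_cm)
pl.title('Confusion matrix for test data')
pl.colorbar()
# place the activity names on both axes so misclassifications are easy to read off
pl.xticks(range(len(activity_names)), activity_names, rotation=45)
pl.yticks(range(len(activity_names)), activity_names)
pl.show()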
# In[11]:

# compute a number of other common measures of prediction goodness

# We now compute some commonly used measures of prediction "goodness".
# For more detail on these measures see [6], [7], [8], [9].

# In[12]:

# Accuracy
print("Accuracy = %f" % (skm.accuracy_score(test_truth, test_pred)))


# In[13]:

# Precision
# Precision, recall and F1 are defined per class; since this is a multiclass
# problem we report the average over classes, weighted by class frequency.
print("Precision = %f" % (skm.precision_score(test_truth, test_pred, average='weighted')))


# In[14]:

# Recall
print("Recall = %f" % (skm.recall_score(test_truth, test_pred, average='weighted')))


# In[15]:

# F1 Score
print("F1 score = %f" % (skm.f1_score(test_truth, test_pred, average='weighted')))


# ### Exercise
#
# Instead of using domain knowledge to reduce the number of variables, use Random Forests
# directly on the full set of columns. Then use variable importance to sort the variables;
# a sketch of this approach appears at the end of the notebook.
#
# Compare the model you get with the model you got from using domain knowledge.
# You can short-circuit the data cleanup process as well by simply renaming the variables
# x1, x2, ..., xn, y, where y is the dependent variable 'activity'.
#
# Now look at the new Random Forest model you get. It is likely to be more accurate at
# prediction than the one above, but it is a black box model, with no meaning attached
# to the variables.
#
# * What insights does it give you?
# * Which model do you prefer?
# * Why?
# * Is this an absolute preference or might it change?
# * What might cause it to change?

# ## References
#
# [1] Original dataset as R data: https://spark-public.s3.amazonaws.com/dataanalysis/samsungData.rda
# [2] Human Activity Recognition Using Smartphones: http://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones
# [3] Android Developer Reference: http://developer.android.com/reference/android/hardware/Sensor.html
# [4] Random Forests: http://en.wikipedia.org/wiki/Random_forest
# [5] Confusion matrix: http://en.wikipedia.org/wiki/Confusion_matrix
# [6] Mean Accuracy: http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=1054102&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D1054102
# [7] Precision: http://en.wikipedia.org/wiki/Precision_and_recall
# [8] Recall: http://en.wikipedia.org/wiki/Precision_and_recall
# [9] F Measure: http://en.wikipedia.org/wiki/Precision_and_recall

# In[16]:

from IPython.core.display import HTML
def css_styling():
    styles = open("../styles/custom.css", "r").read()
    return HTML(styles)
css_styling()
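# Below is a minimal sketch for the exercise above, assuming the full-width dataset
# is available as a single CSV containing all feature columns plus a 'subject' column
# and the 'activity' target. The file name 'samsungdata.csv' is hypothetical --
# substitute the path to your own copy of the full dataset, and drop any spurious
# index column, as in the cells above, if your CSV has one.

# In[ ]:

import pandas as pd
import sklearn.ensemble as sk
import randomforests as rf

samall = pd.read_csv('../datasets/samsung/samsungdata.csv')  # hypothetical path
samall = rf.remap_col(samall, 'activity')

# keep every column except the subject identifier and the target
feature_cols = [c for c in samall.columns if c not in ('subject', 'activity')]
all_data = samall[feature_cols]
all_truth = samall['activity']

rfc_full = sk.RandomForestClassifier(n_estimators=500, oob_score=True)
rfc_full.fit(all_data, all_truth)

# sort the variables by importance, most important first, and show the top ten
ranked = sorted(zip(rfc_full.feature_importances_, feature_cols), reverse=True)
for value, name in ranked[:10]:
    print("%f  %s" % (value, name))
print("OOB score for the full-variable model = %f" % rfc_full.oob_score_)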