Executive Summary

We trained various models to predict activity based on data from wearable activity monitors. The data come from the Human Activity Recognition dataset (http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har), collected over 8 hours of activities from 4 different subjects.

The following models were fit to the training data, with cross-validation used to estimate the out-of-sample error:

  • Linear Discriminant Analysis (LDA)
  • Decision Tree
  • Random Forest
  • Gradient Boosting Machine (GBM)

The Random Forest and Gradient Boosting Machine performed best, with validation set accuracies of 99.3% and 99.1% respectively.

Importing and pre-processing

Loading libraries

library(lattice)
library(ggplot2)
library(caret)
library(RANN)
library(e1071)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(data.table)
# detecting the number of cores and creating a cluster
# (leaving a few cores free for the rest of the system)
numCores <- detectCores()
cl <- makeCluster(numCores - 4)
# setwd("/Users/andrewlau/Google Drive/Coursera/JHU Data Specialisation/Course 8/")

Import data

The first 7 columns are the row ID, user name, timestamps and window indicators, which are not useful for prediction and are therefore removed.

# download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", "/Users/andrewlau/Google Drive/Coursera/JHU Data Specialisation/Course 8/pml-training.csv")
# download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", "/Users/andrewlau/Google Drive/Coursera/JHU Data Specialisation/Course 8/pml-testing.csv")
# fread() imports much faster than read.csv(), especially when using multiple threads
train <- fread("/Users/andrewlau/Google Drive/Programming/Coursera/JHU Data Specialisation/Course 8/Assignment/pml-training.csv", nThread=numCores)
test <- fread("/Users/andrewlau/Google Drive/Programming/Coursera/JHU Data Specialisation/Course 8/Assignment/pml-testing.csv", nThread=numCores)
train <- as.data.frame(train)
test <- as.data.frame(test)

#check structure of data
# str(train)
# str(test)

# removing non-predictive identifier columns from both datasets
dropCols <- c("V1", "user_name", "raw_timestamp_part_1", "raw_timestamp_part_2",
              "cvtd_timestamp", "new_window", "num_window")
for (v in dropCols) {
    train[[v]] <- NULL
    test[[v]] <- NULL
}

Data pre-processing

Some data exploration is performed to get a feel for the data and how it is structured, and pre-processing is applied to help the models find signal in the data:

  • Check the distribution amongst classes
  • Check the class of each column
  • Convert predictors that are clearly meant to be numeric from factor to numeric
  • Correct the column classes in the test set so that they match the training set
  • Remove predictors with near-zero variance

trainPP <- train
testPP <- test

#distribution amongst classes
barplot(table(trainPP$classe))

#check each column's class
# sapply(trainPP,class)
# sapply(testPP,class)

#some numeric variables appear to be stored as factors
#converting factors with more than 25 levels to numeric
for(i in names(trainPP)){
    if(class(trainPP[[i]]) == "factor"){
        if(length(levels(trainPP[[i]])) > 25){
            trainPP[[i]] <- as.numeric(as.character(trainPP[[i]]))
            testPP[[i]] <- as.numeric(as.character(testPP[[i]]))    
        }
    }
}

#make all column classes in test the same as in train;
#the test set has logical columns of all NAs, convert these to numeric
for(i in 1:dim(testPP)[2]){
    if(class(testPP[,i]) == "logical" && all(is.na(testPP[,i]))){
            testPP[,i] <- as.numeric(rep(NA,dim(testPP)[1]))
    }
}

#convert all classes in test to match train
for(i in 1:dim(trainPP)[2]){
    if(class(trainPP[, i]) == "factor"){
        # use the training set's levels so the factor encodings line up
        testPP[, i] <- factor(testPP[, i], levels = levels(trainPP[, i]))
    }
    else{
        class(testPP[, i]) <- class(trainPP[, i])
    }
}

#check classes in trainPP and testPP match
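# a quick sketch of such a check (an illustration, not in the original run):
# compare column classes over the shared columns only, since the training
# set ends with classe and the test set with problem_id
# common <- intersect(names(trainPP), names(testPP))
# identical(sapply(trainPP[common], class), sapply(testPP[common], class))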

#removing near zero variance factors
nzv <- nearZeroVar(trainPP)
trainPP <- trainPP[, -nzv]
testPP <- testPP[, -nzv]

# summary-statistic columns in which NaN values appear
# (the misspellings, e.g. "picth", are the dataset's actual column names)
vars <- c("kurtosis_roll_belt", "kurtosis_picth_belt", "skewness_roll_belt",
          "skewness_roll_belt.1", "max_yaw_belt", "min_yaw_belt",
          "kurtosis_roll_arm", "kurtosis_picth_arm", "kurtosis_yaw_arm",
          "skewness_roll_arm", "skewness_pitch_arm", "skewness_yaw_arm",
          "kurtosis_roll_dumbbell", "kurtosis_picth_dumbbell",
          "skewness_roll_dumbbell", "max_yaw_dumbbell", "min_yaw_dumbbell",
          "kurtosis_roll_forearm", "kurtosis_picth_forearm",
          "skewness_roll_forearm", "skewness_pitch_forearm",
          "max_yaw_forearm", "min_yaw_forearm")

# convert NaN values to NA so that they are handled consistently downstream
for(i in vars){
    trainPP[[i]][is.nan(trainPP[[i]])] <- NA
    testPP[[i]][is.nan(testPP[[i]])] <- NA
}


#final dimensions of the data
dim(testPP)
## [1]  20 118
dim(trainPP)
## [1] 19622   118
# str(trainPP
#     , list.len = dim(trainPP)[2]
#     )
# str(testPP
#     , list.len = dim(testPP)[2]
#     )

Data Splitting

The training dataset is split into a trainTrain set (60%) and a trainValid set (40%).

set.seed(1234)

trainIndex <- createDataPartition(train$classe, p = .6, 
                                  list = FALSE, 
                                  times = 1)
trainTrain <- train[ trainIndex,]
trainValid  <- train[-trainIndex,]

trainPPTrain <- trainPP[ trainIndex,]
trainPPValid  <- trainPP[-trainIndex,]

Modelling

6-fold cross-validation is used here to estimate the out-of-sample error. “An Introduction to Statistical Learning” recommends values of k between 5 and 10 as a good bias-variance trade-off.

fitControl <- trainControl(## 6-fold CV
                           method = "repeatedcv"
                           ,number = 6
                           ,repeats = 1
                            )

LDA

set.seed(825)

# multicore processing can be enabled by registering the cluster here
# registerDoParallel(cl)
# find out how many cores are being used
getDoParWorkers()
## [1] 1
ldaFit1 <- train(classe ~ ., data = trainPPTrain[,colSums(is.na(trainPPTrain))==0]
                 ,method = "lda"
                 ,trControl = fitControl
                 ,na.action = na.pass
                 )
# stopCluster(cl)
# registerDoSEQ()
#accuracy on trainTrain
ldaFit1
## Linear Discriminant Analysis 
## 
## 11776 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 1 times) 
## Summary of sample sizes: 9815, 9814, 9812, 9813, 9813, 9813, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7025301  0.6233695

The LDA produces a model with an estimated out-of-sample accuracy of 70.3%.

Decision Tree

treeGrid <- expand.grid(cp = seq(from = 1e-37, to = 0.1, length.out = 10))
set.seed(78)
# find out how many cores are being used
getDoParWorkers()
## [1] 1
treeFit1 <- train(classe ~ ., data = trainPPTrain
                 ,method = "rpart"
                 ,tuneGrid = treeGrid
                 ,trControl = fitControl
                 ,na.action = na.pass
                 # ,preProcess = c("knnImpute")
                 )
#hyperparameters
plot(treeFit1)

#variable importance
varImp(treeFit1)
## rpart variable importance
## 
##   only 20 most important variables shown (out of 117)
## 
##                   Overall
## roll_belt          100.00
## yaw_belt            84.90
## pitch_forearm       81.45
## pitch_belt          73.30
## roll_forearm        69.99
## accel_dumbbell_y    63.60
## magnet_dumbbell_z   62.82
## magnet_dumbbell_y   53.39
## roll_dumbbell       47.57
## magnet_forearm_z    38.69
## magnet_belt_z       37.97
## yaw_arm             36.64
## magnet_dumbbell_x   36.42
## accel_belt_z        34.59
## accel_dumbbell_z    33.68
## magnet_belt_y       33.68
## accel_forearm_x     32.28
## accel_arm_x         27.79
## magnet_forearm_y    27.13
## gyros_belt_z        24.08
#accuracy on trainTrain
treeFit1
## CART 
## 
## 11776 samples
##   117 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 1 times) 
## Summary of sample sizes: 9814, 9813, 9813, 9812, 9814, 9814, ... 
## Resampling results across tuning parameters:
## 
##   cp            Accuracy   Kappa    
##   1.000000e-37  0.9155901  0.8932205
##   1.111111e-02  0.7225744  0.6493470
##   2.222222e-02  0.5915400  0.4795246
##   3.333333e-02  0.5295542  0.3924347
##   4.444444e-02  0.5039913  0.3586148
##   5.555556e-02  0.4533767  0.2705709
##   6.666667e-02  0.3676967  0.1269362
##   7.777778e-02  0.3676967  0.1269362
##   8.888889e-02  0.3676967  0.1269362
##   1.000000e-01  0.3676967  0.1269362
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 1e-37.

A decision tree gives an estimated out-of-sample accuracy of 91.6%, which is an improvement over the LDA model.

Random Forest

Random forests can improve upon the prediction accuracy of decision trees by bootstrapping and aggregating (bagging) many decision trees, which decreases the variance of the predictions and hence the overall error. At each split, a random subset of the predictors (mtry) is considered, which “decorrelates” the trees and reduces the variance further. Let us examine how it performs versus the decision tree fit above.
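
As a point of reference, bagging is the special case of a random forest in which mtry equals the number of predictors, so that every split considers all of them and the trees remain correlated. A minimal sketch (not run here; the names noNA and bagFit are illustrative, using the same NA-free columns as the fit below):

# noNA <- trainPPTrain[, colSums(is.na(trainPPTrain)) == 0]
# bagFit <- train(classe ~ ., data = noNA, method = "rf",
#                 tuneGrid = data.frame(mtry = ncol(noNA) - 1),  # all predictors
#                 trControl = fitControl)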

# Find out how many cores are being used
getDoParWorkers()
## [1] 1
rforestGrid <-  expand.grid(mtry = c(5,10,15,20,35)
                         )

set.seed(934)

#random forest doesn't accept NA predictors, so columns containing NAs are removed
rforestFit1 <- train(classe ~ ., data = trainPPTrain[,colSums(is.na(trainPPTrain))==0]
                 ,method = "rf"
                 ,tuneGrid = rforestGrid
                 ,trControl = fitControl
                 ,na.action = na.pass
                 )
#hyperparameters
plot(rforestFit1)

#variable importance
varImp(rforestFit1)
## rf variable importance
## 
##   only 20 most important variables shown (out of 52)
## 
##                      Overall
## roll_belt             100.00
## yaw_belt               67.36
## pitch_forearm          61.20
## magnet_dumbbell_z      52.65
## magnet_dumbbell_y      51.04
## pitch_belt             50.48
## roll_forearm           42.76
## magnet_dumbbell_x      30.76
## accel_dumbbell_y       27.17
## magnet_belt_z          27.02
## roll_dumbbell          26.50
## accel_belt_z           24.10
## magnet_belt_y          23.96
## accel_dumbbell_z       19.84
## roll_arm               19.10
## accel_forearm_x        18.17
## magnet_forearm_z       16.71
## gyros_belt_z           15.73
## total_accel_dumbbell   15.28
## magnet_belt_x          14.14
#accuracy on trainTrain
rforestFit1
## Random Forest 
## 
## 11776 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 1 times) 
## Summary of sample sizes: 9812, 9815, 9814, 9813, 9814, 9812, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    5    0.9904047  0.9878605
##   10    0.9909993  0.9886131
##   15    0.9909144  0.9885057
##   20    0.9901497  0.9875381
##   35    0.9868385  0.9833490
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

The Random Forest achieves an estimated out-of-sample accuracy of 99.1% under cross-validation (and 99.3% on the validation set, as shown in the Results section). This is a significant improvement over the decision tree.

Gradient Boosting Machine

Here we fit a Gradient Boosting Machine.

fitControl <- trainControl(## 3-fold CV to keep the GBM's run time manageable
                           method = "repeatedcv"
                           ,number = 3
                           ,repeats = 1
                            )
gbmGrid <-  expand.grid(interaction.depth = c(6,8), 
                        n.trees = 1500, 
                        shrinkage = 0.01,
                        # minimum node sizes of 2.5% and 5% of the full training data
                        n.minobsinnode = c(ceiling(0.025*dim(train)[1]),ceiling(0.05*dim(train)[1])))

set.seed(825)
# Find out how many cores are being used
getDoParWorkers()
gbmFit1 <- train(classe ~ ., data = trainPPTrain
                 ,method = "gbm"
                 ,tuneGrid = gbmGrid
                 ,trControl = fitControl
                 ,bag.fraction = 0.7
                 ,verbose = TRUE
                 ,na.action = na.pass
                 )
#hyperparameters
plot(gbmFit1)

#variable importance
summary(gbmFit1)
#accuracy on trainTrain
gbmFit1
## Stochastic Gradient Boosting 
## 
## 11776 samples
##   117 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times) 
## Summary of sample sizes: 7851, 7851, 7850 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.minobsinnode  Accuracy   Kappa    
##   6                  491             0.9776660  0.9717493
##   6                  982             0.9216199  0.9008157
##   8                  491             0.9825063  0.9778719
##   8                  982             0.9223840  0.9017929
## 
## Tuning parameter 'n.trees' was held constant at a value of 1500
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 1500,
##  interaction.depth = 8, shrinkage = 0.01 and n.minobsinnode = 491.

The Gradient Boosting Machine gives an estimated out-of-sample accuracy of 99.1%. This is similar to the estimated accuracy of the random forest model, but comes at the cost of far greater computational run time. This level of accuracy is sufficient; if greater prediction accuracy were needed, the next thing to try would be stacking models together, as sketched below.
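
For illustration, stacking could be sketched as follows: train a meta-model on the base models' predictions (here using the validation-set prediction objects computed in the Results section below). The names stackTrain and stackFit are hypothetical, and a proper implementation would hold out a separate set to evaluate the stack rather than reusing the validation set:

# stackTrain <- data.frame(lda  = ldaPredict1_trainPPValid,
#                          tree = treePredict1_trainPPValid,
#                          rf   = rforestPredict1_trainPPValid,
#                          gbm  = gbmPredict1_trainPPValid,
#                          classe = trainPPValid$classe)
# stackFit <- train(classe ~ ., data = stackTrain, method = "rf",
#                   trControl = fitControl)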

Results

The validation accuracies are calculated below. For our final model, we would use the random forest: its prediction accuracy is similar to, and slightly better than, the GBM's, with far less computational run time.
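
The run-time comparison can be checked from the timings caret records with each fit (train objects store them in $times; a quick sketch, not part of the original output):

# rforestFit1$times$everything["elapsed"]
# gbmFit1$times$everything["elapsed"]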

# LDA
# prediction on validation
ldaPredict1_trainPPValid <- predict(ldaFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
ldaPredict1_testPP <- predict(ldaFit1, newdata = testPP, na.action=na.pass)

# decision tree
# prediction on validation
treePredict1_trainPPValid <- predict(treeFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
treePredict1_testPP <- predict(treeFit1, newdata = testPP, na.action=na.pass)

# random forest
# prediction on validation
rforestPredict1_trainPPValid <- predict(rforestFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
rforestPredict1_testPP <- predict(rforestFit1, newdata = testPP, na.action=na.pass)

# GBM
# prediction on validation
gbmPredict1_trainPPValid <- predict(gbmFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
gbmPredict1_testPP <- predict(gbmFit1, newdata = testPP, na.action=na.pass)

results <- data.frame("model" = c("LDA", "Decision Tree", "Random Forest", "Gradient Boosting Machine"), "Validation set accuracy" = c(
        confusionMatrix(data=ldaPredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1]
        ,confusionMatrix(data=treePredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1]
        ,confusionMatrix(data=rforestPredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1] 
        ,confusionMatrix(data=gbmPredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1]
        ))
results
data.frame("LDA" = ldaPredict1_testPP, "Decision Tree" = treePredict1_testPP
           , "Random Forest" = rforestPredict1_testPP ,"GBM"=gbmPredict1_testPP)