We trained various models to predict activity based on data from wearable activity monitors. The data come from the Human Activity Recognition dataset (http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har), collected over 8 hours of activities from 4 different subjects.
The following models were fit to the training data, with cross-validation used to estimate the out-of-sample error: linear discriminant analysis (LDA), a decision tree, a random forest and a gradient boosting machine (GBM).
The random forest and the gradient boosting machine performed best, with validation set accuracies of 99.3% and 99.1% respectively.
library(lattice)
library(ggplot2)
library(caret)
library(RANN)
library(e1071)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(parallel)
library(data.table)
# detecting the number of cores and creating a cluster
numCores <- detectCores()
cl <- makeCluster(max(1, numCores - 4)) # leave some cores free; guard against machines with 4 or fewer cores
# setwd("/Users/andrewlau/Google Drive/Coursera/JHU Data Specialisation/Course 8/")
The first 7 columns are the row index, user name, timestamps and window indicators, which are not useful for prediction and have therefore been removed.
# download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", "/Users/andrewlau/Google Drive/Coursera/JHU Data Specialisation/Course 8/pml-training.csv")
# download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", "/Users/andrewlau/Google Drive/Coursera/JHU Data Specialisation/Course 8/pml-testing.csv")
# fread() imports much faster than read.csv(), especially when using multiple cores
train <- fread("/Users/andrewlau/Google Drive/Programming/Coursera/JHU Data Specialisation/Course 8/Assignment/pml-training.csv", nThread=numCores)
test <- fread("/Users/andrewlau/Google Drive/Programming/Coursera/JHU Data Specialisation/Course 8/Assignment/pml-testing.csv", nThread=numCores)
train <- as.data.frame(train)
test <- as.data.frame(test)
#check structure of data
# str(train)
# str(test)
# removing identifier and timestamp columns that should not be used as predictors
train$V1 <- NULL
train$user_name <- NULL
train$raw_timestamp_part_1 <- NULL
train$raw_timestamp_part_2 <- NULL
train$cvtd_timestamp <- NULL
train$new_window <- NULL
train$num_window <- NULL
test$V1 <- NULL
test$user_name <- NULL
test$raw_timestamp_part_1 <- NULL
test$raw_timestamp_part_2 <- NULL
test$cvtd_timestamp <- NULL
test$new_window <- NULL
test$num_window <- NULL
Some data exploration is performed to get a better feel for the data and its structure, and pre-processing is applied to help the models find signal in the data.
trainPP <- train
testPP <- test
# distribution of the outcome amongst the classes
barplot(table(trainPP$classe))
#check each column's class
# sapply(trainPP,class)
# sapply(testPP,class)
# some numeric variables appear to be saved as factors
# converting factors with more than 25 levels to numeric
for(i in names(trainPP)){
  if(class(trainPP[[i]]) == "factor"){
    if(length(levels(trainPP[[i]])) > 25){
      trainPP[[i]] <- as.numeric(as.character(trainPP[[i]]))
      testPP[[i]] <- as.numeric(as.character(testPP[[i]]))
    }
  }
}
# make the column classes in test match those in train
# the test set has all-NA logical columns; convert these to numeric
for(i in 1:dim(testPP)[2]){
  if(class(testPP[, i]) == "logical" & all(is.na(testPP[, i]))){
    testPP[, i] <- as.numeric(rep(NA, dim(testPP)[1]))
  }
}
#convert all classes in test to match train
for(i in 1:dim(trainPP)[2]){
  if(class(trainPP[, i]) == "factor"){
    testPP[, i] <- as.factor(testPP[, i])
  } else {
    class(testPP[, i]) <- class(trainPP[, i])
  }
}
#check classes in trainPP and testPP match
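# (illustrative sketch, not part of the original analysis) one way to perform this check
# on the columns shared by both data sets; common_cols is a new helper name
common_cols <- intersect(names(trainPP), names(testPP))
all(mapply(function(a, b) identical(class(a), class(b)),
           trainPP[common_cols], testPP[common_cols]))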
# removing near-zero-variance predictors
nzv <- nearZeroVar(trainPP)
trainPP <- trainPP[, -nzv]
testPP <- testPP[, -nzv]
vars <- c("kurtosis_roll_belt" , "kurtosis_picth_belt" , "skewness_roll_belt" , "skewness_roll_belt.1" , "max_yaw_belt" , "min_yaw_belt" , "kurtosis_roll_arm" , "kurtosis_picth_arm" , "kurtosis_yaw_arm" , "skewness_roll_arm" , "skewness_pitch_arm" , "skewness_yaw_arm" , "kurtosis_roll_dumbbell" , "kurtosis_picth_dumbbell" , "skewness_roll_dumbbell" , "max_yaw_dumbbell" , "min_yaw_dumbbell" , "kurtosis_roll_forearm" , "kurtosis_picth_forearm" , "skewness_roll_forearm" , "skewness_pitch_forearm" , "max_yaw_forearm" , "min_yaw_forearm")
# replace NaN with NA in the summary-statistic columns listed above
for(i in vars){
  trainPP[[i]][is.nan(trainPP[[i]])] <- NA
  testPP[[i]][is.nan(testPP[[i]])] <- NA
}
#final dimensions of the data
dim(testPP)
## [1] 20 118
dim(trainPP)
## [1] 19622 118
# str(trainPP
# , list.len = dim(trainPP)[2]
# )
# str(testPP
# , list.len = dim(testPP)[2]
# )
The training dataset is split into a trainTrain set (60%) and a trainValid validation set (40%).
set.seed(1234)
trainIndex <- createDataPartition(train$classe, p = .6,
list = FALSE,
times = 1)
trainTrain <- train[ trainIndex,]
trainValid <- train[-trainIndex,]
trainPPTrain <- trainPP[ trainIndex,]
trainPPValid <- trainPP[-trainIndex,]
6-fold cross-validation is used here to estimate the out-of-sample error; typically around 5 to 10 folds are recommended (e.g. in “An Introduction to Statistical Learning”).
fitControl <- trainControl(## 6-fold CV
method = "repeatedcv"
,number = 6
,repeats = 1
)
set.seed(825)
# to use multicore processing, register the cluster here (left commented out, so caret runs on a single worker below)
# registerDoParallel(cl)
# find out how many cores are being used
getDoParWorkers()
## [1] 1
ldaFit1 <- train(classe ~ ., data = trainPPTrain[,colSums(is.na(trainPPTrain))==0]
,method = "lda"
,trControl = fitControl
,na.action = na.pass
)
# stopCluster(cl)
# registerDoSEQ()
#accuracy on trainTrain
ldaFit1
## Linear Discriminant Analysis
##
## 11776 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 1 times)
## Summary of sample sizes: 9815, 9814, 9812, 9813, 9813, 9813, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7025301 0.6233695
The LDA produces a model with an estimated out-of-sample accuracy of 70.3%.
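The cross-validated accuracy printed above can also be read directly off the caret fit object; a small sketch using the fitted ldaFit1:
# resampled (6-fold CV) accuracy stored in the fit's results table
round(ldaFit1$results$Accuracy, 3)  # 0.703, matching the summary above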
treeGrid <- expand.grid(cp = seq(from = 1e-37, to = 0.1, length.out = 10))
set.seed(78)
# find out how many cores are being used
getDoParWorkers()
## [1] 1
treeFit1 <- train(classe ~ ., data = trainPPTrain
,method = "rpart"
,tuneGrid = treeGrid
,trControl = fitControl
,na.action = na.pass
# ,preProcess = c("knnImpute")
)
#hyperparameters
plot(treeFit1)
#variable importance
varImp(treeFit1)
## rpart variable importance
##
## only 20 most important variables shown (out of 117)
##
## Overall
## roll_belt 100.00
## yaw_belt 84.90
## pitch_forearm 81.45
## pitch_belt 73.30
## roll_forearm 69.99
## accel_dumbbell_y 63.60
## magnet_dumbbell_z 62.82
## magnet_dumbbell_y 53.39
## roll_dumbbell 47.57
## magnet_forearm_z 38.69
## magnet_belt_z 37.97
## yaw_arm 36.64
## magnet_dumbbell_x 36.42
## accel_belt_z 34.59
## accel_dumbbell_z 33.68
## magnet_belt_y 33.68
## accel_forearm_x 32.28
## accel_arm_x 27.79
## magnet_forearm_y 27.13
## gyros_belt_z 24.08
#accuracy on trainTrain
treeFit1
## CART
##
## 11776 samples
## 117 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 1 times)
## Summary of sample sizes: 9814, 9813, 9813, 9812, 9814, 9814, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 1.000000e-37 0.9155901 0.8932205
## 1.111111e-02 0.7225744 0.6493470
## 2.222222e-02 0.5915400 0.4795246
## 3.333333e-02 0.5295542 0.3924347
## 4.444444e-02 0.5039913 0.3586148
## 5.555556e-02 0.4533767 0.2705709
## 6.666667e-02 0.3676967 0.1269362
## 7.777778e-02 0.3676967 0.1269362
## 8.888889e-02 0.3676967 0.1269362
## 1.000000e-01 0.3676967 0.1269362
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 1e-37.
A decision tree gives an estimated out-of-sample accuracy of 91.6%, which is an improvement over the LDA model.
Random forests can improve upon the prediction accuracy of decision trees by bootstrapping and aggregating (bagging) many decision trees, which decreases the variance of the predictions and hence the overall error. At each split, only a random subset of the predictors (the mtry parameter tuned below) is considered, which “decorrelates” the trees and reduces the variance further. Let us examine how it performs versus the decision tree fit above.
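To illustrate the variance-reduction idea outside of caret, here is a minimal sketch using the randomForest package directly (an assumption: it is pulled in by caret’s "rf" method but is never loaded explicitly in this report); the out-of-bag error falls and stabilises as more trees are aggregated. The object names rfData and rfIllustration are new helpers.
library(randomForest)
# same complete-case columns as the caret fit below
rfData <- trainPPTrain[, colSums(is.na(trainPPTrain)) == 0]
rfData$classe <- as.factor(rfData$classe)
rfIllustration <- randomForest(classe ~ ., data = rfData, ntree = 100, mtry = 10)
plot(rfIllustration)  # OOB and per-class error rates versus number of trees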
# Find out how many cores are being used
getDoParWorkers()
## [1] 1
rforestGrid <- expand.grid(mtry = c(5, 10, 15, 20, 35))
set.seed(934)
# random forest doesn't work with NA predictors, so columns containing NAs are removed
rforestFit1 <- train(classe ~ ., data = trainPPTrain[,colSums(is.na(trainPPTrain))==0]
,method = "rf"
,tuneGrid = rforestGrid
,trControl = fitControl
,na.action = na.pass
)
#hyperparameters
plot(rforestFit1)
#variable importance
varImp(rforestFit1)
## rf variable importance
##
## only 20 most important variables shown (out of 52)
##
## Overall
## roll_belt 100.00
## yaw_belt 67.36
## pitch_forearm 61.20
## magnet_dumbbell_z 52.65
## magnet_dumbbell_y 51.04
## pitch_belt 50.48
## roll_forearm 42.76
## magnet_dumbbell_x 30.76
## accel_dumbbell_y 27.17
## magnet_belt_z 27.02
## roll_dumbbell 26.50
## accel_belt_z 24.10
## magnet_belt_y 23.96
## accel_dumbbell_z 19.84
## roll_arm 19.10
## accel_forearm_x 18.17
## magnet_forearm_z 16.71
## gyros_belt_z 15.73
## total_accel_dumbbell 15.28
## magnet_belt_x 14.14
#accuracy on trainTrain
rforestFit1
## Random Forest
##
## 11776 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (6 fold, repeated 1 times)
## Summary of sample sizes: 9812, 9815, 9814, 9813, 9814, 9812, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 5 0.9904047 0.9878605
## 10 0.9909993 0.9886131
## 15 0.9909144 0.9885057
## 20 0.9901497 0.9875381
## 35 0.9868385 0.9833490
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
The random forest gives an estimated out-of-sample accuracy of 99.1% under cross-validation (and a similar accuracy on the validation set, computed below). This is a significant improvement over the decision tree.
Here we fit a Gradient Boosting Machine.
fitControl <- trainControl(## 3-fold CV
method = "repeatedcv"
,number = 3
## repeated once
,repeats = 1
)
gbmGrid <- expand.grid(interaction.depth = c(6,8),
n.trees = 1500,
shrinkage = 0.01,
n.minobsinnode = c(ceiling(0.025*dim(train)[1]),ceiling(0.05*dim(train)[1])))
set.seed(825)
# Find out how many cores are being used
getDoParWorkers()
gbmFit1 <- train(classe ~ ., data = trainPPTrain
,method = "gbm"
,tuneGrid = gbmGrid
,trControl = fitControl
,bag.fraction = 0.7
,verbose = TRUE
,na.action = na.pass
)
#hyperparameters
plot(gbmFit1)
#variable importance
summary(gbmFit1)
#accuracy on trainTrain
gbmFit1
## Stochastic Gradient Boosting
##
## 11776 samples
## 117 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times)
## Summary of sample sizes: 7851, 7851, 7850
## Resampling results across tuning parameters:
##
## interaction.depth n.minobsinnode Accuracy Kappa
## 6 491 0.9776660 0.9717493
## 6 982 0.9216199 0.9008157
## 8 491 0.9825063 0.9778719
## 8 982 0.9223840 0.9017929
##
## Tuning parameter 'n.trees' was held constant at a value of 1500
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.01
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 1500,
## interaction.depth = 8, shrinkage = 0.01 and n.minobsinnode = 491.
The gradient boosting machine also achieves an estimated out-of-sample accuracy of around 99.1%. This is similar to the accuracy of the random forest model, but comes at the cost of far greater computational run time. If we wanted greater prediction accuracy, the next thing to try would be stacking the models together (sketched below), but the current level of accuracy is sufficient.
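A minimal sketch of what stacking could look like here (illustrative only, not run as part of this analysis): each model’s validation-set predictions become features for a meta-model, which is then fit with caret. The object names stackData and stackFit are new.
stackData <- data.frame(
lda = predict(ldaFit1, newdata = trainPPValid, na.action = na.pass)
,tree = predict(treeFit1, newdata = trainPPValid, na.action = na.pass)
,rf = predict(rforestFit1, newdata = trainPPValid, na.action = na.pass)
,gbm = predict(gbmFit1, newdata = trainPPValid, na.action = na.pass)
,classe = as.factor(trainPPValid$classe)
)
stackFit <- train(classe ~ ., data = stackData, method = "rf", trControl = fitControl)
Note that an honest accuracy estimate for the stacked model would require data not used to fit the meta-model, so a further hold-out split (or out-of-fold predictions) would be needed.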
The validation accuracies are calculated below. For our final model, we would use the random forest: its prediction accuracy is slightly better than the GBM's, and it requires far less computational run time.
# LDA
# prediction on validation
ldaPredict1_trainPPValid <- predict(ldaFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
ldaPredict1_testPP <- predict(ldaFit1, newdata = testPP, na.action=na.pass)
# decision tree
# prediction on validation
treePredict1_trainPPValid <- predict(treeFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
treePredict1_testPP <- predict(treeFit1, newdata = testPP, na.action=na.pass)
# random forest
# prediction on validation
rforestPredict1_trainPPValid <- predict(rforestFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
rforestPredict1_testPP <- predict(rforestFit1, newdata = testPP, na.action=na.pass)
# GBM
# prediction on validation
gbmPredict1_trainPPValid <- predict(gbmFit1, newdata = trainPPValid, na.action=na.pass)
# prediction on test
gbmPredict1_testPP <- predict(gbmFit1, newdata = testPP, na.action=na.pass)
results <- data.frame("model" = c("LDA", "Decision Tree", "Random Forest", "Gradient Boosting Machine"), "Validation set accuracy" = c(
confusionMatrix(data=ldaPredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1]
,confusionMatrix(data=treePredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1]
,confusionMatrix(data=rforestPredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1]
,confusionMatrix(data=gbmPredict1_trainPPValid,reference = as.factor(trainPPValid$classe))$overall[1]
))
results
data.frame("LDA" = ldaPredict1_testPP, "Decision Tree" = treePredict1_testPP
, "Random Forest" = rforestPredict1_testPP ,"GBM"=gbmPredict1_testPP)