library("kernlab")
library("caret")
library("randomForest")
# Load the weight-lifting data set. stringsAsFactors = TRUE keeps text columns
# (including the `classe` outcome) as factors: since R 4.0.0 read.csv() leaves
# them as character by default, while the confusionMatrix() calls further down
# expect factor inputs.
WL <- read.csv("weightlift.csv", stringsAsFactors = TRUE)
# Seed set to 555 for reproducible results
set.seed(555)
# Partition on the outcome `classe`: the rows selected by createDataPartition
# (p = 0.25) become the training set, every remaining row goes to validation.
inTraining <- createDataPartition(y = WL$classe, p = 0.25, list = FALSE)
trainingData <- WL[inTraining, ]
ValData <- WL[-inTraining, ]
# Resampling options: K-fold cross-validation with K = 3
# Resampling settings object for train(): 3-fold cross-validation.
trainOpts <- trainControl(method = "cv", number = 3)
# Fit the main random forest on 12 predictors: roll, pitch and total
# acceleration for each of the belt, arm, dumbbell and forearm sensors.
# NOTE(review): no trControl is passed here, so caret uses its default
# resampling (the printed summary below reports 25 bootstrap reps) rather
# than the 3-fold CV held in trainOpts -- confirm this is intended.
mainRFModel <- train(
  classe ~
    roll_belt + pitch_belt + total_accel_belt +
    roll_arm + pitch_arm + total_accel_arm +
    roll_dumbbell + pitch_dumbbell + total_accel_dumbbell +
    roll_forearm + pitch_forearm + total_accel_forearm,
  data = trainingData,
  method = "rf"
)
# Show the resampling summary for the fitted model
mainRFModel
## Random Forest
##
## 4907 samples
## 12 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 4907, 4907, 4907, 4907, 4907, 4907, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9427996 0.9276296
## 7 0.9370105 0.9203163
## 12 0.9219115 0.9012164
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Show the underlying randomForest fit (OOB error and confusion matrix)
mainRFModel[["finalModel"]]
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.32%
## Confusion matrix:
## A B C D E class.error
## A 1372 10 6 4 3 0.01648746
## B 25 869 35 15 6 0.08526316
## C 3 23 809 21 0 0.05490654
## D 3 1 24 775 1 0.03606965
## E 0 11 11 10 870 0.03547672
# The highest in-sample (resampled) accuracy attained by this 12-predictor random forest model is 94.28%, i.e. an in-sample error of only 5.72%
# Score the held-out validation rows with the 12-predictor model, then
# cross-tabulate predicted against actual classes.
testPredict <- predict(mainRFModel, newdata = ValData)
confusionMatrix(data = testPredict, reference = ValData$classe)$table
## Reference
## Prediction A B C D E
## A 4120 46 4 8 4
## B 25 2630 45 3 10
## C 26 110 2466 76 16
## D 11 47 45 2320 24
## E 3 14 6 5 2651
# Overall accuracy of the 12-predictor model on the validation rows
confusionMatrix(data = testPredict, reference = ValData$classe)$overall["Accuracy"]
## Accuracy
## 0.9641182
# The out-of-sample accuracy attained by this 12-predictor random forest model is 96.41%, i.e. an out-of-sample error of only 3.59%
# Variable importance for the predictors of the main model
topPred <- varImp(object = mainRFModel)
print(topPred)
## rf variable importance
##
## Overall
## roll_belt 100.000
## pitch_belt 65.775
## pitch_forearm 57.746
## roll_dumbbell 44.017
## roll_forearm 39.705
## roll_arm 23.153
## total_accel_dumbbell 21.039
## pitch_dumbbell 20.590
## total_accel_belt 9.699
## pitch_arm 2.735
## total_accel_forearm 2.365
## total_accel_arm 0.000
# Order the predictors from most to least important and keep the names of
# the six highest-ranked ones.
valSort <- order(topPred$importance$Overall, decreasing = TRUE)
valKeep <- rownames(topPred$importance)[valSort[seq_len(6)]]
# Reduce the validation data to those six predictors plus the outcome
ValData <- ValData[, c(valKeep, "classe")]
# List the retained columns
names(ValData)
## [1] "roll_belt" "pitch_belt" "pitch_forearm" "roll_dumbbell"
## [5] "roll_forearm" "roll_arm" "classe"
# Split the reduced validation data again on `classe` (p = 0.6): the selected
# rows train the 6-predictor model, the remaining rows are kept for testing.
inTrainingVal <- createDataPartition(y = ValData$classe, p = 0.6, list = FALSE)
trainingVal <- ValData[inTrainingVal, ]
testing <- ValData[-inTrainingVal, ]
# Random forest on every remaining column, resampled according to trainOpts
valRFModel <- train(
  classe ~ .,
  data = trainingVal,
  method = "rf",
  trControl = trainOpts
)
# Show the resampling summary for the fitted model
valRFModel
## Random Forest
##
## 8831 samples
## 6 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 5887, 5887, 5888
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9525536 0.9399918
## 4 0.9490431 0.9355537
## 6 0.9411168 0.9255408
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Show the underlying randomForest fit (OOB error and confusion matrix)
valRFModel[["finalModel"]]
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 3.9%
## Confusion matrix:
## A B C D E class.error
## A 2466 25 9 8 3 0.01792115
## B 36 1583 48 29 13 0.07372733
## C 5 36 1456 33 10 0.05454545
## D 1 4 34 1404 5 0.03038674
## E 1 21 14 9 1578 0.02772643
# The highest in-sample (cross-validated) accuracy attained by this 6-predictor validation random forest model is 95.26%, i.e. an in-sample error of only 4.74%
# Score the held-out `testing` rows with the 6-predictor model, then
# cross-tabulate predicted against actual classes.
testPredictVal <- predict(valRFModel, newdata = testing)
confusionMatrix(data = testPredictVal, reference = testing$classe)$table
## Reference
## Prediction A B C D E
## A 1642 22 5 3 2
## B 17 1059 19 5 11
## C 10 26 975 19 6
## D 3 28 18 934 6
## E 2 3 9 3 1057
# Overall accuracy of the 6-predictor model on the `testing` rows
confusionMatrix(data = testPredictVal, reference = testing$classe)$overall["Accuracy"]
## Accuracy
## 0.9631203
# The out-of-sample accuracy attained by this 6-predictor validation random forest model is 96.31%, i.e. an out-of-sample error of only 3.69%