library("kernlab")
library("caret")
library("randomForest")
# Load the weight-lifting data set. stringsAsFactors = TRUE is spelled out
# because R >= 4.0 reads text columns as character by default, while the
# confusionMatrix() calls further down need the outcome `classe` as a factor.
WL <- read.csv("weightlift.csv", stringsAsFactors = TRUE)

Partition Data Set

Seed set to 555 for reproducible results

# Fix the RNG state so the random split below is repeatable.
set.seed(555)

# Partition on the outcome `classe`: p = 0.25 sends about a quarter of the
# rows to the training side, and list = FALSE returns the selected row
# indices as a matrix rather than a list.
inTraining <- createDataPartition(
  y = WL[["classe"]],
  p = 0.25,
  list = FALSE
)

# Selected rows train the model; every remaining row is held out.
trainingData <- WL[inTraining, ]
ValData <- WL[-inTraining, ]

Set Up Training Options

K-fold cross-validation, where K=3

# Resampling scheme for caret::train(): 3-fold cross-validation.
# The settings are handed to trainControl() directly instead of being
# patched onto the returned list afterwards.
trainOpts <- trainControl(method = "cv", number = 3)

Setting Up Random Forest Model

Train & Build

# Fit a random forest on 12 predictors: the roll, pitch and total_accel
# variables for belt, arm, dumbbell and forearm.
# NOTE(review): no trControl is supplied here, so trainOpts is not used by
# this fit and caret falls back to its default resampling.
mainRFModel <- train(
  classe ~
    roll_belt + pitch_belt + total_accel_belt +
    roll_arm + pitch_arm + total_accel_arm +
    roll_dumbbell + pitch_dumbbell + total_accel_dumbbell +
    roll_forearm + pitch_forearm + total_accel_forearm,
  data = trainingData,
  method = "rf"
)

Evaluate (In-Sample)

# Show the caret resampling summary for the 12-predictor model
mainRFModel
## Random Forest 
## 
## 4907 samples
##   12 predictor
##    5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 4907, 4907, 4907, 4907, 4907, 4907, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9427996  0.9276296
##    7    0.9370105  0.9203163
##   12    0.9219115  0.9012164
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.
# Show the underlying randomForest fit (OOB error rate and confusion matrix)
mainRFModel$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.32%
## Confusion matrix:
##      A   B   C   D   E class.error
## A 1372  10   6   4   3  0.01648746
## B   25 869  35  15   6  0.08526316
## C    3  23 809  21   0  0.05490654
## D    3   1  24 775   1  0.03606965
## E    0  11  11  10 870  0.03547672

The highest in-sample accuracy attained by this 12-predictor random forest model is 94.28% (at mtry = 2), with an in-sample error of only 5.72%

Evaluate (Out-Sample)

# Predict the held-out rows with the 12-predictor model.
testPredict <- predict(mainRFModel, ValData)
# Build the confusion matrix once and reuse it for both summaries below
# instead of recomputing it for each one.
mainCM <- confusionMatrix(testPredict, ValData$classe)
# Cross-tabulation of predicted vs. reference classes.
mainCM$table
##           Reference
## Prediction    A    B    C    D    E
##          A 4120   46    4    8    4
##          B   25 2630   45    3   10
##          C   26  110 2466   76   16
##          D   11   47   45 2320   24
##          E    3   14    6    5 2651
# Overall accuracy, selected by name rather than by position.
mainCM$overall["Accuracy"]
##  Accuracy 
## 0.9641182

The out-of-sample accuracy attained by this 12-predictor random forest model is 96.41%, with an out-of-sample error of only 3.59%

Validation Set

Find top predictors in mainRFModel

# Rank the predictors of the 12-predictor model by importance and show them.
topPred <- varImp(object = mainRFModel)
print(x = topPred)
## rf variable importance
## 
##                      Overall
## roll_belt            100.000
## pitch_belt            65.775
## pitch_forearm         57.746
## roll_dumbbell         44.017
## roll_forearm          39.705
## roll_arm              23.153
## total_accel_dumbbell  21.039
## pitch_dumbbell        20.590
## total_accel_belt       9.699
## pitch_arm              2.735
## total_accel_forearm    2.365
## total_accel_arm        0.000

Keep top 6 predictors and outcome variable only

# Row positions of the importance table, most important predictor first.
valSort <- order(topPred$importance[["Overall"]], decreasing = TRUE)
# Names of the six highest-ranked predictors.
valKeep <- rownames(topPred$importance)[valSort[seq_len(6)]]
# Reduce the held-out data to those six predictors plus the outcome column.
ValData <- ValData[, c(valKeep, "classe")]
# List the columns that remain.
colnames(ValData)
## [1] "roll_belt"     "pitch_belt"    "pitch_forearm" "roll_dumbbell"
## [5] "roll_forearm"  "roll_arm"      "classe"

Partition Validation Data Set

# Split the reduced held-out data: p = 0.6 goes to the training side used
# for the 6-predictor model, the remainder is kept for the final test.
# NOTE(review): no set.seed() call precedes this split in the visible code,
# so it relies on the RNG state left by the earlier steps.
inTrainingVal <- createDataPartition(
  y = ValData[["classe"]],
  p = 0.6,
  list = FALSE
)
trainingVal <- ValData[inTrainingVal, ]
testing <- ValData[-inTrainingVal, ]

Retrain with only those top 6 predictors

# Refit the random forest on the six retained predictors, this time with
# the resampling settings held in trainOpts.
valRFModel <- train(
  classe ~ .,
  data = trainingVal,
  method = "rf",
  trControl = trainOpts
)

Evaluate Validation Model (In-Sample)

# Show the caret resampling summary for the 6-predictor model
valRFModel
## Random Forest 
## 
## 8831 samples
##    6 predictor
##    5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold) 
## Summary of sample sizes: 5887, 5887, 5888 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9525536  0.9399918
##   4     0.9490431  0.9355537
##   6     0.9411168  0.9255408
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 2.
# Show the underlying randomForest fit (OOB error rate and confusion matrix)
valRFModel$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = param$mtry) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 3.9%
## Confusion matrix:
##      A    B    C    D    E class.error
## A 2466   25    9    8    3  0.01792115
## B   36 1583   48   29   13  0.07372733
## C    5   36 1456   33   10  0.05454545
## D    1    4   34 1404    5  0.03038674
## E    1   21   14    9 1578  0.02772643

The highest in-sample accuracy attained by this 6-predictor validation random forest model is 95.26% (at mtry = 2), with an in-sample error of only 4.74%

Evaluate Validation Model (Out-Sample)

# Predict the final test rows with the 6-predictor model.
testPredictVal <- predict(valRFModel, testing)
# Build the confusion matrix once and reuse it for both summaries below
# instead of recomputing it for each one.
valCM <- confusionMatrix(testPredictVal, testing$classe)
# Cross-tabulation of predicted vs. reference classes.
valCM$table
##           Reference
## Prediction    A    B    C    D    E
##          A 1642   22    5    3    2
##          B   17 1059   19    5   11
##          C   10   26  975   19    6
##          D    3   28   18  934    6
##          E    2    3    9    3 1057
# Overall accuracy, selected by name rather than by position.
valCM$overall["Accuracy"]
##  Accuracy 
## 0.9631203

The out-of-sample accuracy attained by this 6-predictor validation random forest model is 96.31%, with an out-of-sample error of only 3.69%