1. Load Necessary Libraries
library(caret)
library(GGally)
library(rpart)
library(rpart.plot)
library(party)
library(RGtk2)
library(rattle)
library(xgboost)
library(formattable)
library(dplyr)
library(tidyr)
library(tibble)
library(ggthemes)
  1. Download Data
# Locations of the training and testing CSV files.
train_url <- 'https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv'
test_url <- 'https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv'

# read.csv() downloads and parses each file straight from its URL.
# Note: the data frame `train` shares its name with caret's train() function;
# calls such as train(...) still resolve to the function.
train <- read.csv(file = train_url)
test <- read.csv(file = test_url)
  1. Create training / test partition for model validation
# Fix the RNG seed so the random 75/25 split -- and every result derived
# from it -- is reproducible from run to run. (The console output echoed
# further down came from an unseeded run, so its numbers may differ slightly.)
set.seed(867)

# createDataPartition() returns row indices for 75% of the rows, drawn with
# respect to the `classe` outcome; list = FALSE yields a matrix of indices
# that can be used directly for row subsetting.
inTrain <- createDataPartition(train$classe, p = 0.75, list = FALSE)

# Rows selected above form the model-building set; the remainder is held out
# for validation.
training <- train[inTrain, ]
testing <- train[-inTrain, ]
  1. Identify and remove near zero variance predictors
# Compute near-zero-variance diagnostics for every column of the training
# partition; with saveMetrics = TRUE the result has one row per column and a
# logical `nzv` flag.
nzvs <- nearZeroVar(training, saveMetrics = TRUE)

# Keep only the flagged rows and no columns: the row names of this
# zero-column data frame are the names of the flagged predictors.
nzvars <- nzvs[nzvs$nzv, 0]

# Drop the flagged predictors from both the training and held-out partitions.
drop_from_train <- colnames(training) %in% rownames(nzvars)
drop_from_test <- colnames(testing) %in% rownames(nzvars)
smallTrain <- training[, !drop_from_train]
smallTest <- testing[, !drop_from_test]
  1. Find predictors with large amount of NA values
# Count the missing values in every remaining column with one vectorised
# call instead of growing a vector inside a loop. unname() keeps the printed
# result a plain, unnamed vector of counts.
x <- unname(colSums(is.na(smallTrain)))
print(x)
##   [1]     0     0     0     0     0     0     0     0     0     0 14405
##  [12] 14405 14405 14405 14405 14405 14405 14405 14405 14405 14405 14405
##  [23] 14405 14405 14405 14405     0     0     0     0     0     0     0
##  [34]     0     0     0     0     0     0 14405     0     0     0     0
##  [45]     0     0     0     0     0 14405 14405 14405 14405 14405 14405
##  [56]     0     0     0 14405 14405 14405 14405 14405 14405     0 14405
##  [67] 14405 14405 14405 14405 14405 14405 14405 14405 14405     0     0
##  [78]     0     0     0     0     0     0     0     0     0     0 14405
##  [89] 14405 14405     0 14405     0     0     0     0     0     0     0
## [100]     0     0     0
# Tabulate the distinct NA counts: columns are either complete or almost
# entirely missing.
table(x)
## x
##     0 14405 
##    59    43
  1. Remove Predictors with NA values
# Choose the columns to keep from the training partition only, then apply
# that same selection (by name) to the held-out partition. Selecting columns
# separately per partition could leave the two sets with different
# predictors, and the held-out data should not drive feature selection.
complete_cols <- colnames(smallTrain)[colSums(is.na(smallTrain)) == 0]

# drop = FALSE keeps the result a data frame even if only one column survives.
smallerTrain <- smallTrain[, complete_cols, drop = FALSE]
smallerTest <- smallTest[, complete_cols, drop = FALSE]
  1. Final Data Cleaning Steps
# The first column is an id number that merely duplicates the row index, so
# it is removed from both partitions before modelling.
id_col <- 1

sTrain <- smallerTrain[, -id_col]
sTest <- smallerTest[, -id_col]
  1. Prediction with Decision Trees
#--------------
# Decision Tree
#--------------
set.seed(867)

# Fit a classification tree for `classe` on all remaining predictors and
# draw it.
tree <- rpart(classe ~ ., data = sTrain, method = 'class')
fancyRpartPlot(tree)

#Predict using tree
TreeFit <- predict(tree, sTest, type = 'class')

# confusionMatrix() requires both arguments to be factors with the same
# levels. Since R 4.0.0 read.csv() keeps strings as character by default, so
# coerce the reference labels explicitly (a no-op when they are already a
# factor with these levels).
TreeReference <- factor(testing$classe, levels = levels(TreeFit))
TreeResults <- confusionMatrix(TreeFit, TreeReference)

#Tree Accuracy (selected by name rather than by position)
TreeResults$overall['Accuracy']
##  Accuracy 
## 0.8711256
  1. Decision Tree Results
# Convert the confusion matrix table to a data frame so that each
# Prediction/Reference cell and its Freq count can be plotted.
tcm <- as.data.frame(TreeResults$table)

# Plot title carrying the accuracy rounded to four decimals.
tree_accuracy <- round(TreeResults$overall['Accuracy'], 4)
tree_title <- paste("Decision Tree Accuracy:", tree_accuracy)

# Heat map of the confusion matrix: tiles shaded by count, with the count
# printed on top of each tile.
ggplot(data = tcm, mapping = aes(x = Prediction, y = Reference)) +
    geom_tile(mapping = aes(fill = Freq)) +
    geom_text(mapping = aes(label = digits(Freq, 0))) +
    theme_hc() +
    scale_fill_gradient(low = "ivory3", high = "palegreen3") +
    ggtitle(label = tree_title, subtitle = "Confusion Matrix Plot") +
    theme(
        plot.title = element_text(hjust = 0.5, size = 12),
        plot.subtitle = element_text(hjust = 0.5, size = 10)
    )


  1. Prediction with Random Forests
set.seed(867)

# Tune the random forest with 10-fold cross-validation repeated 3 times.
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
forest <- train(classe ~ ., data = sTrain, method = 'rf', trControl = control)
print(forest)
## Random Forest 
## 
## 14718 samples
##    57 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 13247, 13247, 13247, 13245, 13248, 13245, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9902388  0.9876513
##   40    0.9991619  0.9989399
##   79    0.9987088  0.9983668
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 40.
# Predict classes for the held-out partition.
forestFit <- predict(forest, sTest)

# confusionMatrix() requires both arguments to be factors with the same
# levels. Since R 4.0.0 read.csv() keeps strings as character by default, so
# coerce the reference labels explicitly (a no-op when they are already a
# factor with these levels).
forestReference <- factor(testing$classe, levels = levels(forestFit))
forestResults <- confusionMatrix(forestFit, forestReference)

# Held-out accuracy, selected by name rather than by position.
forestResults$overall['Accuracy']
##  Accuracy 
## 0.9991843
  1. Random Forest Results
# Convert the confusion matrix table to a data frame so that each
# Prediction/Reference cell and its Freq count can be plotted.
fcm <- as.data.frame(forestResults$table)

# Plot title carrying the accuracy rounded to four decimals.
forest_accuracy <- round(forestResults$overall['Accuracy'], 4)
forest_title <- paste("Random Forest Accuracy:", forest_accuracy)

# Heat map of the confusion matrix: tiles shaded by count, with the count
# printed on top of each tile.
ggplot(data = fcm, mapping = aes(x = Prediction, y = Reference)) +
    geom_tile(mapping = aes(fill = Freq)) +
    geom_text(mapping = aes(label = digits(Freq, 0))) +
    theme_hc() +
    scale_fill_gradient(low = "lemonchiffon1", high = "chartreuse3") +
    ggtitle(label = forest_title, subtitle = "Confusion Matrix Plot") +
    theme(
        plot.title = element_text(hjust = 0.5, size = 12),
        plot.subtitle = element_text(hjust = 0.5, size = 10)
    )

The random forest gives the best fit, with an accuracy of 99.92% on the held-out validation partition, compared with 87.11% for the decision tree.

  1. Apply Random Forest model to test set
# Predict with the random forest on the held-out partition, then show a
# random sample of 50 of those predictions.
# NOTE(review): this uses `testing` (the validation split); the downloaded
# `test` data frame is never used -- confirm which set was intended.
validation_predictions <- predict(forest, newdata = testing)
sample(validation_predictions, size = 50)
##  [1] C B C E A A E C E C C A A A B B B B D B A A A A C C A A C D A D B B A
## [36] A C D D B C A B E A E D C A D
## Levels: A B C D E