library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(tidyr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
traindata <- read.csv(url("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"),header=TRUE)
testdata <- read.csv(url("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"),header=TRUE)
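A quick dimensionality check confirms the download before any cleaning (a minimal sketch):

dim(traindata)   # observations x raw columns, including the classe label
dim(testdata)    # the 20 unlabeled cases used for the final prediction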
train <- traindata[, -c(1:7)]
# Drop columns where 30% or more of the values are missing
train <- train %>% discard(~ sum(is.na(.x)) / length(.x) * 100 >= 30)
# Also remove near-zero variance variables
NearZ <- nearZeroVar(train)
train <- train[, -NearZ]  

test <- testdata[, -c(1:7)]
# Apply the same missing-value filter to the test set
test <- test %>% discard(~ sum(is.na(.x)) / length(.x) * 100 >= 30)
NearZ <- nearZeroVar(test)
# There are no near-zero variance variables in the test set
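Because the two data sets are cleaned independently, they are not guaranteed to end up with the same columns. A defensive alignment step (a minimal sketch; note that classe exists only in the training data and problem_id only in the test data) keeps the shared predictors:

# Restrict the test set to the predictor columns that survived cleaning on the training side
common_cols <- intersect(names(train), names(test))
test <- test[, common_cols]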


Index <- createDataPartition(train$classe, p = 0.7, list = FALSE)
training <- train[Index, ]
testing <- train[-Index, ]
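createDataPartition draws a random stratified sample, so the 70/30 split changes between runs; setting a seed immediately before the call makes it reproducible (a minimal sketch with an arbitrary seed value):

set.seed(2023)   # any fixed seed makes the split repeatable
Index <- createDataPartition(train$classe, p = 0.7, list = FALSE)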

1. Modelling using Recursive Partitioning and Regression Trees (RPART)

Build a model using training data

cart <- train(classe ~ ., data = training, method = 'rpart')

Predict

cart_predict <- predict(cart, testing)

Extract the confusion matrix and the accuracy of the model when tested with the hold-out set

cart_cm <- confusionMatrix(cart_predict, as.factor(testing$classe))
cart_acc <- cart_cm[["overall"]][["Accuracy"]] * 100
cart_acc
## [1] 47.5616
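An accuracy of about 47.6% on the hold-out set is poor; the expected out-of-sample error is simply its complement (a quick sketch):

100 - cart_acc   # estimated out-of-sample error for CART, about 52.4%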
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
fancyRpartPlot(cart$finalModel)

2. Modelling using Gradient Boosting Machines (GBM)

Build a model using training data

set.seed(123)
controlGBM <- trainControl(method = "repeatedcv", number = 5, repeats=1)
gbm <- train(classe ~ ., data = training, method = 'gbm', trControl = controlGBM, verbose = FALSE)

gbm
## Stochastic Gradient Boosting 
## 
## 13737 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times) 
## Summary of sample sizes: 10990, 10990, 10988, 10990, 10990 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.7529304  0.6865957
##   1                  100      0.8201944  0.7724017
##   1                  150      0.8520062  0.8126521
##   2                   50      0.8551372  0.8163888
##   2                  100      0.9061658  0.8812499
##   2                  150      0.9320817  0.9140407
##   3                   50      0.8945181  0.8663872
##   3                  100      0.9403077  0.9244641
##   3                  150      0.9601808  0.9496198
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 10.
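The selected tuning parameters can also be read directly from the fitted caret object rather than from the printed summary (a short sketch using caret's standard fields):

gbm$bestTune                # n.trees = 150, interaction.depth = 3, shrinkage = 0.1, n.minobsinnode = 10
max(gbm$results$Accuracy)   # cross-validated accuracy of the selected model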

Predict

gbm_predict <- predict(gbm, testing)
# Extract the confusion matrix and the accuracy of the model when tested with the hold-out set
gbm_cm <- confusionMatrix(gbm_predict, as.factor(testing$classe))
gbmacc <- gbm_cm[["overall"]][["Accuracy"]] * 100
gbmacc
## [1] 96.48258

3. Modelling using Random Forest

training$classe <- as.factor(training$classe)
testing$classe <- as.factor(testing$classe)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Build a model using the training data
rf <- randomForest(classe ~ ., data = training)
rf
## 
## Call:
##  randomForest(formula = classe ~ ., data = training) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 0.54%
## Confusion matrix:
##      A    B    C    D    E  class.error
## A 3903    2    0    0    1 0.0007680492
## B   10 2642    6    0    0 0.0060195636
## C    0   16 2378    2    0 0.0075125209
## D    0    0   23 2226    3 0.0115452931
## E    0    0    2    9 2514 0.0043564356

Plot the error rate of the model against the number of trees

plot(rf)

##### The plot shows that the error rate decreases as the number of trees increases, up to about 20-50 trees; beyond that the error rate is essentially constant. So we set the number of trees in the forest to 50.
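This can be confirmed numerically from the out-of-bag error stored in the fitted object; rf$err.rate holds one row per tree, with the aggregate OOB error in the first column (a brief sketch, run against the 500-tree model above):

oob <- rf$err.rate[, "OOB"]
round(oob[c(10, 20, 50, 100, 500)], 4)   # the OOB error has largely flattened by ~50 trees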

rf <- randomForest(classe ~ ., data = training, ntree = 50)

Predict

rf_predict <- predict(rf, testing)

Extract the confusion matrix and the accuracy of the model when tested with the hold-out set

rf_cm <- confusionMatrix(rf_predict, testing$classe)
rfacc <- rf_cm[["overall"]][["Accuracy"]] * 100
rfacc
## [1] 99.54121

Combine the accuracy results into a table

t <- cbind(gbmacc, cart_acc, rfacc)  # note: `t` masks base::t() for the rest of the session
t
##        gbmacc cart_acc    rfacc
## [1,] 96.48258  47.5616 99.54121
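With all three accuracies in one row, the best model can also be picked programmatically (a one-line sketch):

colnames(t)[which.max(t)]   # "rfacc": the random forest is the most accurate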

Predicting on the final test set of 20 cases

rf_final <- predict(rf, test)
rf_final
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E