library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(tidyr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
traindata <- read.csv(url("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"),header=TRUE)
testdata <- read.csv(url("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"),header=TRUE)
# Drop the first seven columns (row id, user name, timestamps, window
# indicators), which carry no predictive signal
train <- traindata[, -c(1:7)]
# Drop any column that is 30% or more NA
train <- train %>% discard(~ sum(is.na(.x)) / length(.x) * 100 >= 30)
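# To illustrate the predicate above (a toy example, purely illustrative):
# any column with 30% or more missing values is dropped.
toy <- data.frame(a = c(1, NA, NA), b = 1:3)
toy %>% discard(~ sum(is.na(.x)) / length(.x) * 100 >= 30)
##   b
## 1 1
## 2 2
## 3 3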
# Also remove near-zero-variance variables
NearZ <- nearZeroVar(train)
train <- train[, -NearZ]
# Apply the same cleaning to the supplied test cases
test <- testdata[, -c(1:7)]
test <- test %>% discard(~ sum(is.na(.x)) / length(.x) * 100 >= 30)
NearZ <- nearZeroVar(test)
# nearZeroVar() flags no columns here: there are no near-zero-variance
# variables left in the test set
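# Sanity check (a minimal sketch): nearZeroVar() returns the indices of
# offending columns, so an empty result confirms the comment above.
length(NearZ)
## [1] 0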
# Split the cleaned training data 70/30 into training and validation sets
Index <- createDataPartition(train$classe, p = 0.7, list = FALSE)
training <- train[Index,]
testing <- train[-Index,]
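# Quick check (illustrative): createDataPartition() samples within each class,
# so the split should preserve the class proportions of the full data.
round(prop.table(table(training$classe)), 3)
##
##     A     B     C     D     E
## 0.284 0.193 0.174 0.164 0.184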
# Fit a classification tree (CART) as a baseline model
cart <- train(classe ~ ., data = training, method = "rpart")
cart_predict <- predict(cart, testing)
cart_cm <- confusionMatrix(cart_predict, as.factor(testing$classe))
cart_acc <- cart_cm[["overall"]][["Accuracy"]] * 100
cart_acc
## [1] 47.5616
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# Visualise the fitted classification tree
fancyRpartPlot(cart$finalModel)
set.seed(123)
# Use 5-fold cross-validation (one repeat) to tune the boosted model
controlGBM <- trainControl(method = "repeatedcv", number = 5, repeats = 1)
gbm <- train(classe ~ ., data = training, method = "gbm", trControl = controlGBM, verbose = FALSE)
gbm
## Stochastic Gradient Boosting
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times)
## Summary of sample sizes: 10990, 10990, 10988, 10990, 10990
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.7529304 0.6865957
## 1 100 0.8201944 0.7724017
## 1 150 0.8520062 0.8126521
## 2 50 0.8551372 0.8163888
## 2 100 0.9061658 0.8812499
## 2 150 0.9320817 0.9140407
## 3 50 0.8945181 0.8663872
## 3 100 0.9403077 0.9244641
## 3 150 0.9601808 0.9496198
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 10.
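# For later reference (a minimal sketch), the winning tuning values reported
# above can also be pulled programmatically from caret's bestTune slot:
gbm_best <- gbm$bestTune  # n.trees = 150, interaction.depth = 3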
gbm_predict <- predict(gbm, testing)
# Extract the confusion matrix and the model's accuracy on the hold-out
# (validation) set
gbm_cm <- confusionMatrix(gbm_predict, as.factor(testing$classe))
gbmacc <- gbm_cm[["overall"]][["Accuracy"]] * 100
gbmacc
## [1] 96.48258
# Convert the outcome to a factor so randomForest fits a classification model
training$classe <- as.factor(training$classe)
testing$classe <- as.factor(testing$classe)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Build a random forest model on the training data
rf <- randomForest(classe ~ ., data = training)
rf
##
## Call:
## randomForest(formula = classe ~ ., data = training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.54%
## Confusion matrix:
## A B C D E class.error
## A 3903 2 0 0 1 0.0007680492
## B 10 2642 6 0 0 0.0060195636
## C 0 16 2378 2 0 0.0075125209
## D 0 0 23 2226 3 0.0115452931
## E 0 0 2 9 2514 0.0043564356
plot(rf)
##### The plot shows the error rate dropping sharply as trees are added, levelling off at roughly 20-50 trees; beyond that it stays essentially constant, so we rebuild the forest with 50 trees.
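# The claim can also be checked numerically (a minimal sketch): rf$err.rate
# stores the OOB error after each additional tree, so the curve behind the
# plot can be inspected directly.
oob_curve <- rf$err.rate[, "OOB"]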
rf <- randomForest(classe ~ ., data = training, ntree = 50)
rf_predict <- predict(rf, testing)
rf_cm <- confusionMatrix(rf_predict, testing$classe)
rfacc <- rf_cm[["overall"]][["Accuracy"]] * 100
rfacc
## [1] 99.54121
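# The expected out-of-sample error follows directly from the hold-out
# accuracy (in percent):
round(100 - rfacc, 2)
## [1] 0.46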
# Collect the three accuracies side by side
t <- cbind(gbmacc, cart_acc, rfacc)
t
## gbmacc cart_acc rfacc
## [1,] 96.48258 47.5616 99.54121
# Predict the 20 supplied test cases with the best model, the random forest
rf_final <- predict(rf, test)
rf_final
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
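# To pair the predictions with their test-case ids for submission (a sketch;
# assumes the problem_id column of the course's pml-testing.csv):
submission <- data.frame(problem_id = testdata$problem_id, prediction = rf_final)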