library(caret)        #createDataPartition, nearZeroVar, train, confusionMatrix; attaches ggplot2
library(GGally)       #exploratory pair plots (not used below)
library(rpart)        #decision tree
library(rpart.plot)   #tree plotting
library(party)        #conditional inference trees (not used below)
library(RGtk2)        #rattle dependency
library(rattle)       #fancyRpartPlot
library(xgboost)      #gradient boosting (not used below)
library(formattable)  #digits() for plot labels
library(dplyr)        #data wrangling (not used below)
library(tidyr)
library(tibble)
library(ggthemes)     #theme_hc()
#Read the course data; stringsAsFactors keeps classe a factor, as caret expects
#(read.csv defaults to FALSE under R >= 4.0). test holds the 20-case graded holdout.
train <- read.csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv', stringsAsFactors = TRUE)
test <- read.csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv', stringsAsFactors = TRUE)
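#The raw CSVs also encode missing data as '#DIV/0!' and blank strings; the default
#read leaves those as factor levels for nearZeroVar() to flag below. A stricter read
#(illustrative only -- trainStrict is an assumption and is not used in what follows)
#declares them NA up front:
trainStrict <- read.csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv',
                        na.strings = c('NA', '#DIV/0!', ''), stringsAsFactors = TRUE)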
#Split the training data 75/25 into model-building and validation partitions
inTrain <- createDataPartition(train$classe, p = 0.75, list = FALSE)
training <- train[inTrain,]
testing <- train[-inTrain,]
#Find near zero variance predictors
nzvs <- nearZeroVar(training, saveMetrics = TRUE)
nzvNames <- rownames(nzvs)[nzvs$nzv]
#Remove near zero variance predictors from train and test sets
smallTrain <- training[, !colnames(training) %in% nzvNames]
smallTest <- testing[, !colnames(testing) %in% nzvNames]
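#Sanity check (added, not original output): the raw partition has 160 columns;
#the near-zero-variance filter should leave the 102 counted below
dim(training)
dim(smallTrain)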
#Count the NAs in each remaining column
x <- integer(ncol(smallTrain))
for(i in 1:ncol(smallTrain)){
  x[i] <- sum(is.na(smallTrain[,i]))
}
print(x)
## [1] 0 0 0 0 0 0 0 0 0 0 14405
## [12] 14405 14405 14405 14405 14405 14405 14405 14405 14405 14405 14405
## [23] 14405 14405 14405 14405 0 0 0 0 0 0 0
## [34] 0 0 0 0 0 0 14405 0 0 0 0
## [45] 0 0 0 0 0 14405 14405 14405 14405 14405 14405
## [56] 0 0 0 14405 14405 14405 14405 14405 14405 0 14405
## [67] 14405 14405 14405 14405 14405 14405 14405 14405 14405 0 0
## [78] 0 0 0 0 0 0 0 0 0 0 14405
## [89] 14405 14405 0 14405 0 0 0 0 0 0 0
## [100] 0 0 0
table(x)
## x
## 0 14405
## 59 43
#Every column either is complete (59 columns) or is ~98% NA (14405 of 14718 rows,
#43 columns), so drop any column containing NAs from the train and test sets
smallerTrain <- smallTrain[,colSums(is.na(smallTrain)) == 0]
smallerTest <- smallTest[,colSums(is.na(smallTest)) == 0]
#Drop the X column, an id that just duplicates the row index
sTrain <- smallerTrain[,-1]
sTest <- smallerTest[,-1]
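#Sanity check (added): both sets now share the same 57 predictors plus classe,
#and no NAs remain in the model-building set
stopifnot(identical(colnames(sTrain), colnames(sTest)), !anyNA(sTrain))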
#--------------
# Decision Tree
#--------------
set.seed(867)
tree <- rpart(classe ~ ., data = sTrain, method = 'class')
fancyRpartPlot(tree)
#Predict using tree
TreeFit <- predict(tree, sTest, type = 'class')
TreeResults <- confusionMatrix(TreeFit, testing$classe)
#Tree Accuracy
TreeResults$overall[1]
## Accuracy
## 0.8711256
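#Expected out-of-sample error for the tree is 1 - validation accuracy
#(a derived figure, not in the original output): ~0.1289
unname(1 - TreeResults$overall['Accuracy'])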
#Display confusion matrix results
tcm <- as.data.frame(TreeResults$table)
ggplot(tcm, aes(Prediction, Reference)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = digits(Freq, 0))) +
  theme_hc() +
  scale_fill_gradient(low = "ivory3", high = "palegreen3") +
  ggtitle(label = paste("Decision Tree Accuracy:", round(TreeResults$overall['Accuracy'], 4)),
          subtitle = "Confusion Matrix Plot") +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.subtitle = element_text(hjust = 0.5, size = 10))
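#--------------
# Random Forest
#--------------
#Repeated 10-fold CV (30 resamples) on ~15,000 rows is slow; registering a parallel
#backend first is an optional speedup (an addition, not part of the original run --
#parallel resampling may not reproduce the summary below exactly)
library(doParallel)
cl <- makePSOCKcluster(max(1, parallel::detectCores() - 1))
registerDoParallel(cl)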
set.seed(867)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
forest <- train(classe ~ ., data = sTrain, method = 'rf', trControl = control)
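stopCluster(cl)  #release the workers once training finishes
registerDoSEQ()  #return foreach to sequential mode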
print(forest)
## Random Forest
##
## 14718 samples
## 57 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 13247, 13247, 13247, 13245, 13248, 13245, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9902388 0.9876513
## 40 0.9991619 0.9989399
## 79 0.9987088 0.9983668
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 40.
forestFit <- predict(forest, sTest)
forestResults <- confusionMatrix(forestFit, testing$classe)
forestResults$overall[1]
## Accuracy
## 0.9991843
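#Which predictors drive the random forest? varImp() on the caret fit gives a quick
#ranking (an added diagnostic, not part of the original write-up)
plot(varImp(forest), top = 10)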
#Display confusion matrix results
fcm <- as.data.frame(forestResults$table)
ggplot(fcm, aes(Prediction, Reference)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = digits(Freq, 0))) +
  theme_hc() +
  scale_fill_gradient(low = "lemonchiffon1", high = "chartreuse3") +
  ggtitle(label = paste("Random Forest Accuracy:", round(forestResults$overall['Accuracy'], 4)),
          subtitle = "Confusion Matrix Plot") +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.subtitle = element_text(hjust = 0.5, size = 10))
The random forest is the better fit, with 99.92% accuracy on the held-out validation partition versus 87.11% for the decision tree.
#Peek at 50 of the random forest's predictions on the validation partition
sample(predict(forest, testing), 50)
## [1] C B C E A A E C E C C A A A B B B B D B A A A A C C A A C D A D B B A
## [36] A C D D B C A B E A E D C A D
## Levels: A B C D E
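#Finally, score the 20-row graded holdout loaded into test at the top (a minimal
#sketch; the fitted caret model selects the columns it needs by name)
predict(forest, test)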