To complete this project, we first need to load several libraries.
Then we need to get rid of spurious variables. Any variable whose name contains kurtosis, max, min, var, stddev, avg, skewness, timestamp, raw, name, or window can be removed because it will not help with predictive modeling. We further clean the data by dropping columns that contain missing values and by applying caret's nearZeroVar function, which identifies variables with approximately zero variance so that they can be removed.
library(lattice)
library(ggplot2)
library(caret)
library(dplyr)
library(rpart)
library(rpart.plot)
library(tibble)
library(bitops)
library(rattle)
library(randomForest)
# Load the training data and the cases used for the final predictions.
# read.csv() already returns a data.frame, so no extra conversion is needed.
train <- read.csv(
  file = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
  header = TRUE
)
final_test <- read.csv(
  file = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
  header = TRUE
)

# Drop every column whose name contains one of these fragments. contains()
# takes the union of all matches, so one select() replaces the chain of
# eleven separate calls.
drop_patterns <- c(
  "kurtosis", "skewness", "stddev", "max", "min", "var",
  "avg", "raw", "timestamp", "name", "window"
)
new_train <- train %>%
  select(-contains(drop_patterns))

# Keep only columns without any missing value.
new_train <- new_train[, colSums(is.na(new_train)) == 0, drop = FALSE]

# Remove near-zero-variance predictors. The guard matters: when nothing is
# flagged, nearZeroVar() returns integer(0) and `df[, -integer(0)]` would
# silently drop *every* column.
nearZvar <- nearZeroVar(new_train)
if (length(nearZvar) > 0) {
  new_train <- new_train[, -nearZvar, drop = FALSE]
}

# Drop the first remaining column (presumably the row-index column written
# by the CSV export -- confirm against the data).
new_train <- new_train[, -1, drop = FALSE]

# Store the outcome as a factor so every model is fitted as a classifier
# regardless of how read.csv() typed the column.
new_train$classe <- factor(new_train$classe)

# Seed the RNG *before* the random split; otherwise the partition (and every
# number derived from it) changes on each run. Any results recorded from an
# unseeded split will not be reproduced exactly.
set.seed(33833)

# 75% of the rows go to `training`, the remaining 25% to `validation`.
# NOTE: despite its name, `inValidation` holds the *training* row indices.
inValidation <- createDataPartition(new_train$classe, p = 3 / 4)[[1]]
training <- new_train[inValidation, ]
validation <- new_train[-inValidation, ]
# Model 1: classification tree. The seed is fixed right before fitting.
set.seed(11122)
Tree_mod <- rpart(
  classe ~ .,
  data = training,
  method = "class"
)
fancyRpartPlot(Tree_mod)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predicted classes for the validation rows, compared with the true labels.
predictTreeVal <- predict(object = Tree_mod, newdata = validation, type = "class")
testclassvalid <- as.factor(validation$classe)
confusionMatrix(data = predictTreeVal, reference = factor(validation$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1260 189 10 85 57
## B 21 573 56 45 9
## C 70 104 711 134 79
## D 24 55 32 495 47
## E 20 28 46 45 709
##
## Overall Statistics
##
## Accuracy : 0.7643
## 95% CI : (0.7521, 0.7761)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7004
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9032 0.6038 0.8316 0.6157 0.7869
## Specificity 0.9028 0.9669 0.9044 0.9615 0.9653
## Pos Pred Value 0.7870 0.8139 0.6475 0.7580 0.8361
## Neg Pred Value 0.9591 0.9105 0.9622 0.9273 0.9527
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2569 0.1168 0.1450 0.1009 0.1446
## Detection Prevalence 0.3265 0.1436 0.2239 0.1332 0.1729
## Balanced Accuracy 0.9030 0.7853 0.8680 0.7886 0.8761
# Out-of-sample error of the tree = 1 - accuracy on the validation set.
tree_accuracy <- confusionMatrix(
  data = predictTreeVal,
  reference = factor(validation$classe)
)$overall[["Accuracy"]]
out_sample_error_tree <- 1 - tree_accuracy
out_sample_error_tree
## [1] 0.2357259
# Model 2: linear discriminant analysis fitted through caret.
set.seed(87234)
modelda <- train(
  classe ~ .,
  data = training,
  method = "lda"
)

# Validation-set predictions and their confusion matrix.
predictlda <- predict(object = modelda, newdata = validation)
confusionMatrix(data = predictlda, reference = factor(validation$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1131 143 98 62 41
## B 30 603 71 38 144
## C 117 127 560 91 84
## D 113 38 105 578 92
## E 4 38 21 35 540
##
## Overall Statistics
##
## Accuracy : 0.6958
## 95% CI : (0.6827, 0.7086)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6148
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8108 0.6354 0.6550 0.7189 0.5993
## Specificity 0.9020 0.9284 0.8965 0.9151 0.9755
## Pos Pred Value 0.7668 0.6806 0.5720 0.6242 0.8464
## Neg Pred Value 0.9230 0.9139 0.9248 0.9432 0.9154
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2306 0.1230 0.1142 0.1179 0.1101
## Detection Prevalence 0.3008 0.1807 0.1996 0.1888 0.1301
## Balanced Accuracy 0.8564 0.7819 0.7757 0.8170 0.7874
# Out-of-sample error of the LDA model = 1 - accuracy on the validation set.
lda_accuracy <- confusionMatrix(
  data = predictlda,
  reference = factor(validation$classe)
)$overall[["Accuracy"]]
out_sample_error_lda <- 1 - lda_accuracy
out_sample_error_lda
## [1] 0.3042414
# Model 3: random forest fitted through caret with 3-fold cross-validation
# as the resampling scheme.
set.seed(98427)
controlfunction <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = FALSE
)
rand_for <- train(
  classe ~ .,
  data = training,
  method = "rf",
  trControl = controlfunction
)

# Predict the validation rows once and reuse the result for the matrix.
predrf <- predict(object = rand_for, newdata = validation)
confusionMatrix(data = predrf, reference = factor(validation$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1394 10 0 0 0
## B 0 938 6 0 0
## C 1 1 846 9 0
## D 0 0 3 795 2
## E 0 0 0 0 899
##
## Overall Statistics
##
## Accuracy : 0.9935
## 95% CI : (0.9908, 0.9955)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9917
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9993 0.9884 0.9895 0.9888 0.9978
## Specificity 0.9972 0.9985 0.9973 0.9988 1.0000
## Pos Pred Value 0.9929 0.9936 0.9872 0.9938 1.0000
## Neg Pred Value 0.9997 0.9972 0.9978 0.9978 0.9995
## Prevalence 0.2845 0.1935 0.1743 0.1639 0.1837
## Detection Rate 0.2843 0.1913 0.1725 0.1621 0.1833
## Detection Prevalence 0.2863 0.1925 0.1748 0.1631 0.1833
## Balanced Accuracy 0.9982 0.9934 0.9934 0.9938 0.9989
# Out-of-sample error of the random forest = 1 - accuracy on the validation
# set; `predrf` already holds the validation predictions of `rand_for`.
rf_accuracy <- confusionMatrix(
  data = predrf,
  reference = factor(validation$classe)
)$overall[["Accuracy"]]
out_sample_error_rf <- 1 - rf_accuracy
out_sample_error_rf
## [1] 0.006525285
Random Forest has the highest accuracy and the lowest out-of-sample error of the three models. Linear Discriminant Analysis has the lowest accuracy and the highest out-of-sample error of the three. Random Forest's very high accuracy may be partly due to overfitting, although it was measured on the held-out validation set rather than on the data used for fitting.
# Plot the fitted caret random forest object.
plot(rand_for)

# Apply the random forest to the final test cases and print the predictions.
set.seed(80275)
pred_model <- predict(object = rand_for, newdata = final_test)
pred_model
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E