Include the libraries and then clean the Data

To complete this project we need to load several specific libraries.

Then we need to remove spurious variables. Any variable whose name contains kurtosis, max, min, var, stddev, avg, skewness, timestamp, raw, name, or window can be removed because it will not help with predictive modeling. We further clean the data with the nearZeroVar function, which removes variables with approximately zero variance.

library(lattice)
library(ggplot2)
library(caret)
library(dplyr)
library(rpart)
library(rpart.plot)
library(tibble)
library(bitops)
library(rattle)
library(randomForest)

# Download the training data; the outer data.frame() keeps the explicit
# coercion of the parsed CSV.
train <- data.frame(
  read.csv(
    file = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv",
    header = TRUE
  )
)

# Download the cases that are scored with the chosen model at the end.
final_test <- read.csv(
  file = "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv",
  header = TRUE
)

# Remove every column whose name contains one of these fragments.
# contains() ignores case by default and, given a character vector, selects
# the union of all matches, so one call replaces the chain of select() steps.
new_train <- train %>%
  select(-contains(c(
    "kurtosis", "skewness", "stddev", "max", "min", "var", "avg",
    "raw", "timestamp", "name", "window"
  )))

# Keep only columns without any missing values.
new_train <- new_train[, colSums(is.na(new_train)) == 0, drop = FALSE]

# Drop near-zero-variance columns. Guard against an empty result:
# indexing with -integer(0) would select zero columns and wipe the data.
nearZvar <- nearZeroVar(new_train)
if (length(nearZvar) > 0) {
  new_train <- new_train[, -nearZvar, drop = FALSE]
}

# Drop the first remaining column (presumably the CSV row index, which must
# not be used as a predictor -- confirm against the raw file).
new_train <- new_train[, -1, drop = FALSE]

# Seed the RNG before the random split so the partition, and every metric
# computed from it, is reproducible between runs.
set.seed(32323)

# NOTE: despite its name, inValidation holds the row indices of the 75%
# sample used for training; the remaining rows form the validation set.
inValidation <- createDataPartition(new_train$classe, p = 3 / 4)[[1]]
training <- new_train[inValidation, ]
validation <- new_train[-inValidation, ]

Tree Method

# Fix the RNG state, fit a classification tree on all predictors, and draw it.
set.seed(11122)
Tree_mod <- rpart(
  formula = classe ~ .,
  data = training,
  method = "class"
)
fancyRpartPlot(Tree_mod)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predict class labels for the validation rows with the fitted tree.
predictTreeVal <- predict(Tree_mod, newdata = validation, type = "class")

# Reference labels as a factor; reused below instead of rebuilding the factor
# inline (the variable was previously assigned but never used).
testclassvalid <- as.factor(validation$classe)

# Cross-tabulate predictions against the reference labels.
confusionMatrix(predictTreeVal, testclassvalid)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1260  189   10   85   57
##          B   21  573   56   45    9
##          C   70  104  711  134   79
##          D   24   55   32  495   47
##          E   20   28   46   45  709
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7643          
##                  95% CI : (0.7521, 0.7761)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7004          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9032   0.6038   0.8316   0.6157   0.7869
## Specificity            0.9028   0.9669   0.9044   0.9615   0.9653
## Pos Pred Value         0.7870   0.8139   0.6475   0.7580   0.8361
## Neg Pred Value         0.9591   0.9105   0.9622   0.9273   0.9527
## Prevalence             0.2845   0.1935   0.1743   0.1639   0.1837
## Detection Rate         0.2569   0.1168   0.1450   0.1009   0.1446
## Detection Prevalence   0.3265   0.1436   0.2239   0.1332   0.1729
## Balanced Accuracy      0.9030   0.7853   0.8680   0.7886   0.8761
# Out-of-sample error of the tree = 1 - validation accuracy.
tree_cm <- confusionMatrix(predictTreeVal, factor(validation$classe))
out_sample_error_tree <- 1 - tree_cm$overall[["Accuracy"]]
out_sample_error_tree
## [1] 0.2357259

Linear Discriminant Analysis

# Fit linear discriminant analysis through caret and score the validation set.
# caret::train is qualified because `train` is also the raw data frame.
set.seed(87234)
modelda <- caret::train(
  classe ~ .,
  data = training,
  method = "lda"
)
predictlda <- predict(modelda, newdata = validation)
confusionMatrix(predictlda, factor(validation$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1131  143   98   62   41
##          B   30  603   71   38  144
##          C  117  127  560   91   84
##          D  113   38  105  578   92
##          E    4   38   21   35  540
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6958          
##                  95% CI : (0.6827, 0.7086)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6148          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8108   0.6354   0.6550   0.7189   0.5993
## Specificity            0.9020   0.9284   0.8965   0.9151   0.9755
## Pos Pred Value         0.7668   0.6806   0.5720   0.6242   0.8464
## Neg Pred Value         0.9230   0.9139   0.9248   0.9432   0.9154
## Prevalence             0.2845   0.1935   0.1743   0.1639   0.1837
## Detection Rate         0.2306   0.1230   0.1142   0.1179   0.1101
## Detection Prevalence   0.3008   0.1807   0.1996   0.1888   0.1301
## Balanced Accuracy      0.8564   0.7819   0.7757   0.8170   0.7874
# Out-of-sample error of LDA = 1 - validation accuracy.
lda_cm <- confusionMatrix(predictlda, factor(validation$classe))
out_sample_error_lda <- 1 - lda_cm$overall[["Accuracy"]]
out_sample_error_lda
## [1] 0.3042414

Random Forest

# Fit a random forest through caret, tuned with 3-fold cross-validation.
set.seed(98427)
controlfunction <- trainControl(
  method = "cv",
  number = 3,
  verboseIter = FALSE
)
rand_for <- caret::train(
  classe ~ .,
  data = training,
  method = "rf",
  trControl = controlfunction
)

# Score the validation set once and reuse the stored predictions
# (previously predrf was computed and then the prediction was redone inline).
predrf <- predict(rand_for, newdata = validation)
confusionMatrix(predrf, factor(validation$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1394   10    0    0    0
##          B    0  938    6    0    0
##          C    1    1  846    9    0
##          D    0    0    3  795    2
##          E    0    0    0    0  899
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9935          
##                  95% CI : (0.9908, 0.9955)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9917          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9993   0.9884   0.9895   0.9888   0.9978
## Specificity            0.9972   0.9985   0.9973   0.9988   1.0000
## Pos Pred Value         0.9929   0.9936   0.9872   0.9938   1.0000
## Neg Pred Value         0.9997   0.9972   0.9978   0.9978   0.9995
## Prevalence             0.2845   0.1935   0.1743   0.1639   0.1837
## Detection Rate         0.2843   0.1913   0.1725   0.1621   0.1833
## Detection Prevalence   0.2863   0.1925   0.1748   0.1631   0.1833
## Balanced Accuracy      0.9982   0.9934   0.9934   0.9938   0.9989
# Out-of-sample error of the random forest = 1 - validation accuracy.
# Reuses the stored validation predictions (predrf) instead of re-running
# predict(), and reads accuracy by name rather than by position.
rf_cm <- confusionMatrix(predrf, factor(validation$classe))
out_sample_error_rf <- 1 - rf_cm$overall[["Accuracy"]]
out_sample_error_rf
## [1] 0.006525285

Random Forest has the highest accuracy and the lowest out-of-sample error of the three models. Linear Discriminant Analysis has the lowest accuracy and the highest out-of-sample error of the three. Random Forest's very high accuracy may be partly due to overfitting.

# Plot the fitted random-forest train object returned by caret.
plot(rand_for)

Go with Random Forest for Final Prediction

# Score the downloaded test cases with the selected random-forest model and
# print the predicted classes.
set.seed(80275)
pred_model <- predict(
  object = rand_for,
  newdata = final_test
)
pred_model
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E