library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
##     combine
## The following object is masked from 'package:ggplot2':
##
##     margin
set.seed(1234)
download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", destfile = "training.csv")
download.file("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", destfile = "testing.csv")
# Data for model building; partitioned into training and validation sets below
training <- read.csv("training.csv", na.strings = c("NA", "", "#DIV/0!"))
# The 20-case test data, used only for the final prediction
validate <- read.csv("testing.csv")
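One small robustness tweak worth noting (my addition, not part of the original script): skip the download when a local copy already exists, so repeated knits don't re-fetch the files.
# Optional sketch: only download when the file is missing locally.
for (f in c("training.csv", "testing.csv")) {
  if (!file.exists(f)) {
    download.file(paste0("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-", f),
                  destfile = f)
  }
}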
First, we identify near-zero-variance (NZV) variables and create clean data sets without them; we then remove the original sets and the NZV metrics from memory.
NZV <- nearZeroVar(training, saveMetrics = TRUE)
trainingclean <- training[, !NZV$nzv]
validateclean <- validate[, !NZV$nzv]
rm(training)
rm(validate)
rm(NZV)
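If you want to see what was dropped, the metrics table returned by nearZeroVar(..., saveMetrics = TRUE) can be inspected before the rm() calls above; a brief illustrative sketch (not run in the original analysis):
# Illustrative: run before rm(NZV). Each row of NZV describes one column;
# the 'nzv' flag marks near-zero-variance columns.
sum(NZV$nzv)                  # how many columns get dropped
head(rownames(NZV)[NZV$nzv])  # first few of their names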
Our data sets also contain columns that are unnecessary for our analysis (the row index, user name, and timestamps); we remove these columns altogether.
useless <- grepl("^X|timestamp|user_name", names(trainingclean))
training <- trainingclean[, !useless]
testing <- validateclean[, !useless]
rm(useless)
rm(trainingclean)
rm(validateclean)
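To make the pattern concrete, here is a quick check (my addition; it would have to run before the rm() calls above) of which columns it matches: the row index X, user_name, and the timestamp fields.
# Illustrative: list the columns the pattern removes.
grep("^X|timestamp|user_name", names(trainingclean), value = TRUE)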
dim(training)
## [1] 19622 119
dim(testing)
## [1] 20 119
The last step in the cleaning process is removing every column that still contains NA values (only complete columns are kept).
keep <- (colSums(is.na(training)) == 0)
training <- training[, keep]
testing <- testing[, keep]
rm(keep)
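A quick sanity check (my addition) confirms the cleaning worked as intended:
# No missing values should remain after this step.
stopifnot(!anyNA(training))
ncol(training)   # columns kept after all cleaning steps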
In this section, we split the cleaned training data into 70% for model training and 30% for validation.
inTrain <- createDataPartition(training$classe, p = 0.70, list = FALSE)
validation <- training[-inTrain, ]
training <- training[inTrain, ]
rm(inTrain)
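Because createDataPartition() samples within each level of classe, the class proportions in both pieces should closely match the full data; a quick check (my addition):
# Verify the stratified split preserved the class distribution.
round(prop.table(table(training$classe)), 3)
round(prop.table(table(validation$classe)), 3)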
We now have three clean data sets in the project environment, with the following dimensions:
dim(testing)
## [1] 20 54
dim(training)
## [1] 13737 54
dim(validation)
## [1] 5885 54
We first fit a decision tree predicting classe from all remaining variables, then plot it.
tree <- rpart(classe ~ ., data = training, method = "class")
prp(tree)
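An alternative worth knowing (not used in this analysis): caret::train can tune rpart's complexity parameter cp with cross-validation rather than relying on the defaults. A minimal sketch, assuming 5-fold CV:
# Illustrative only; slower, but usually yields a better-pruned tree.
treeCV <- train(classe ~ ., data = training, method = "rpart",
                trControl = trainControl(method = "cv", number = 5))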
Let’s check the tree against our validation data set, reporting the confusion matrix and overall accuracy.
predictTree <- predict(tree, validation, type = "class")
Decision_Tree_Confusion_Matrix <- confusionMatrix(validation$classe, predictTree)
Decision_Tree_Confusion_Matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1489 70 19 81 15
## B 206 671 70 134 58
## C 54 34 823 49 66
## D 76 84 128 618 58
## E 52 111 98 133 688
##
## Overall Statistics
##
## Accuracy : 0.7288
## 95% CI : (0.7172, 0.7401)
## No Information Rate : 0.3189
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6557
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.7933 0.6918 0.7232 0.6089 0.7774
## Specificity 0.9538 0.9048 0.9572 0.9290 0.9212
## Pos Pred Value 0.8895 0.5891 0.8021 0.6411 0.6359
## Neg Pred Value 0.9079 0.9370 0.9352 0.9193 0.9590
## Prevalence 0.3189 0.1648 0.1934 0.1725 0.1504
## Detection Rate 0.2530 0.1140 0.1398 0.1050 0.1169
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.8736 0.7983 0.8402 0.7689 0.8493
# Accuracy and estimated out-of-sample error for the decision tree
treeAccuracy <- postResample(predictTree, validation$classe)
treeOSE <- 1 - as.numeric(Decision_Tree_Confusion_Matrix$overall[1])
Next, we train a Random Forest on the training set. Note that randomForest() takes no method argument; classification is inferred because classe is a factor.
forestmodel <- randomForest(classe ~ ., data = training)
Now we check its accuracy against the validation data set.
predictRF <- predict(forestmodel, validation)
Random_Forest_Confusion_Matrix <- confusionMatrix(validation$classe, predictRF)
Random_Forest_Confusion_Matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 0 0 0 0
## B 3 1136 0 0 0
## C 0 4 1021 1 0
## D 0 0 5 959 0
## E 0 0 0 0 1082
##
## Overall Statistics
##
## Accuracy : 0.9978
## 95% CI : (0.9962, 0.9988)
## No Information Rate : 0.285
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9972
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9982 0.9965 0.9951 0.9990 1.0000
## Specificity 1.0000 0.9994 0.9990 0.9990 1.0000
## Pos Pred Value 1.0000 0.9974 0.9951 0.9948 1.0000
## Neg Pred Value 0.9993 0.9992 0.9990 0.9998 1.0000
## Prevalence 0.2850 0.1937 0.1743 0.1631 0.1839
## Detection Rate 0.2845 0.1930 0.1735 0.1630 0.1839
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9991 0.9979 0.9970 0.9990 1.0000
# Accuracy and estimated out-of-sample error for the random forest
rfAccuracy <- postResample(predictRF, validation$classe)
rfOSE <- 1 - as.numeric(Random_Forest_Confusion_Matrix$overall[1])
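It can also be instructive to see which predictors drive the forest. varImpPlot() from the randomForest package plots variable importance (mean decrease in Gini for this classification fit); this is my addition, not part of the original analysis:
# Plot the 15 most important predictors from the fitted forest.
varImpPlot(forestmodel, n.var = 15, main = "Top 15 predictors by importance")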
Below is a side-by-side comparison of the Decision Tree and Random Forest confusion matrices stored above.
Decision_Tree_Confusion_Matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1489 70 19 81 15
## B 206 671 70 134 58
## C 54 34 823 49 66
## D 76 84 128 618 58
## E 52 111 98 133 688
##
## Overall Statistics
##
## Accuracy : 0.7288
## 95% CI : (0.7172, 0.7401)
## No Information Rate : 0.3189
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6557
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.7933 0.6918 0.7232 0.6089 0.7774
## Specificity 0.9538 0.9048 0.9572 0.9290 0.9212
## Pos Pred Value 0.8895 0.5891 0.8021 0.6411 0.6359
## Neg Pred Value 0.9079 0.9370 0.9352 0.9193 0.9590
## Prevalence 0.3189 0.1648 0.1934 0.1725 0.1504
## Detection Rate 0.2530 0.1140 0.1398 0.1050 0.1169
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.8736 0.7983 0.8402 0.7689 0.8493
Random_Forest_Confusion_Matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 0 0 0 0
## B 3 1136 0 0 0
## C 0 4 1021 1 0
## D 0 0 5 959 0
## E 0 0 0 0 1082
##
## Overall Statistics
##
## Accuracy : 0.9978
## 95% CI : (0.9962, 0.9988)
## No Information Rate : 0.285
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9972
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9982 0.9965 0.9951 0.9990 1.0000
## Specificity 1.0000 0.9994 0.9990 0.9990 1.0000
## Pos Pred Value 1.0000 0.9974 0.9951 0.9948 1.0000
## Neg Pred Value 0.9993 0.9992 0.9990 0.9998 1.0000
## Prevalence 0.2850 0.1937 0.1743 0.1631 0.1839
## Detection Rate 0.2845 0.1930 0.1735 0.1630 0.1839
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9991 0.9979 0.9970 0.9990 1.0000
Finally, let’s use the more accurate Random Forest model to predict the 20 cases in the provided testing set.
predict(forestmodel, testing)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
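For the course submission, each predicted label is typically written to its own text file. A minimal sketch, assuming the conventional problem_id_N.txt naming (the file names are my assumption):
# Write one prediction per file for submission.
answers <- predict(forestmodel, testing)
for (i in seq_along(answers)) {
  write.table(answers[i], file = paste0("problem_id_", i, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)
}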