library(knitr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(rpart)
library(rpart.plot)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(corrplot)
set.seed(301)
TrainUrl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
TestUrl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
TrainFile <- "pml-training.csv"
TestFile <- "pml-testing.csv"
# download the datasets
if (!file.exists(TrainFile)) {
  download.file(TrainUrl, destfile = TrainFile)
}
training <- read.csv(TrainFile)
if (!file.exists(TestFile)) {
  download.file(TestUrl, destfile = TestFile)
}
testing <- read.csv(TestFile)
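Note: on R 4.0 and later, read.csv() no longer converts strings to factors, so classe comes in as character. A minimal sketch of the conversion that createDataPartition() and confusionMatrix() expect (not needed on the older R version that produced the output below):
# convert the outcome to a factor (needed on R >= 4.0, where
# stringsAsFactors defaults to FALSE)
training$classe <- factor(training$classe)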
# partition the training dataset with caret on a 70/30 ratio (train/validation)
inTrain <- createDataPartition(training$classe, p=0.7, list=FALSE)
TrainSet <- training[inTrain, ]
TestSet <- training[-inTrain, ]
dim(TrainSet)
## [1] 13737 160
dim(TestSet)
## [1] 5885 160
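createDataPartition() samples within each level of classe, so the split is stratified. A quick sanity check of the class proportions (output not shown in the original):
# class proportions should be (nearly) identical across the split
round(prop.table(table(TrainSet$classe)), 3)
round(prop.table(table(TestSet$classe)), 3)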
# remove variables with near-zero variance
NZV <- nearZeroVar(TrainSet)
TrainSet <- TrainSet[, -NZV]
TestSet <- TestSet[, -NZV]
dim(TrainSet)
## [1] 13737 106
dim(TestSet)
## [1] 5885 106
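For diagnostics, nearZeroVar() can also return the metrics behind each flag; a sketch, assuming it is run on the partition before the columns above are dropped:
# frequency ratio and percent-unique values driving the NZV flags
nzvMetrics <- nearZeroVar(training[inTrain, ], saveMetrics = TRUE)
head(nzvMetrics[nzvMetrics$nzv, ])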
# remove variables that are mostly NA (more than 95% missing)
AllNA <- sapply(TrainSet, function(x) mean(is.na(x))) > 0.95
TrainSet <- TrainSet[, !AllNA]
TestSet <- TestSet[, !AllNA]
dim(TrainSet)
## [1] 13737 59
dim(TestSet)
## [1] 5885 59
# remove identification-only variables (columns 1 to 5)
TrainSet <- TrainSet[, -(1:5)]
TestSet <- TestSet[, -(1:5)]
dim(TrainSet)
## [1] 13737 54
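The corrplot package loaded above can be used to check how correlated the remaining 53 predictors are (column 54 is the classe outcome); a sketch:
# correlation matrix of the numeric predictors;
# dark cells mark highly correlated pairs
corMatrix <- cor(TrainSet[, -54])
corrplot(corMatrix, order = "FPC", method = "color", type = "lower",
         tl.cex = 0.6, tl.col = rgb(0, 0, 0))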
# Random Forest
# model fit
set.seed(301)
controlRF <- trainControl(method="cv", number=3, verboseIter=FALSE)
modFitRandForest <- train(classe ~ ., data=TrainSet, method="rf",
trControl=controlRF)
modFitRandForest$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = param$mtry)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 27
##
## OOB estimate of error rate: 0.25%
## Confusion matrix:
## A B C D E class.error
## A 3905 0 0 0 1 0.0002560164
## B 4 2649 4 1 0 0.0033860045
## C 0 8 2388 0 0 0.0033388982
## D 0 0 9 2242 1 0.0044404973
## E 0 0 0 7 2518 0.0027722772
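The fitted caret object also exposes the cross-validation profile and variable importance; a short sketch (output not shown):
# cross-validated accuracy for each candidate mtry
plot(modFitRandForest)
# predictors ranked by random-forest importance
varImp(modFitRandForest)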
# prediction on Test dataset
predictRandForest <- predict(modFitRandForest, newdata=TestSet)
confMatRandForest <- confusionMatrix(predictRandForest, TestSet$classe)
confMatRandForest
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1673 10 0 0 0
## B 1 1128 6 0 0
## C 0 1 1020 1 0
## D 0 0 0 963 0
## E 0 0 0 0 1082
##
## Overall Statistics
##
## Accuracy : 0.9968
## 95% CI : (0.995, 0.9981)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9959
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9994 0.9903 0.9942 0.9990 1.0000
## Specificity 0.9976 0.9985 0.9996 1.0000 1.0000
## Pos Pred Value 0.9941 0.9938 0.9980 1.0000 1.0000
## Neg Pred Value 0.9998 0.9977 0.9988 0.9998 1.0000
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2843 0.1917 0.1733 0.1636 0.1839
## Detection Prevalence 0.2860 0.1929 0.1737 0.1636 0.1839
## Balanced Accuracy 0.9985 0.9944 0.9969 0.9995 1.0000
# plot matrix results
plot(confMatRandForest$table, col = confMatRandForest$byClass,
main = paste("Random Forest - Accuracy =",
round(confMatRandForest$overall['Accuracy'], 4)))
# Decision Tree
# model fit
set.seed(301)
modFitDecTree <- rpart(classe ~ ., data=TrainSet, method="class")
fancyRpartPlot(modFitDecTree)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# prediction on Test dataset
predictDecTree <- predict(modFitDecTree, newdata=TestSet, type="class")
confMatDecTree <- confusionMatrix(predictDecTree, TestSet$classe)
confMatDecTree
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1441 107 2 15 5
## B 156 880 73 80 56
## C 0 48 848 29 0
## D 64 58 98 761 72
## E 13 46 5 79 949
##
## Overall Statistics
##
## Accuracy : 0.8291
## 95% CI : (0.8192, 0.8386)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7843
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8608 0.7726 0.8265 0.7894 0.8771
## Specificity 0.9694 0.9231 0.9842 0.9407 0.9702
## Pos Pred Value 0.9178 0.7068 0.9168 0.7227 0.8690
## Neg Pred Value 0.9460 0.9442 0.9641 0.9580 0.9723
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2449 0.1495 0.1441 0.1293 0.1613
## Detection Prevalence 0.2668 0.2116 0.1572 0.1789 0.1856
## Balanced Accuracy 0.9151 0.8479 0.9053 0.8650 0.9237
# plot matrix results
plot(confMatDecTree$table, col = confMatDecTree$byClass,
main = paste("Decision Tree - Accuracy =",
round(confMatDecTree$overall['Accuracy'], 4)))
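rpart's complexity-parameter table shows whether pruning would help; a diagnostic sketch (output not shown):
# cross-validated error (xerror) for each subtree size; pruning at the
# cp with the lowest xerror would give the simplest competitive tree
printcp(modFitDecTree)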
# Generalized Boosted Model (GBM)
# model fit
set.seed(301)
controlGBM <- trainControl(method = "repeatedcv", number = 5, repeats = 1)
modFitGBM <- train(classe ~ ., data=TrainSet, method = "gbm",
trControl = controlGBM, verbose = FALSE)
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
modFitGBM$finalModel
## A gradient boosted model with multinomial loss function.
## 150 iterations were performed.
## There were 53 predictors of which 45 had non-zero influence.
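The tuning profile and the per-predictor influence can likewise be inspected from the caret object; a sketch (output not shown):
# cross-validated accuracy across boosting iterations and tree depth
plot(modFitGBM)
# relative influence of each predictor in the final boosted model
summary(modFitGBM$finalModel, plotit = FALSE)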
# prediction on Test dataset
predictGBM <- predict(modFitGBM, newdata=TestSet)
confMatGBM <- confusionMatrix(predictGBM, TestSet$classe)
confMatGBM
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1669 14 0 0 0
## B 1 1112 9 4 6
## C 0 8 1015 12 1
## D 4 5 2 948 9
## E 0 0 0 0 1066
##
## Overall Statistics
##
## Accuracy : 0.9873
## 95% CI : (0.9841, 0.99)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9839
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9970 0.9763 0.9893 0.9834 0.9852
## Specificity 0.9967 0.9958 0.9957 0.9959 1.0000
## Pos Pred Value 0.9917 0.9823 0.9797 0.9793 1.0000
## Neg Pred Value 0.9988 0.9943 0.9977 0.9967 0.9967
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2836 0.1890 0.1725 0.1611 0.1811
## Detection Prevalence 0.2860 0.1924 0.1760 0.1645 0.1811
## Balanced Accuracy 0.9968 0.9860 0.9925 0.9897 0.9926
# plot matrix results
plot(confMatGBM$table, col = confMatGBM$byClass,
main = paste("GBM - Accuracy =", round(confMatGBM$overall['Accuracy'], 4)))
# Random Forest gave the highest validation accuracy (0.9968, vs. 0.9873 for
# GBM and 0.8291 for the decision tree), so it is applied to the 20 test cases
predictTEST <- predict(modFitRandForest, newdata=testing)
predictTEST
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
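For the course quiz the 20 predictions are usually submitted one per file; a minimal sketch using a hypothetical helper (pml_write_files is not part of the original analysis):
# write each prediction to its own file (problem_id_1.txt, ...);
# hypothetical helper, not from the original report
pml_write_files <- function(x) {
  for (i in seq_along(x)) {
    write.table(x[i], file = paste0("problem_id_", i, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(predictTEST)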