library(knitr)
library(rpart.plot)
## Loading required package: rpart
library(rpart.plot)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(rpart)
library(RColorBrewer)
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
library(plyr)
library(rpart)
# Load the labelled training data and the 20-row submission set. The strings
# "NA", "#DIV/0!" and "" are all read as missing values.
dt_training <- read.csv("pml-training.csv", na.strings = c("NA", "#DIV/0!", ""))
dt_testing <- read.csv("pml-testing.csv", na.strings = c("NA", "#DIV/0!", ""))

# Keep only columns that contain no missing values in the submission set, and
# of those take positions 8 to 59 as predictors.
# NOTE(review): positions 1-7 are presumably bookkeeping columns (row id, user,
# timestamps, window markers) -- confirm against the CSV header.
# The name `options` is kept as-is because later code may refer to it.
options <- names(dt_testing[, colSums(is.na(dt_testing)) == 0])[8:59]
dt_training <- dt_training[, c(options, "classe")]
dt_testing <- dt_testing[, c(options, "problem_id")]

# Since R 4.0.0 read.csv() no longer converts strings to factors by default.
# The classifiers and confusionMatrix() below need a factor outcome, so make
# the conversion explicit (a no-op when the column is already a factor).
dt_training$classe <- factor(dt_training$classe)

dim(dt_training)
dim(dt_testing)
## [1] 19622 53
## [1] 20 53
library(caret)

# Split the labelled data: 60% of the rows (chosen by createDataPartition()
# from the `classe` column) go to `training`, the remaining rows to `testing`.
# The seed makes the split repeatable.
set.seed(54321)
inTrain <- createDataPartition(y = dt_training$classe, p = 0.6, list = FALSE)
training <- dt_training[inTrain, ]
testing <- dt_training[-inTrain, ]
dim(training)
dim(testing)
## [1] 11776 53
## [1] 7846 53
# Fit a classification tree on all predictors.
# rpart.control() has no `method` or `number` arguments (those belong to
# caret::trainControl()); they were silently swallowed by its `...`. The
# number of internal cross-validations is set with `xval`; 10 is also the
# default, so the fitted tree is the same as before.
set.seed(54321)
modFitDT <- rpart(
  classe ~ .,
  data = training,
  method = "class",
  control = rpart.control(xval = 10)
)
fancyRpartPlot(modFitDT)
# Score the decision tree on the held-out rows and tabulate the predicted
# labels against the true `classe` values.
set.seed(54321)
prediction <- predict(object = modFitDT, newdata = testing, type = "class")
confusionMatrix(data = prediction, reference = testing[["classe"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2047 317 41 147 65
## B 44 736 62 35 61
## C 60 185 1120 193 176
## D 30 107 77 813 61
## E 51 173 68 98 1079
##
## Overall Statistics
##
## Accuracy : 0.7386
## 95% CI : (0.7287, 0.7483)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6676
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9171 0.48485 0.8187 0.6322 0.7483
## Specificity 0.8985 0.96808 0.9052 0.9581 0.9391
## Pos Pred Value 0.7822 0.78465 0.6459 0.7472 0.7345
## Neg Pred Value 0.9646 0.88680 0.9594 0.9300 0.9431
## Prevalence 0.2845 0.19347 0.1744 0.1639 0.1838
## Detection Rate 0.2609 0.09381 0.1427 0.1036 0.1375
## Detection Prevalence 0.3335 0.11955 0.2210 0.1387 0.1872
## Balanced Accuracy 0.9078 0.72646 0.8620 0.7951 0.8437
# Fit a random forest on all predictors, keeping variable-importance measures.
# The former `method = "rf"` and `trControl = trainControl(...)` arguments are
# caret::train() options; randomForest() does not use them and silently
# dropped them through `...`, so no 10-fold cross-validation was ever run
# here. They are removed to stop the call from implying otherwise; the fitted
# model is unchanged.
set.seed(54321)
modFitRF <- randomForest(classe ~ ., data = training, importance = TRUE)
plot(modFitRF)
## Predicting with the Random Forest Model
# Score the forest on the held-out rows and compare with the true labels.
prediction <- predict(object = modFitRF, newdata = testing, type = "class")
confusionMatrix(data = prediction, reference = testing[["classe"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2225 9 0 0 0
## B 7 1503 9 0 0
## C 0 6 1359 17 0
## D 0 0 0 1269 3
## E 0 0 0 0 1439
##
## Overall Statistics
##
## Accuracy : 0.9935
## 95% CI : (0.9915, 0.9952)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9918
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9969 0.9901 0.9934 0.9868 0.9979
## Specificity 0.9984 0.9975 0.9964 0.9995 1.0000
## Pos Pred Value 0.9960 0.9895 0.9834 0.9976 1.0000
## Neg Pred Value 0.9988 0.9976 0.9986 0.9974 0.9995
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2836 0.1916 0.1732 0.1617 0.1834
## Detection Prevalence 0.2847 0.1936 0.1761 0.1621 0.1834
## Balanced Accuracy 0.9976 0.9938 0.9949 0.9932 0.9990
##install.packages("caret", dependencies = c("Depends","Suggests"))
##install.packages("lubridate")
library("caret")
library("lubridate")
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
##
## here
## The following object is masked from 'package:base':
##
## date
# Fit a gradient boosting model through caret, tuned with 10-fold
# cross-validation. `verbose = FALSE` is forwarded to the underlying gbm fit
# to silence per-iteration output; the literal FALSE is used because `F` is an
# ordinary variable that can be reassigned.
# NOTE(review): no set.seed() is called directly before this fit, so the CV
# folds depend on the RNG state left by the preceding steps.
modFitBoost <- train(
  classe ~ .,
  data = training,
  method = "gbm",
  trControl = trainControl(method = "cv", number = 10),
  verbose = FALSE
)
modFitBoost
## Stochastic Gradient Boosting
##
## 11776 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 10599, 10598, 10598, 10599, 10600, 10597, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.7518718 0.6854187
## 1 100 0.8199747 0.7721598
## 1 150 0.8536873 0.8148744
## 2 50 0.8560651 0.8176832
## 2 100 0.9065053 0.8816813
## 2 150 0.9307906 0.9124310
## 3 50 0.8984370 0.8713841
## 3 100 0.9419986 0.9266035
## 3 150 0.9601732 0.9496176
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Plot the resampling results of the boosting model.
plot(modFitBoost)

## Predicting with the Boosting Model
# Score the boosted model on the held-out rows and compare with the true
# labels.
prediction <- predict(object = modFitBoost, newdata = testing)
confusionMatrix(data = prediction, reference = testing[["classe"]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 2196 47 0 1 5
## B 31 1431 33 4 8
## C 3 38 1316 48 12
## D 2 0 16 1222 13
## E 0 2 3 11 1404
##
## Overall Statistics
##
## Accuracy : 0.9647
## 95% CI : (0.9604, 0.9687)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9553
## Mcnemar's Test P-Value : 1.068e-05
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9839 0.9427 0.9620 0.9502 0.9736
## Specificity 0.9906 0.9880 0.9844 0.9953 0.9975
## Pos Pred Value 0.9764 0.9496 0.9287 0.9753 0.9887
## Neg Pred Value 0.9936 0.9863 0.9919 0.9903 0.9941
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2799 0.1824 0.1677 0.1557 0.1789
## Detection Prevalence 0.2866 0.1921 0.1806 0.1597 0.1810
## Balanced Accuracy 0.9872 0.9653 0.9732 0.9728 0.9856
# Apply the decision tree to the 20 submission cases. With no `type` given,
# predict() on a classification tree returns class probabilities; that
# default is spelled out here, so the result is a matrix with one column per
# class rather than a vector of labels.
predictionDT <- predict(object = modFitDT, newdata = dt_testing, type = "prob")
predictionDT
## A B C D E
## 1 0.03521127 0.130281690 0.518779343 0.15551643 0.160211268
## 2 0.66047591 0.217063262 0.011027278 0.08183401 0.029599536
## 3 0.03301887 0.204402516 0.194968553 0.16981132 0.397798742
## 4 1.00000000 0.000000000 0.000000000 0.00000000 0.000000000
## 5 0.70308123 0.142857143 0.030812325 0.09803922 0.025210084
## 6 0.03521127 0.130281690 0.518779343 0.15551643 0.160211268
## 7 0.05241090 0.125786164 0.027253669 0.70649895 0.088050314
## 8 0.70308123 0.142857143 0.030812325 0.09803922 0.025210084
## 9 0.99487179 0.005128205 0.000000000 0.00000000 0.000000000
## 10 0.66047591 0.217063262 0.011027278 0.08183401 0.029599536
## 11 0.03521127 0.130281690 0.518779343 0.15551643 0.160211268
## 12 0.03301887 0.204402516 0.194968553 0.16981132 0.397798742
## 13 0.03521127 0.130281690 0.518779343 0.15551643 0.160211268
## 14 0.99487179 0.005128205 0.000000000 0.00000000 0.000000000
## 15 0.03301887 0.204402516 0.194968553 0.16981132 0.397798742
## 16 0.04545455 0.342245989 0.005347594 0.13368984 0.473262032
## 17 0.98314607 0.000000000 0.011235955 0.00000000 0.005617978
## 18 0.66047591 0.217063262 0.011027278 0.08183401 0.029599536
## 19 0.66047591 0.217063262 0.011027278 0.08183401 0.029599536
## 20 0.04678363 0.771929825 0.023391813 0.05847953 0.099415205
# Apply the random forest to the 20 submission cases and show the predicted
# class labels.
predictionRF <- predict(object = modFitRF, newdata = dt_testing)
predictionRF
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Apply the boosting model to the 20 submission cases and show the predicted
# class labels.
predictionBoost <- predict(object = modFitBoost, newdata = dt_testing)
predictionBoost
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Random-forest class predictions for the held-out validation rows, stored
# under a dedicated name.
prediction_originaltesting_RF <- predict(
  object = modFitRF,
  newdata = testing,
  type = "class"
)