One thing that people regularly do is quantify how much of a particular activity they do, but they rarely quantify how well they do it. In this project, we use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants.
The goal of this project is to predict the manner in which they did the exercise. This is the “classe” variable in the training set; the other variables are used as predictors for it.
# Seed the RNG before any random step (data partition, model resampling).
set.seed(12345)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Tokens that read.csv() should treat as missing values in both files.
na_tokens <- c("", "NA", "#DIV/0!")

# Labelled data, later split into a fitting set and a validation set.
url_train <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
training <- read.csv(file = url_train, na.strings = na_tokens)

# The 20 cases that are predicted with the final model.
url_test <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
testdata <- read.csv(file = url_test, na.strings = na_tokens)

dim(training)
## [1] 19622 160
dim(testdata)
## [1] 20 160
# Remove columns that contain missing values.
# The table below shows how many columns have each NA count: 60 columns are
# complete, and every other column is missing in at least 19216 of the
# 19622 rows, so only the complete columns are kept.
table(colSums(is.na(training)))
##
## 0 19216 19217 19218 19220 19221 19225 19226 19227 19248 19293 19294 19296
## 60 67 1 1 1 4 1 4 2 2 1 1 2
## 19299 19300 19301 19622
## 1 4 2 6
is_complete <- colSums(is.na(training)) == 0
colselect <- colnames(training)[is_complete]
colselect
## [1] "X" "user_name" "raw_timestamp_part_1"
## [4] "raw_timestamp_part_2" "cvtd_timestamp" "new_window"
## [7] "num_window" "roll_belt" "pitch_belt"
## [10] "yaw_belt" "total_accel_belt" "gyros_belt_x"
## [13] "gyros_belt_y" "gyros_belt_z" "accel_belt_x"
## [16] "accel_belt_y" "accel_belt_z" "magnet_belt_x"
## [19] "magnet_belt_y" "magnet_belt_z" "roll_arm"
## [22] "pitch_arm" "yaw_arm" "total_accel_arm"
## [25] "gyros_arm_x" "gyros_arm_y" "gyros_arm_z"
## [28] "accel_arm_x" "accel_arm_y" "accel_arm_z"
## [31] "magnet_arm_x" "magnet_arm_y" "magnet_arm_z"
## [34] "roll_dumbbell" "pitch_dumbbell" "yaw_dumbbell"
## [37] "total_accel_dumbbell" "gyros_dumbbell_x" "gyros_dumbbell_y"
## [40] "gyros_dumbbell_z" "accel_dumbbell_x" "accel_dumbbell_y"
## [43] "accel_dumbbell_z" "magnet_dumbbell_x" "magnet_dumbbell_y"
## [46] "magnet_dumbbell_z" "roll_forearm" "pitch_forearm"
## [49] "yaw_forearm" "total_accel_forearm" "gyros_forearm_x"
## [52] "gyros_forearm_y" "gyros_forearm_z" "accel_forearm_x"
## [55] "accel_forearm_y" "accel_forearm_z" "magnet_forearm_x"
## [58] "magnet_forearm_y" "magnet_forearm_z" "classe"
# The first 7 columns (row id, user name, timestamps, window info) do not
# relate to the exercise, so they are dropped. Negative indexing is used
# instead of 8:length(colselect), which would count backwards if fewer than
# 8 columns were left.
colselect <- colselect[-seq_len(7)]
training <- training[colselect]
We use 70% of the data for training and 30% for validation.
# Draw the row indices of the 70% fitting sample using the outcome column.
inTrain <- createDataPartition(y = training$classe, p = 0.7, list = FALSE)

# Take the hold-out rows first: `training` is overwritten on the next line.
testing <- training[-inTrain, ]
training <- training[inTrain, ]
dim(training)
## [1] 13737 53
dim(testing)
## [1] 5885 53

# Store the outcome as a factor in both subsets.
training[["classe"]] <- factor(training[["classe"]])
testing[["classe"]] <- factor(testing[["classe"]])

# Look for near-zero-variance columns; the empty result means none were found.
nzv <- nearZeroVar(training)
nzv
## integer(0)
We fit several different models and choose the one with the highest accuracy.

### Predicting with trees
# Fit a single classification tree on all predictors and show the final tree.
fitRpart <- train(classe ~ ., data = training, method = "rpart")
print(fitRpart$finalModel)
## n= 13737
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 13737 9831 A (0.28 0.19 0.17 0.16 0.18)
## 2) roll_belt< 130.5 12577 8681 A (0.31 0.21 0.19 0.18 0.11)
## 4) pitch_forearm< -33.95 1107 9 A (0.99 0.0081 0 0 0) *
## 5) pitch_forearm>=-33.95 11470 8672 A (0.24 0.23 0.21 0.2 0.12)
## 10) magnet_dumbbell_y< 439.5 9738 6992 A (0.28 0.18 0.24 0.19 0.11)
## 20) roll_forearm< 123.5 6030 3573 A (0.41 0.18 0.18 0.17 0.062) *
## 21) roll_forearm>=123.5 3708 2471 C (0.078 0.18 0.33 0.23 0.18) *
## 11) magnet_dumbbell_y>=439.5 1732 831 B (0.03 0.52 0.041 0.22 0.19) *
## 3) roll_belt>=130.5 1160 10 E (0.0086 0 0 0 0.99) *
# Predict the validation set and confirm both factors share the same levels.
predRpart <- predict(fitRpart, newdata = testing)
levels(testing$classe)
## [1] "A" "B" "C" "D" "E"
levels(predRpart)
## [1] "A" "B" "C" "D" "E"
# confusionMatrix() takes the predictions as `data` and the true classes as
# `reference`. The original call had them swapped, which transposes the table
# and the per-class statistics.
# NOTE: the recorded output that follows was produced with the swapped call,
# so its "Prediction" rows are the true classes and its "Reference" columns
# are the predictions. The overall accuracy is the same either way.
confusionMatrix(data = predRpart, reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1525 29 116 0 4
## B 484 385 270 0 0
## C 499 37 490 0 0
## D 423 187 354 0 0
## E 153 159 289 0 481
##
## Overall Statistics
##
## Accuracy : 0.4895
## 95% CI : (0.4767, 0.5024)
## No Information Rate : 0.524
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3324
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.4945 0.48306 0.32258 NA 0.99175
## Specificity 0.9468 0.85181 0.87723 0.8362 0.88870
## Pos Pred Value 0.9110 0.33802 0.47758 NA 0.44455
## Neg Pred Value 0.6298 0.91319 0.78823 NA 0.99917
## Prevalence 0.5240 0.13543 0.25811 0.0000 0.08241
## Detection Rate 0.2591 0.06542 0.08326 0.0000 0.08173
## Detection Prevalence 0.2845 0.19354 0.17434 0.1638 0.18386
## Balanced Accuracy 0.7206 0.66743 0.59991 NA 0.94023
#fancyRpartPlot(fitRpart$finalModel)
# Fit a random forest on all predictors and predict the validation set.
fitRf <- train(classe ~ ., data = training, method = "rf")
predRf <- predict(fitRf, newdata = testing)
# confusionMatrix() takes the predictions as `data` and the true classes as
# `reference`. The original call had them swapped, which transposes the table
# and the per-class statistics.
# NOTE: the recorded output that follows was produced with the swapped call,
# so its "Prediction" rows are the true classes and its "Reference" columns
# are the predictions. The overall accuracy is the same either way.
confusionMatrix(data = predRf, reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1672 1 1 0 0
## B 8 1129 2 0 0
## C 0 6 1017 3 0
## D 0 0 7 956 1
## E 0 0 1 1 1080
##
## Overall Statistics
##
## Accuracy : 0.9947
## 95% CI : (0.9925, 0.9964)
## No Information Rate : 0.2855
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9933
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9952 0.9938 0.9893 0.9958 0.9991
## Specificity 0.9995 0.9979 0.9981 0.9984 0.9996
## Pos Pred Value 0.9988 0.9912 0.9912 0.9917 0.9982
## Neg Pred Value 0.9981 0.9985 0.9977 0.9992 0.9998
## Prevalence 0.2855 0.1930 0.1747 0.1631 0.1837
## Detection Rate 0.2841 0.1918 0.1728 0.1624 0.1835
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9974 0.9959 0.9937 0.9971 0.9993
# Fit a gradient boosting model (verbose = FALSE silences the fitting log)
# and predict the validation set.
fitGbm <- train(classe ~ ., data = training, method = "gbm", verbose = FALSE)
predGbm <- predict(fitGbm, newdata = testing)
# confusionMatrix() takes the predictions as `data` and the true classes as
# `reference`. The original call had them swapped, which transposes the table
# and the per-class statistics.
# NOTE: the recorded output that follows was produced with the swapped call,
# so its "Prediction" rows are the true classes and its "Reference" columns
# are the predictions. The overall accuracy is the same either way.
confusionMatrix(data = predGbm, reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1638 25 3 8 0
## B 40 1063 31 4 1
## C 0 33 981 11 1
## D 0 2 34 920 8
## E 2 11 6 9 1054
##
## Overall Statistics
##
## Accuracy : 0.9611
## 95% CI : (0.9558, 0.9659)
## No Information Rate : 0.2855
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9508
##
## Mcnemar's Test P-Value : 1.171e-05
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9750 0.9374 0.9299 0.9664 0.9906
## Specificity 0.9914 0.9840 0.9907 0.9911 0.9942
## Pos Pred Value 0.9785 0.9333 0.9561 0.9544 0.9741
## Neg Pred Value 0.9900 0.9850 0.9848 0.9935 0.9979
## Prevalence 0.2855 0.1927 0.1793 0.1618 0.1808
## Detection Rate 0.2783 0.1806 0.1667 0.1563 0.1791
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9832 0.9607 0.9603 0.9787 0.9924
# Fit a linear discriminant analysis model and predict the validation set.
fitLda <- train(classe ~ ., data = training, method = "lda")
predLda <- predict(fitLda, newdata = testing)
# confusionMatrix() takes the predictions as `data` and the true classes as
# `reference`. The original call had them swapped, which transposes the table
# and the per-class statistics.
# NOTE: the recorded output that follows was produced with the swapped call,
# so its "Prediction" rows are the true classes and its "Reference" columns
# are the predictions. The overall accuracy is the same either way.
confusionMatrix(data = predLda, reference = testing$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1397 40 110 123 4
## B 170 738 128 47 56
## C 111 112 655 127 21
## D 59 36 121 720 28
## E 41 195 92 103 651
##
## Overall Statistics
##
## Accuracy : 0.7071
## 95% CI : (0.6952, 0.7187)
## No Information Rate : 0.3021
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6289
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.7857 0.6583 0.5922 0.6429 0.8566
## Specificity 0.9326 0.9158 0.9224 0.9488 0.9159
## Pos Pred Value 0.8345 0.6479 0.6384 0.7469 0.6017
## Neg Pred Value 0.9095 0.9193 0.9072 0.9187 0.9773
## Prevalence 0.3021 0.1905 0.1879 0.1903 0.1291
## Detection Rate 0.2374 0.1254 0.1113 0.1223 0.1106
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.8591 0.7871 0.7573 0.7958 0.8862
# Overall accuracy of a prediction vector against the validation labels.
# Returns the named "Accuracy" element of the confusion matrix summary.
holdout_accuracy <- function(pred) {
  cm <- confusionMatrix(pred, testing$classe)
  cm$overall["Accuracy"]
}

# Print the validation accuracy of each fitted model for comparison.
print(paste0("RPART accuracy = ", holdout_accuracy(predRpart)))
## [1] "RPART accuracy = 0.489549702633815"
print(paste0("RF accuracy = ", holdout_accuracy(predRf)))
## [1] "RF accuracy = 0.994732370433305"
print(paste0("GBM accuracy = ", holdout_accuracy(predGbm)))
## [1] "GBM accuracy = 0.961087510620221"
print(paste0("LDA accuracy = ", holdout_accuracy(predLda)))
## [1] "LDA accuracy = 0.707051826677995"
Random forest has the highest accuracy, so we use it to predict the test data.
# Apply the random forest to the 20 test cases and show the predicted classes.
testRf <- predict(fitRf, newdata = testdata)
print(testRf)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E