# Loading Library
library(caret)
# loading the data
training <- read.csv("pml-training.csv", header = T, na.strings = c("NA", "", "#DIV/0!")) # treat blanks and division errors as NA
testing <- read.csv("pml-testing.csv", header = T, na.strings = c("NA", "", "#DIV/0!"))
dim(training)
## [1] 19622 160
dim(testing)
## [1] 20 160
#names(training)
summary(training$classe)
## A B C D E
## 5580 3797 3422 3216 3607
unique(training$user_name)
## [1] carlitos pedro adelmo charles eurico jeremy
## Levels: adelmo carlitos charles eurico jeremy pedro
summary(training$user_name)
## adelmo carlitos charles eurico jeremy pedro
## 3892 3112 3536 3070 3402 2610
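Before cleaning the data, it is worth checking how the missing values are distributed across columns; a quick diagnostic (a suggested check, not part of the original output) shows that columns tend to be either complete or almost entirely NA:
# Fraction of NA values per column
na_frac <- colMeans(is.na(training))
summary(na_frac)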
I first remove the columns that contain any missing values, keeping only the complete columns.
# Remove the first column, which is just the row index
training <- training[, -1]
testing <- testing[, -1]
dim(training)
## [1] 19622 159
# Remove the columns with missing values
training1 <- training[, colSums(is.na(training)) == 0]
dim(training1)
## [1] 19622 59
testing1 <- testing[, colSums(is.na(testing)) == 0]
dim(testing1)
## [1] 20 59
# Checking the remaining variables
#names(training1)
str(training1)
## 'data.frame': 19622 obs. of 59 variables:
## $ user_name : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ raw_timestamp_part_1: int 1323084231 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
## $ raw_timestamp_part_2: int 788290 808298 820366 120339 196328 304277 368296 440390 484323 484434 ...
## $ cvtd_timestamp : Factor w/ 20 levels "2/12/2011 13:32",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ new_window : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ num_window : int 11 11 11 12 12 12 12 12 12 12 ...
## $ roll_belt : num 1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
## $ pitch_belt : num 8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
## $ yaw_belt : num -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
## $ total_accel_belt : int 3 3 3 3 3 3 3 3 3 3 ...
## $ gyros_belt_x : num 0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
## $ gyros_belt_y : num 0 0 0 0 0.02 0 0 0 0 0 ...
## $ gyros_belt_z : num -0.02 -0.02 -0.02 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0 ...
## $ accel_belt_x : int -21 -22 -20 -22 -21 -21 -22 -22 -20 -21 ...
## $ accel_belt_y : int 4 4 5 3 2 4 3 4 2 4 ...
## $ accel_belt_z : int 22 22 23 21 24 21 21 21 24 22 ...
## $ magnet_belt_x : int -3 -7 -2 -6 -6 0 -4 -2 1 -3 ...
## $ magnet_belt_y : int 599 608 600 604 600 603 599 603 602 609 ...
## $ magnet_belt_z : int -313 -311 -305 -310 -302 -312 -311 -313 -312 -308 ...
## $ roll_arm : num -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 ...
## $ pitch_arm : num 22.5 22.5 22.5 22.1 22.1 22 21.9 21.8 21.7 21.6 ...
## $ yaw_arm : num -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
## $ total_accel_arm : int 34 34 34 34 34 34 34 34 34 34 ...
## $ gyros_arm_x : num 0 0.02 0.02 0.02 0 0.02 0 0.02 0.02 0.02 ...
## $ gyros_arm_y : num 0 -0.02 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 -0.03 -0.03 ...
## $ gyros_arm_z : num -0.02 -0.02 -0.02 0.02 0 0 0 0 -0.02 -0.02 ...
## $ accel_arm_x : int -288 -290 -289 -289 -289 -289 -289 -289 -288 -288 ...
## $ accel_arm_y : int 109 110 110 111 111 111 111 111 109 110 ...
## $ accel_arm_z : int -123 -125 -126 -123 -123 -122 -125 -124 -122 -124 ...
## $ magnet_arm_x : int -368 -369 -368 -372 -374 -369 -373 -372 -369 -376 ...
## $ magnet_arm_y : int 337 337 344 344 337 342 336 338 341 334 ...
## $ magnet_arm_z : int 516 513 513 512 506 513 509 510 518 516 ...
## $ roll_dumbbell : num 13.1 13.1 12.9 13.4 13.4 ...
## $ pitch_dumbbell : num -70.5 -70.6 -70.3 -70.4 -70.4 ...
## $ yaw_dumbbell : num -84.9 -84.7 -85.1 -84.9 -84.9 ...
## $ total_accel_dumbbell: int 37 37 37 37 37 37 37 37 37 37 ...
## $ gyros_dumbbell_x : num 0 0 0 0 0 0 0 0 0 0 ...
## $ gyros_dumbbell_y : num -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 ...
## $ gyros_dumbbell_z : num 0 0 0 -0.02 0 0 0 0 0 0 ...
## $ accel_dumbbell_x : int -234 -233 -232 -232 -233 -234 -232 -234 -232 -235 ...
## $ accel_dumbbell_y : int 47 47 46 48 48 48 47 46 47 48 ...
## $ accel_dumbbell_z : int -271 -269 -270 -269 -270 -269 -270 -272 -269 -270 ...
## $ magnet_dumbbell_x : int -559 -555 -561 -552 -554 -558 -551 -555 -549 -558 ...
## $ magnet_dumbbell_y : int 293 296 298 303 292 294 295 300 292 291 ...
## $ magnet_dumbbell_z : num -65 -64 -63 -60 -68 -66 -70 -74 -65 -69 ...
## $ roll_forearm : num 28.4 28.3 28.3 28.1 28 27.9 27.9 27.8 27.7 27.7 ...
## $ pitch_forearm : num -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.8 -63.8 -63.8 ...
## $ yaw_forearm : num -153 -153 -152 -152 -152 -152 -152 -152 -152 -152 ...
## $ total_accel_forearm : int 36 36 36 36 36 36 36 36 36 36 ...
## $ gyros_forearm_x : num 0.03 0.02 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.02 ...
## $ gyros_forearm_y : num 0 0 -0.02 -0.02 0 -0.02 0 -0.02 0 0 ...
## $ gyros_forearm_z : num -0.02 -0.02 0 0 -0.02 -0.03 -0.02 0 -0.02 -0.02 ...
## $ accel_forearm_x : int 192 192 196 189 189 193 195 193 193 190 ...
## $ accel_forearm_y : int 203 203 204 206 206 203 205 205 204 205 ...
## $ accel_forearm_z : int -215 -216 -213 -214 -214 -215 -215 -213 -214 -215 ...
## $ magnet_forearm_x : int -17 -18 -18 -16 -17 -9 -18 -9 -16 -22 ...
## $ magnet_forearm_y : num 654 661 658 658 655 660 659 660 653 656 ...
## $ magnet_forearm_z : num 476 473 469 469 473 478 470 474 476 473 ...
## $ classe : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
There are still several variables with near-zero variance, i.e. values that are almost constant across observations. These carry little information and should also be excluded before fitting the models.
nzv <- nearZeroVar(training1, saveMetrics = T)
str(nzv)
## 'data.frame': 59 obs. of 4 variables:
## $ freqRatio : num 1.1 1 1 1 47.3 ...
## $ percentUnique: num 0.0306 4.2656 85.5315 0.1019 0.0102 ...
## $ zeroVar : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ nzv : logi FALSE FALSE FALSE FALSE TRUE FALSE ...
training2 <- training1[, !nzv$nzv] # keep only the non-near-zero-variance variables
nzv2 <- nearZeroVar(testing1, saveMetrics = T)
testing2 <- testing1[, !nzv2$nzv] # apply the same criterion to the test set
dim(testing2)
## [1] 20 58
dim(training2)
## [1] 19622 58
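Note that running nearZeroVar separately on the test set could, in principle, flag a different set of columns than on the training set. A safer pattern (a sketch of an alternative, equivalent here) is to reuse the training-set filter on both sets so the retained predictors are guaranteed to line up:
# Alternative: apply the training-set NZV filter to both sets
# keep <- !nzv$nzv
# training2 <- training1[, keep]
# testing2 <- testing1[, keep]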
#str(training2)
# Sanity check: apart from the outcome column ("classe" in training,
# "problem_id" in testing), both sets should keep the same predictors
#setdiff(names(training2), names(testing2))
To train the models and estimate the out-of-sample error, I split the data into a training set (70%) and a validation set (30%).
set.seed(3456)
inTrain <- createDataPartition(training2$classe, p = .7, list = F)
myTrain <- training2[inTrain, ]
myTest <- training2[-inTrain, ]
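Because createDataPartition stratifies on classe, both subsets preserve the class proportions; this can be verified with a quick check (not part of the original output):
# Class proportions should be nearly identical in both subsets
round(prop.table(table(myTrain$classe)), 3)
round(prop.table(table(myTest$classe)), 3)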
Here I apply three techniques covered in the class: prediction with decision trees, prediction with random forests, and prediction with generalized boosted regression (GBM).
set.seed(123456)
library(rattle)
library(rpart)
mod_tree <- train(classe~., method = "rpart", data = myTrain)
pred_tree <- predict(mod_tree, newdata = myTest)
#print(mod_tree$finalModel)
fancyRpartPlot(mod_tree$finalModel)
# Accuracy of the model
(Accuracy_tree <- confusionMatrix(pred_tree, myTest$classe)$overall["Accuracy"])
## Accuracy
## 0.4898895
set.seed(123456)
library(randomForest)
mod_rf <- randomForest(classe~., data = myTrain)
pred_rf <- predict(mod_rf, myTest)
(method_rf <- confusionMatrix(pred_rf, myTest$classe))
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 0 0 0 0
## B 0 1139 0 0 0
## C 0 0 1026 1 0
## D 0 0 0 963 0
## E 0 0 0 0 1082
##
## Overall Statistics
##
## Accuracy : 0.9998
## 95% CI : (0.9991, 1)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9998
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 1.0000 1.0000 0.9990 1.0000
## Specificity 1.0000 1.0000 0.9998 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 0.9990 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000 0.9998 1.0000
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2845 0.1935 0.1743 0.1636 0.1839
## Detection Prevalence 0.2845 0.1935 0.1745 0.1636 0.1839
## Balanced Accuracy 1.0000 1.0000 0.9999 0.9995 1.0000
# Accuracy of the method
method_rf$overall["Accuracy"]
## Accuracy
## 0.9998301
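The expected out-of-sample error is simply one minus the validation accuracy:
# Estimated out-of-sample error rate for the random forest
1 - unname(method_rf$overall["Accuracy"]) # about 0.00017, i.e. 0.017%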
mod_rf
##
## Call:
## randomForest(formula = classe ~ ., data = myTrain)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 7
##
## OOB estimate of error rate: 0.09%
## Confusion matrix:
## A B C D E class.error
## A 3906 0 0 0 0 0.0000000000
## B 2 2656 0 0 0 0.0007524454
## C 0 5 2389 2 0 0.0029215359
## D 0 0 1 2250 1 0.0008880995
## E 0 0 0 2 2523 0.0007920792
# Plotting the fit
plot(mod_rf)
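It is also informative to see which predictors the forest relies on most; the randomForest package provides a built-in importance plot (a suggested addition, not in the original run):
# Top predictors ranked by mean decrease in Gini impurity
varImpPlot(mod_rf, n.var = 10, main = "Top 10 predictors (random forest)")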
set.seed(12345)
fitControl <- trainControl(method = "repeatedcv",
                           number = 5,
                           repeats = 1)
mod_GBR <- train(classe ~., data = myTrain, method = "gbm", trControl = fitControl, verbose = F)
pred_GBR <- predict(mod_GBR, myTest)
confusionMatrix(pred_GBR, myTest$classe)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1674 0 0 0 0
## B 0 1136 0 0 0
## C 0 0 1018 1 0
## D 0 3 8 963 1
## E 0 0 0 0 1081
##
## Overall Statistics
##
## Accuracy : 0.9978
## 95% CI : (0.9962, 0.9988)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9972
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 1.0000 0.9974 0.9922 0.9990 0.9991
## Specificity 1.0000 1.0000 0.9998 0.9976 1.0000
## Pos Pred Value 1.0000 1.0000 0.9990 0.9877 1.0000
## Neg Pred Value 1.0000 0.9994 0.9984 0.9998 0.9998
## Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Detection Rate 0.2845 0.1930 0.1730 0.1636 0.1837
## Detection Prevalence 0.2845 0.1930 0.1732 0.1657 0.1837
## Balanced Accuracy 1.0000 0.9987 0.9960 0.9983 0.9995
plot(mod_GBR)
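To compare the three models side by side, the validation accuracies can be collected into one vector (a convenience sketch, assuming the fitted objects above are still in the workspace):
acc <- c(decision_tree = unname(Accuracy_tree),
         random_forest = unname(method_rf$overall["Accuracy"]),
         gbm = unname(confusionMatrix(pred_GBR, myTest$classe)$overall["Accuracy"]))
round(acc, 4) # 0.4899, 0.9998, 0.9978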
From the results above, the decision tree model has a very low accuracy (0.4899), whereas both the random forest and the generalized boosted model give very high accuracy on the validation set: 0.9998 and 0.9978, respectively. The two top models are practically equivalent here; below, the boosted model is used to predict the 20 test cases.
(Predicting_test_data_GBR <- predict(mod_GBR, newdata = testing2))
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
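For the course submission, each of the 20 predictions is typically written to its own text file; a minimal helper along those lines (the file-name pattern is an assumption, adjust as needed):
# Write one prediction per file (file naming here is an assumption)
answers <- as.character(Predicting_test_data_GBR)
for (i in seq_along(answers)) {
  write.table(answers[i], file = paste0("problem_id_", i, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)
}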