Loading the data and library

# Loading Library
library(caret)
# loading the data
training <- read.csv("pml-training.csv", header = T, na.strings = c("NA", "", "#DIV/0!")) # treat blanks and "#DIV/0!" entries as NA
testing <- read.csv("pml-testing.csv", header = T, na.strings = c("NA", "", "#DIV/0!"))

dim(training)
## [1] 19622   160
dim(testing)
## [1]  20 160
#names(training)
summary(training$classe)
##    A    B    C    D    E 
## 5580 3797 3422 3216 3607
unique(training$user_name)
## [1] carlitos pedro    adelmo   charles  eurico   jeremy  
## Levels: adelmo carlitos charles eurico jeremy pedro
summary(training$user_name)
##   adelmo carlitos  charles   eurico   jeremy    pedro 
##     3892     3112     3536     3070     3402     2610

Cleaning the Data

Removing the missing values

I first remove every column that contains missing values, so that only complete columns are used.

# Removing the first column, which is just the row index
training <- training[, -1]
testing <- testing[, -1]
dim(training)
## [1] 19622   159
# Remove the columns with missing values
training1 <- training[, colSums(is.na(training)) == 0] 
dim(training1)
## [1] 19622    59
testing1 <- testing[, colSums(is.na(testing)) == 0]
dim(testing1)
## [1] 20 59
# Checking the remaining variables
#names(training1)
str(training1)
## 'data.frame':    19622 obs. of  59 variables:
##  $ user_name           : Factor w/ 6 levels "adelmo","carlitos",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ raw_timestamp_part_1: int  1323084231 1323084231 1323084231 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 1323084232 ...
##  $ raw_timestamp_part_2: int  788290 808298 820366 120339 196328 304277 368296 440390 484323 484434 ...
##  $ cvtd_timestamp      : Factor w/ 20 levels "2/12/2011 13:32",..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ new_window          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ num_window          : int  11 11 11 12 12 12 12 12 12 12 ...
##  $ roll_belt           : num  1.41 1.41 1.42 1.48 1.48 1.45 1.42 1.42 1.43 1.45 ...
##  $ pitch_belt          : num  8.07 8.07 8.07 8.05 8.07 8.06 8.09 8.13 8.16 8.17 ...
##  $ yaw_belt            : num  -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 -94.4 ...
##  $ total_accel_belt    : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ gyros_belt_x        : num  0 0.02 0 0.02 0.02 0.02 0.02 0.02 0.02 0.03 ...
##  $ gyros_belt_y        : num  0 0 0 0 0.02 0 0 0 0 0 ...
##  $ gyros_belt_z        : num  -0.02 -0.02 -0.02 -0.03 -0.02 -0.02 -0.02 -0.02 -0.02 0 ...
##  $ accel_belt_x        : int  -21 -22 -20 -22 -21 -21 -22 -22 -20 -21 ...
##  $ accel_belt_y        : int  4 4 5 3 2 4 3 4 2 4 ...
##  $ accel_belt_z        : int  22 22 23 21 24 21 21 21 24 22 ...
##  $ magnet_belt_x       : int  -3 -7 -2 -6 -6 0 -4 -2 1 -3 ...
##  $ magnet_belt_y       : int  599 608 600 604 600 603 599 603 602 609 ...
##  $ magnet_belt_z       : int  -313 -311 -305 -310 -302 -312 -311 -313 -312 -308 ...
##  $ roll_arm            : num  -128 -128 -128 -128 -128 -128 -128 -128 -128 -128 ...
##  $ pitch_arm           : num  22.5 22.5 22.5 22.1 22.1 22 21.9 21.8 21.7 21.6 ...
##  $ yaw_arm             : num  -161 -161 -161 -161 -161 -161 -161 -161 -161 -161 ...
##  $ total_accel_arm     : int  34 34 34 34 34 34 34 34 34 34 ...
##  $ gyros_arm_x         : num  0 0.02 0.02 0.02 0 0.02 0 0.02 0.02 0.02 ...
##  $ gyros_arm_y         : num  0 -0.02 -0.02 -0.03 -0.03 -0.03 -0.03 -0.02 -0.03 -0.03 ...
##  $ gyros_arm_z         : num  -0.02 -0.02 -0.02 0.02 0 0 0 0 -0.02 -0.02 ...
##  $ accel_arm_x         : int  -288 -290 -289 -289 -289 -289 -289 -289 -288 -288 ...
##  $ accel_arm_y         : int  109 110 110 111 111 111 111 111 109 110 ...
##  $ accel_arm_z         : int  -123 -125 -126 -123 -123 -122 -125 -124 -122 -124 ...
##  $ magnet_arm_x        : int  -368 -369 -368 -372 -374 -369 -373 -372 -369 -376 ...
##  $ magnet_arm_y        : int  337 337 344 344 337 342 336 338 341 334 ...
##  $ magnet_arm_z        : int  516 513 513 512 506 513 509 510 518 516 ...
##  $ roll_dumbbell       : num  13.1 13.1 12.9 13.4 13.4 ...
##  $ pitch_dumbbell      : num  -70.5 -70.6 -70.3 -70.4 -70.4 ...
##  $ yaw_dumbbell        : num  -84.9 -84.7 -85.1 -84.9 -84.9 ...
##  $ total_accel_dumbbell: int  37 37 37 37 37 37 37 37 37 37 ...
##  $ gyros_dumbbell_x    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gyros_dumbbell_y    : num  -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 -0.02 ...
##  $ gyros_dumbbell_z    : num  0 0 0 -0.02 0 0 0 0 0 0 ...
##  $ accel_dumbbell_x    : int  -234 -233 -232 -232 -233 -234 -232 -234 -232 -235 ...
##  $ accel_dumbbell_y    : int  47 47 46 48 48 48 47 46 47 48 ...
##  $ accel_dumbbell_z    : int  -271 -269 -270 -269 -270 -269 -270 -272 -269 -270 ...
##  $ magnet_dumbbell_x   : int  -559 -555 -561 -552 -554 -558 -551 -555 -549 -558 ...
##  $ magnet_dumbbell_y   : int  293 296 298 303 292 294 295 300 292 291 ...
##  $ magnet_dumbbell_z   : num  -65 -64 -63 -60 -68 -66 -70 -74 -65 -69 ...
##  $ roll_forearm        : num  28.4 28.3 28.3 28.1 28 27.9 27.9 27.8 27.7 27.7 ...
##  $ pitch_forearm       : num  -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.9 -63.8 -63.8 -63.8 ...
##  $ yaw_forearm         : num  -153 -153 -152 -152 -152 -152 -152 -152 -152 -152 ...
##  $ total_accel_forearm : int  36 36 36 36 36 36 36 36 36 36 ...
##  $ gyros_forearm_x     : num  0.03 0.02 0.03 0.02 0.02 0.02 0.02 0.02 0.03 0.02 ...
##  $ gyros_forearm_y     : num  0 0 -0.02 -0.02 0 -0.02 0 -0.02 0 0 ...
##  $ gyros_forearm_z     : num  -0.02 -0.02 0 0 -0.02 -0.03 -0.02 0 -0.02 -0.02 ...
##  $ accel_forearm_x     : int  192 192 196 189 189 193 195 193 193 190 ...
##  $ accel_forearm_y     : int  203 203 204 206 206 203 205 205 204 205 ...
##  $ accel_forearm_z     : int  -215 -216 -213 -214 -214 -215 -215 -213 -214 -215 ...
##  $ magnet_forearm_x    : int  -17 -18 -18 -16 -17 -9 -18 -9 -16 -22 ...
##  $ magnet_forearm_y    : num  654 661 658 658 655 660 659 660 653 656 ...
##  $ magnet_forearm_z    : num  476 473 469 469 473 478 470 474 476 473 ...
##  $ classe              : Factor w/ 5 levels "A","B","C","D",..: 1 1 1 1 1 1 1 1 1 1 ...
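
As a sanity check, the cleaned training and testing sets should keep the same predictor columns. A minimal sketch (assuming the standard pml files, where the last columns are classe and problem_id respectively):

# Columns kept in one set but not the other; only the outcome/identifier
# columns are expected to differ ("classe" vs. "problem_id")
setdiff(names(training1), names(testing1))
setdiff(names(testing1), names(training1))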

Removing the near-zero-variance variables

There are still some variables with near-zero variance, i.e. their values hardly change across observations. These variables carry very little information, so we also exclude them from the models.

nzv <- nearZeroVar(training1, saveMetrics = T)
str(nzv)
## 'data.frame':    59 obs. of  4 variables:
##  $ freqRatio    : num  1.1 1 1 1 47.3 ...
##  $ percentUnique: num  0.0306 4.2656 85.5315 0.1019 0.0102 ...
##  $ zeroVar      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ nzv          : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
training2 <- training1[, nzv$nzv == F] # keep only the variables that are not near-zero-variance

nzv2 <- nearZeroVar(testing1, saveMetrics = T)
testing2 <- testing1[, nzv2$nzv == F] # keep only the variables that are not near-zero-variance
dim(testing2)
## [1] 20 58
dim(training2)
## [1] 19622    58
#str(training2)
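
Note that running nearZeroVar separately on the test set could, in principle, retain a different set of columns than the training set. A safer alternative (a sketch of what one could do instead, assuming the test set's identifier column is named problem_id) is to align the test set with the training columns:

# Alternative: keep in the test set exactly the predictors kept in training2
keep <- intersect(names(training2), names(testing1))
testing2_alt <- testing1[, c(keep, "problem_id")]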

Data splitting

To build the machine-learning models and test them, I split the data into a training set (70%) and a test set (30%).

set.seed(3456)
inTrain <- createDataPartition(training2$classe, p = .7, list = F)
myTrain <- training2[inTrain, ]
myTest <-  training2[-inTrain, ]
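
createDataPartition performs a stratified split on classe, so the class proportions should be preserved in both subsets. A quick check (not part of the original run):

# Class proportions should be nearly identical in both subsets
round(prop.table(table(myTrain$classe)), 3)
round(prop.table(table(myTest$classe)), 3)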

Machine Learning: Prediction Algorithms

Here I use three techniques covered in the class: “Prediction with Decision Trees”, “Prediction with Random Forests”, and “Prediction with Generalized Boosted Regression”.

Prediction with Decision Trees

set.seed(123456)
library(rattle)
library(rpart)
mod_tree <- train(classe~., method = "rpart", data = myTrain)
pred_tree <- predict(mod_tree, newdata = myTest)
#print(mod_tree$finalModel)
fancyRpartPlot(mod_tree$finalModel)

# Accuracy of the model
(Accuracy_tree <- confusionMatrix(pred_tree, myTest$classe)$overall["Accuracy"])
##  Accuracy 
## 0.4898895
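
The single tree's accuracy is low partly because train's default tuning grid for rpart is very coarse. One way to probe this (a sketch, not run in the original analysis) is to widen the grid of complexity-parameter values and cross-validate:

# Tune the complexity parameter over a finer grid with 5-fold CV
mod_tree_cv <- train(classe ~ ., method = "rpart", data = myTrain,
                     tuneLength = 10,
                     trControl = trainControl(method = "cv", number = 5))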

Prediction with Random Forests

set.seed(123456)
library(randomForest)

mod_rf <- randomForest(classe~., data = myTrain)
pred_rf <- predict(mod_rf, myTest)
(method_rf <- confusionMatrix(pred_rf, myTest$classe))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1674    0    0    0    0
##          B    0 1139    0    0    0
##          C    0    0 1026    1    0
##          D    0    0    0  963    0
##          E    0    0    0    0 1082
## 
## Overall Statistics
##                                      
##                Accuracy : 0.9998     
##                  95% CI : (0.9991, 1)
##     No Information Rate : 0.2845     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 0.9998     
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   1.0000   1.0000   0.9990   1.0000
## Specificity            1.0000   1.0000   0.9998   1.0000   1.0000
## Pos Pred Value         1.0000   1.0000   0.9990   1.0000   1.0000
## Neg Pred Value         1.0000   1.0000   1.0000   0.9998   1.0000
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2845   0.1935   0.1743   0.1636   0.1839
## Detection Prevalence   0.2845   0.1935   0.1745   0.1636   0.1839
## Balanced Accuracy      1.0000   1.0000   0.9999   0.9995   1.0000
# Accuracy of the method
method_rf$overall["Accuracy"]
##  Accuracy 
## 0.9998301
mod_rf
## 
## Call:
##  randomForest(formula = classe ~ ., data = myTrain) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 0.09%
## Confusion matrix:
##      A    B    C    D    E  class.error
## A 3906    0    0    0    0 0.0000000000
## B    2 2656    0    0    0 0.0007524454
## C    0    5 2389    2    0 0.0029215359
## D    0    0    1 2250    1 0.0008880995
## E    0    0    0    2 2523 0.0007920792
# Plot the OOB error rate as a function of the number of trees
plot(mod_rf)
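
Beyond the error plot, the fitted forest also records variable importance (mean decrease in the Gini index), which can be inspected directly; a quick look, not shown in the original run:

# Plot the ten most important predictors by mean decrease in Gini
varImpPlot(mod_rf, n.var = 10, main = "Random forest variable importance")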

Prediction with Generalized Boosted Regression

set.seed(12345)
fitControl <- trainControl(method = "repeatedcv",
                           number = 5,
                           repeats = 1)
mod_GBR <- train(classe ~., data = myTrain, method = "gbm", trControl = fitControl, verbose = F)
pred_GBR <- predict(mod_GBR, myTest)
confusionMatrix(pred_GBR, myTest$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1674    0    0    0    0
##          B    0 1136    0    0    0
##          C    0    0 1018    1    0
##          D    0    3    8  963    1
##          E    0    0    0    0 1081
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9978          
##                  95% CI : (0.9962, 0.9988)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9972          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            1.0000   0.9974   0.9922   0.9990   0.9991
## Specificity            1.0000   1.0000   0.9998   0.9976   1.0000
## Pos Pred Value         1.0000   1.0000   0.9990   0.9877   1.0000
## Neg Pred Value         1.0000   0.9994   0.9984   0.9998   0.9998
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2845   0.1930   0.1730   0.1636   0.1837
## Detection Prevalence   0.2845   0.1930   0.1732   0.1657   0.1837
## Balanced Accuracy      1.0000   0.9987   0.9960   0.9983   0.9995
plot(mod_GBR)
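
For reference, the boosting parameters selected by the cross-validation can be inspected (not shown in the original run):

# Tuning parameters chosen by the 5-fold repeated cross-validation
mod_GBR$bestTune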

Discussion

From the results above, the Decision Trees model has a very low accuracy (0.4899), while both the Random Forests and the Generalized Boosted models give a very high accuracy: 0.9998 and 0.9978, respectively. Either of these two models is suitable for the test data; the predictions below are generated with the Generalized Boosted model.
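
The corresponding expected out-of-sample error is simply one minus the accuracy on the held-out 30% test set; a quick calculation using the objects above:

# Estimated out-of-sample error rates on the held-out 30%
1 - as.numeric(Accuracy_tree)                  # decision tree
1 - as.numeric(method_rf$overall["Accuracy"])  # random forest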

Predicting results on the Test Data

(Predicting_test_data_GBR <- predict(mod_GBR, newdata = testing2))
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
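
For submission, each of the 20 predictions is typically written to its own text file. A sketch of the helper commonly used for this assignment (the problem_id_<i>.txt naming convention is assumed from the course instructions):

# Write one file per test case, named problem_id_<i>.txt (assumed convention)
pml_write_files <- function(x) {
  for (i in seq_along(x)) {
    write.table(x[i], file = paste0("problem_id_", i, ".txt"),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}
pml_write_files(Predicting_test_data_GBR)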