ML Assignment

Downloading the training and testing data and cleaning the data

# Source locations of the training and testing data sets.
url_train <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
url_test <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Empty strings, literal "NA" and "#DIV/0!" entries are all read as NA.
training <- read.csv(file = url_train, na.strings = c("", "NA", "#DIV/0!"))
testdata <- read.csv(file = url_test, na.strings = c("", "NA", "#DIV/0!"))

# Rows and columns of the training data.
dim(training)

## [1] 19622   160

# Rows and columns of the testing data.
dim(testdata)

## [1]  20 160

# Count the missing values in every column and tabulate how many columns
# share each count; this is the basis for removing the columns with NAs.
table(colSums(is.na(training)))

## 
##     0 19216 19217 19218 19220 19221 19225 19226 19227 19248 19293 19294 19296 
##    60    67     1     1     1     4     1     4     2     2     1     1     2 
## 19299 19300 19301 19622 
##     1     4     2     6

# Keep only the names of the columns that have no missing values at all.
colselect <- names(which(colSums(is.na(training)) == 0))
colselect

##  [1] "X"                    "user_name"            "raw_timestamp_part_1"
##  [4] "raw_timestamp_part_2" "cvtd_timestamp"       "new_window"          
##  [7] "num_window"           "roll_belt"            "pitch_belt"          
## [10] "yaw_belt"             "total_accel_belt"     "gyros_belt_x"        
## [13] "gyros_belt_y"         "gyros_belt_z"         "accel_belt_x"        
## [16] "accel_belt_y"         "accel_belt_z"         "magnet_belt_x"       
## [19] "magnet_belt_y"        "magnet_belt_z"        "roll_arm"            
## [22] "pitch_arm"            "yaw_arm"              "total_accel_arm"     
## [25] "gyros_arm_x"          "gyros_arm_y"          "gyros_arm_z"         
## [28] "accel_arm_x"          "accel_arm_y"          "accel_arm_z"         
## [31] "magnet_arm_x"         "magnet_arm_y"         "magnet_arm_z"        
## [34] "roll_dumbbell"        "pitch_dumbbell"       "yaw_dumbbell"        
## [37] "total_accel_dumbbell" "gyros_dumbbell_x"     "gyros_dumbbell_y"    
## [40] "gyros_dumbbell_z"     "accel_dumbbell_x"     "accel_dumbbell_y"    
## [43] "accel_dumbbell_z"     "magnet_dumbbell_x"    "magnet_dumbbell_y"   
## [46] "magnet_dumbbell_z"    "roll_forearm"         "pitch_forearm"       
## [49] "yaw_forearm"          "total_accel_forearm"  "gyros_forearm_x"     
## [52] "gyros_forearm_y"      "gyros_forearm_z"      "accel_forearm_x"     
## [55] "accel_forearm_y"      "accel_forearm_z"      "magnet_forearm_x"    
## [58] "magnet_forearm_y"     "magnet_forearm_z"     "classe"

# The first 7 columns (row index, user name, timestamps and window markers)
# do not relate to the exercise measurements, so they are dropped.
# Negative indexing is used instead of `8:length(colselect)` because that
# range counts backwards when fewer than 8 names remain.
colselect <- colselect[-seq_len(7)]
training <- training[colselect]

Create training and test sets

70% for training and 30% for validation

# Row indices of the 70% of observations used for model fitting.
inTrain <- createDataPartition(y = training$classe, p = 0.7, list = FALSE)

# The validation rows must be taken before `training` is overwritten.
testing <- training[-inTrain, ]
training <- training[inTrain, ]


# Size of the fitting partition.
dim(training)

## [1] 13737    53

# Size of the validation partition.
dim(testing)

## [1] 5885   53

# Treat the outcome as a categorical variable in both partitions.
training[["classe"]] <- factor(training[["classe"]])
testing[["classe"]] <- factor(testing[["classe"]])

# Look for columns flagged as having near-zero variance.
nzv <- nearZeroVar(training)
nzv

## integer(0)

We use different models to predict and choose the one with the highest accuracy.

### Predicting with trees

# Fit a classification tree on all predictors and show the resulting splits.
fitRpart <- train(
  classe ~ .,
  data = training,
  method = "rpart"
)
print(fitRpart[["finalModel"]])

## n= 13737 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 13737 9831 A (0.28 0.19 0.17 0.16 0.18)  
##    2) roll_belt< 130.5 12577 8681 A (0.31 0.21 0.19 0.18 0.11)  
##      4) pitch_forearm< -33.95 1107    9 A (0.99 0.0081 0 0 0) *
##      5) pitch_forearm>=-33.95 11470 8672 A (0.24 0.23 0.21 0.2 0.12)  
##       10) magnet_dumbbell_y< 439.5 9738 6992 A (0.28 0.18 0.24 0.19 0.11)  
##         20) roll_forearm< 123.5 6030 3573 A (0.41 0.18 0.18 0.17 0.062) *
##         21) roll_forearm>=123.5 3708 2471 C (0.078 0.18 0.33 0.23 0.18) *
##       11) magnet_dumbbell_y>=439.5 1732  831 B (0.03 0.52 0.041 0.22 0.19) *
##    3) roll_belt>=130.5 1160   10 E (0.0086 0 0 0 0.99) *

# Score the validation partition with the tree model.
predRpart <- predict(fitRpart, newdata = testing)

# Both factors must share the same levels for the comparison below.
levels(testing$classe)

## [1] "A" "B" "C" "D" "E"

levels(predRpart)

## [1] "A" "B" "C" "D" "E"

# confusionMatrix() expects the predictions first (`data`) and the true
# classes second (`reference`). The call previously had them reversed, which
# swaps the Prediction/Reference axes and the per-class statistics (overall
# accuracy is unaffected). The output shown below was generated with the
# reversed order and should be regenerated.
confusionMatrix(data = predRpart, reference = testing$classe)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1525   29  116    0    4
##          B  484  385  270    0    0
##          C  499   37  490    0    0
##          D  423  187  354    0    0
##          E  153  159  289    0  481
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4895          
##                  95% CI : (0.4767, 0.5024)
##     No Information Rate : 0.524           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3324          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.4945  0.48306  0.32258       NA  0.99175
## Specificity            0.9468  0.85181  0.87723   0.8362  0.88870
## Pos Pred Value         0.9110  0.33802  0.47758       NA  0.44455
## Neg Pred Value         0.6298  0.91319  0.78823       NA  0.99917
## Prevalence             0.5240  0.13543  0.25811   0.0000  0.08241
## Detection Rate         0.2591  0.06542  0.08326   0.0000  0.08173
## Detection Prevalence   0.2845  0.19354  0.17434   0.1638  0.18386
## Balanced Accuracy      0.7206  0.66743  0.59991       NA  0.94023

#fancyRpartPlot(fitRpart$finalModel)

Random forests

# Fit a random forest on all predictors and score the validation partition.
fitRf <- train(classe ~ ., data = training, method = "rf")
predRf <- predict(fitRf, newdata = testing)
# confusionMatrix() expects the predictions first (`data`) and the true
# classes second (`reference`). The call previously had them reversed, which
# swaps the Prediction/Reference axes and the per-class statistics (overall
# accuracy is unaffected). The output shown below was generated with the
# reversed order and should be regenerated.
confusionMatrix(data = predRf, reference = testing$classe)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1672    1    1    0    0
##          B    8 1129    2    0    0
##          C    0    6 1017    3    0
##          D    0    0    7  956    1
##          E    0    0    1    1 1080
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9947          
##                  95% CI : (0.9925, 0.9964)
##     No Information Rate : 0.2855          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9933          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9952   0.9938   0.9893   0.9958   0.9991
## Specificity            0.9995   0.9979   0.9981   0.9984   0.9996
## Pos Pred Value         0.9988   0.9912   0.9912   0.9917   0.9982
## Neg Pred Value         0.9981   0.9985   0.9977   0.9992   0.9998
## Prevalence             0.2855   0.1930   0.1747   0.1631   0.1837
## Detection Rate         0.2841   0.1918   0.1728   0.1624   0.1835
## Detection Prevalence   0.2845   0.1935   0.1743   0.1638   0.1839
## Balanced Accuracy      0.9974   0.9959   0.9937   0.9971   0.9993

Boosted trees

# Fit boosted trees on all predictors (training log suppressed) and score the
# validation partition.
fitGbm <- train(classe ~ ., data = training, method = "gbm", verbose = FALSE)
predGbm <- predict(fitGbm, newdata = testing)
# confusionMatrix() expects the predictions first (`data`) and the true
# classes second (`reference`). The call previously had them reversed, which
# swaps the Prediction/Reference axes and the per-class statistics (overall
# accuracy is unaffected). The output shown below was generated with the
# reversed order and should be regenerated.
confusionMatrix(data = predGbm, reference = testing$classe)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1638   25    3    8    0
##          B   40 1063   31    4    1
##          C    0   33  981   11    1
##          D    0    2   34  920    8
##          E    2   11    6    9 1054
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9611          
##                  95% CI : (0.9558, 0.9659)
##     No Information Rate : 0.2855          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9508          
##                                           
##  Mcnemar's Test P-Value : 1.171e-05       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9750   0.9374   0.9299   0.9664   0.9906
## Specificity            0.9914   0.9840   0.9907   0.9911   0.9942
## Pos Pred Value         0.9785   0.9333   0.9561   0.9544   0.9741
## Neg Pred Value         0.9900   0.9850   0.9848   0.9935   0.9979
## Prevalence             0.2855   0.1927   0.1793   0.1618   0.1808
## Detection Rate         0.2783   0.1806   0.1667   0.1563   0.1791
## Detection Prevalence   0.2845   0.1935   0.1743   0.1638   0.1839
## Balanced Accuracy      0.9832   0.9607   0.9603   0.9787   0.9924

Linear Discriminant Analysis (lda)

# Fit a linear discriminant analysis model on all predictors and score the
# validation partition.
fitLda <- train(classe ~ ., data = training, method = "lda")
predLda <- predict(fitLda, newdata = testing)
# confusionMatrix() expects the predictions first (`data`) and the true
# classes second (`reference`). The call previously had them reversed, which
# swaps the Prediction/Reference axes and the per-class statistics (overall
# accuracy is unaffected). The output shown below was generated with the
# reversed order and should be regenerated.
confusionMatrix(data = predLda, reference = testing$classe)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1397   40  110  123    4
##          B  170  738  128   47   56
##          C  111  112  655  127   21
##          D   59   36  121  720   28
##          E   41  195   92  103  651
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7071          
##                  95% CI : (0.6952, 0.7187)
##     No Information Rate : 0.3021          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6289          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.7857   0.6583   0.5922   0.6429   0.8566
## Specificity            0.9326   0.9158   0.9224   0.9488   0.9159
## Pos Pred Value         0.8345   0.6479   0.6384   0.7469   0.6017
## Neg Pred Value         0.9095   0.9193   0.9072   0.9187   0.9773
## Prevalence             0.3021   0.1905   0.1879   0.1903   0.1291
## Detection Rate         0.2374   0.1254   0.1113   0.1223   0.1106
## Detection Prevalence   0.2845   0.1935   0.1743   0.1638   0.1839
## Balanced Accuracy      0.8591   0.7871   0.7573   0.7958   0.8862

Out of Sample error

Accuracy

# Validation accuracy of the classification tree.
print(paste0(
  "RPART accuracy = ",
  confusionMatrix(predRpart, testing$classe)$overall[["Accuracy"]]
))

## [1] "RPART accuracy = 0.489549702633815"

# Validation accuracy of the random forest.
print(paste0(
  "RF accuracy = ",
  confusionMatrix(predRf, testing$classe)$overall[["Accuracy"]]
))

## [1] "RF accuracy = 0.994732370433305"

# Validation accuracy of the boosted trees.
print(paste0(
  "GBM accuracy = ",
  confusionMatrix(predGbm, testing$classe)$overall[["Accuracy"]]
))

## [1] "GBM accuracy = 0.961087510620221"

# Validation accuracy of the linear discriminant analysis.
print(paste0(
  "LDA accuracy = ",
  confusionMatrix(predLda, testing$classe)$overall[["Accuracy"]]
))

## [1] "LDA accuracy = 0.707051826677995"

Random forest has the highest accuracy, so we use it to predict the test data.

# Apply the random forest model to the 20 downloaded test cases.
testRf <- predict(fitRf, newdata = testdata)
testRf

##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E

ML Assignment

Demba Coulibaly

2025-01-14

Introduction

Problem

Objective