Summary

This report presents a summary and analysis of the Weight Lifting Exercises dataset, as part of the Coursera Practical Machine Learning course.

The data come from the Weight Lifting Exercises dataset (download URLs below). It consists of sensor measurements taken while 6 healthy volunteers each performed 10 repetitions of the Unilateral Dumbbell Biceps Curl in 5 different ways, only one of which was correct (classe A; classes B to E are common mistakes). The objective of the analysis is to predict which way the curl was performed in each of 20 held-out test cases.

First we need to download and examine the data.

Downloads

# read the training and test sets, treating "NA" and "#DIV/0!" as missing
train <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv", na.strings = c("NA", "#DIV/0!"))

test <- read.csv("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv", na.strings = c("NA", "#DIV/0!"))
dim(train)
## [1] 19622   160
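
As a quick sanity check (an addition to the original write-up), the outcome variable classe and its five levels A to E can be inspected before any modelling:

# class distribution of the outcome (counts not reproduced here)
table(train$classe)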

Extract the sensor data

We need to limit the dataset to the sensor measurement variables only, and we exclude any variable containing missing data, since those summary columns are almost entirely empty.

# keep only the sensor measurement columns, plus column 160
# (classe in the training set, problem_id in the test set)
sensordata <- grep(pattern = "_belt|_arm|_dumbbell|_forearm", names(train))
data <- train[, c(sensordata, 160)]
datatest <- test[, c(sensordata, 160)]

# drop every column that contains missing values
nadata <- apply(data, 2, function(x) mean(is.na(x)))
naCols <- which(nadata > 0)
train1 <- data[, -naCols]

nadatatest <- apply(datatest, 2, function(x) mean(is.na(x)))
naCols <- which(nadatatest > 0)
test1 <- datatest[, -naCols]

dim(train1); dim(test1)
## [1] 19622    53
## [1] 20 53
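
A further check (not in the original analysis) confirms that no missing values remain and that the 52 predictor columns agree between the two sets; only the last column differs (classe vs. problem_id):

sum(is.na(train1))                                # expect 0
sum(is.na(test1))                                 # expect 0
identical(names(train1)[-53], names(test1)[-53])  # expect TRUE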

Loading necessary packages

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(corrplot)

Setting up training and validation sets

# partition the cleaned data: 60% for model building, 40% held out
# (inTest) to estimate out-of-sample error
set.seed(123)
inTrainIdx <- createDataPartition(train1$classe, p = 0.6, list = FALSE)
inTrain <- train1[inTrainIdx, ]
inTest <- train1[-inTrainIdx, ]
dim(inTrain); dim(inTest)
## [1] 11776    53
## [1] 7846   53
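
Because createDataPartition() samples within each level of classe, the split should preserve the class proportions; a quick check (an addition to the original):

# class proportions in the training partition
round(prop.table(table(inTrain$classe)), 3)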

Examine relationships between variables

corMat <- cor(inTrain[, -53])
corrplot(corMat, order = "FPC", method = "color", type = "lower",
         tl.cex = 0.7, tl.col = rgb(0, 0, 0))
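
To list any highly correlated predictors explicitly, caret's findCorrelation() can be applied to the same matrix (a supplementary check, not part of the original analysis):

# predictors with pairwise correlation above 0.9
highCor <- findCorrelation(corMat, cutoff = 0.90)
names(inTrain)[highCor]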

Tree approach

# fit a single classification tree
modtree <- train(classe ~ ., data = inTrain, method = "rpart")
## Loading required package: rpart
plot(modtree$finalModel)
text(modtree$finalModel, pretty = 1)

predtree <- predict(modtree, inTest)
confusionMatrix(predtree, inTest$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1356  239   42   68   20
##          B    5  264   34    8    6
##          C  441  263  907  397  269
##          D  422  752  385  813  510
##          E    8    0    0    0  637
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5069         
##                  95% CI : (0.4958, 0.518)
##     No Information Rate : 0.2845         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3865         
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.6075  0.17391   0.6630   0.6322  0.44175
## Specificity            0.9343  0.99162   0.7885   0.6846  0.99875
## Pos Pred Value         0.7861  0.83281   0.3983   0.2821  0.98760
## Neg Pred Value         0.8569  0.83344   0.9172   0.9047  0.88821
## Prevalence             0.2845  0.19347   0.1744   0.1639  0.18379
## Detection Rate         0.1728  0.03365   0.1156   0.1036  0.08119
## Detection Prevalence   0.2199  0.04040   0.2902   0.3673  0.08221
## Balanced Accuracy      0.7709  0.58277   0.7258   0.6584  0.72025

Overall accuracy of the single classification tree is poor at 0.51, against a no-information rate of 0.28. Tuning the tree's complexity parameter by cross-validation could help somewhat (see the sketch below), but we move to a random forest instead.
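
A minimal sketch of that tuning, assuming 5-fold cross-validation and letting caret search ten cp values via tuneLength (an addition, not part of the original analysis):

set.seed(123)
ctrl <- trainControl(method = "cv", number = 5)   # 5-fold cross-validation
modtree_cv <- train(classe ~ ., data = inTrain, method = "rpart",
                    trControl = ctrl, tuneLength = 10)
modtree_cv$bestTune   # selected complexity parameter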

Random forest approach

set.seed(321)
# grow 500 trees on all 52 predictors
modrf <- randomForest(classe ~ ., data = inTrain, ntree = 500)
modrf
## 
## Call:
##  randomForest(formula = classe ~ ., data = inTrain, ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 0.69%
## Confusion matrix:
##      A    B    C    D    E class.error
## A 3343    2    1    1    1 0.001493429
## B   10 2264    5    0    0 0.006581834
## C    1   19 2029    5    0 0.012171373
## D    1    0   24 1903    2 0.013989637
## E    0    0    2    7 2156 0.004157044
# variable importance (mean decrease in Gini)
varImpPlot(modrf, cex = 0.5)
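
The same importances can be read off as a table; a small sketch (an addition to the original):

# top ten predictors by mean decrease in Gini
imp <- importance(modrf)
head(imp[order(imp[, "MeanDecreaseGini"], decreasing = TRUE), , drop = FALSE], 10)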

The random forest is far more accurate, with an out-of-bag (OOB) error estimate of only 0.69%. We confirm this on the held-out set:

pred <- predict(modrf, inTest)
confusionMatrix(pred, inTest$classe)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 2230    9    0    0    0
##          B    2 1506    9    0    0
##          C    0    3 1357   12    3
##          D    0    0    2 1274    4
##          E    0    0    0    0 1435
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9944          
##                  95% CI : (0.9925, 0.9959)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9929          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9991   0.9921   0.9920   0.9907   0.9951
## Specificity            0.9984   0.9983   0.9972   0.9991   1.0000
## Pos Pred Value         0.9960   0.9927   0.9869   0.9953   1.0000
## Neg Pred Value         0.9996   0.9981   0.9983   0.9982   0.9989
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2842   0.1919   0.1730   0.1624   0.1829
## Detection Prevalence   0.2854   0.1933   0.1752   0.1631   0.1829
## Balanced Accuracy      0.9988   0.9952   0.9946   0.9949   0.9976
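
An accuracy of 0.9944 on the held-out data implies an expected out-of-sample error of about 0.56%, consistent with the OOB estimate of 0.69%. It can be computed directly (an added convenience, not in the original):

# estimated out-of-sample error rate
1 - confusionMatrix(pred, inTest$classe)$overall["Accuracy"]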

Predict classe for the 20 test cases using the random forest model

pred1 <- predict(modrf, test1)
pred1
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E
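
For the course submission, each prediction can be written to its own text file; a convenience sketch (the helper name and file naming are my own, not from the original analysis):

# write one file per test case, e.g. problem_id_01.txt containing "B"
write_preds <- function(preds) {
  for (i in seq_along(preds)) {
    writeLines(as.character(preds[i]), sprintf("problem_id_%02d.txt", i))
  }
}
write_preds(pred1)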