library(caret)        #createDataPartition, nearZeroVar, train, confusionMatrix; attaches ggplot2
library(GGally)       #exploratory pair plots (not used below)
library(rpart)        #decision tree
library(rpart.plot)   #tree plotting
library(party)        #conditional inference trees (not used below)
library(RGtk2)        #rattle dependency
library(rattle)       #fancyRpartPlot
library(xgboost)      #gradient boosting (not used below)
library(formattable)  #digits() for plot labels
library(dplyr)        #data wrangling (not used below)
library(tidyr)
library(tibble)
library(ggthemes)     #theme_hc()
#Read the course data; stringsAsFactors keeps classe a factor, as caret expects
#(read.csv defaults to FALSE under R >= 4.0). test holds the 20-case graded holdout.
train <- read.csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv', stringsAsFactors = TRUE)
test <- read.csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv', stringsAsFactors = TRUE)
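#The raw CSVs also encode missing data as '#DIV/0!' and blank strings; the default
#read leaves those as factor levels for nearZeroVar() to flag below. A stricter read
#(illustrative only -- trainStrict is an assumption and is not used in what follows)
#declares them NA up front:
trainStrict <- read.csv('https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv',
                        na.strings = c('NA', '#DIV/0!', ''), stringsAsFactors = TRUE)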
#Split the training data 75/25 into model-building and validation partitions
inTrain <- createDataPartition(train$classe, p = 0.75, list = FALSE)
training <- train[inTrain,]
testing <- train[-inTrain,]
#Find near zero variance predictors
nzvs <- nearZeroVar(training, saveMetrics = TRUE)
nzvNames <- rownames(nzvs)[nzvs$nzv]
#Remove near zero variance predictors from train and test sets
smallTrain <- training[, !colnames(training) %in% nzvNames]
smallTest <- testing[, !colnames(testing) %in% nzvNames]
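#Sanity check (added, not original output): the raw partition has 160 columns;
#the near-zero-variance filter should leave the 102 counted below
dim(training)
dim(smallTrain)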
#Count the NAs in each remaining column
x <- integer(ncol(smallTrain))
for(i in 1:ncol(smallTrain)){
  x[i] <- sum(is.na(smallTrain[,i]))
}
print(x)
## [1] 0 0 0 0 0 0 0 0 0 0 14405
## [12] 14405 14405 14405 14405 14405 14405 14405 14405 14405 14405 14405
## [23] 14405 14405 14405 14405 0 0 0 0 0 0 0
## [34] 0 0 0 0 0 0 14405 0 0 0 0
## [45] 0 0 0 0 0 14405 14405 14405 14405 14405 14405
## [56] 0 0 0 14405 14405 14405 14405 14405 14405 0 14405
## [67] 14405 14405 14405 14405 14405 14405 14405 14405 14405 0 0
## [78] 0 0 0 0 0 0 0 0 0 0 14405
## [89] 14405 14405 0 14405 0 0 0 0 0 0 0
## [100] 0 0 0
table(x)
## x
## 0 14405
## 59 43
#Every column either is complete (59 columns) or is ~98% NA (14405 of 14718 rows,
#43 columns), so drop any column containing NAs from the train and test sets
smallerTrain <- smallTrain[,colSums(is.na(smallTrain)) == 0]
smallerTest <- smallTest[,colSums(is.na(smallTest)) == 0]
#Drop the X column, an id that just duplicates the row index
sTrain <- smallerTrain[,-1]
sTest <- smallerTest[,-1]
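#Sanity check (added): both sets now share the same 57 predictors plus classe,
#and no NAs remain in the model-building set
stopifnot(identical(colnames(sTrain), colnames(sTest)), !anyNA(sTrain))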
#--------------
# Decision Tree
#--------------
set.seed(867)
tree <- rpart(classe ~ ., data = sTrain, method = 'class')
fancyRpartPlot(tree)
#Predict using tree
TreeFit <- predict(tree, sTest, type = 'class')
TreeResults <- confusionMatrix(TreeFit, testing$classe)
#Tree Accuracy
TreeResults$overall[1]
## Accuracy
## 0.8711256
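#Expected out-of-sample error for the tree is 1 - validation accuracy
#(a derived figure, not in the original output): ~0.1289
unname(1 - TreeResults$overall['Accuracy'])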
#Display confusion matrix results
tcm <- as.data.frame(TreeResults$table)
ggplot(tcm, aes(Prediction, Reference)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = digits(Freq, 0))) +
  theme_hc() +
  scale_fill_gradient(low = "ivory3", high = "palegreen3") +
  ggtitle(label = paste("Decision Tree Accuracy:", round(TreeResults$overall['Accuracy'], 4)),
          subtitle = "Confusion Matrix Plot") +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.subtitle = element_text(hjust = 0.5, size = 10))
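#--------------
# Random Forest
#--------------
#Repeated 10-fold CV (30 resamples) on ~15,000 rows is slow; registering a parallel
#backend first is an optional speedup (an addition, not part of the original run --
#parallel resampling may not reproduce the summary below exactly)
library(doParallel)
cl <- makePSOCKcluster(max(1, parallel::detectCores() - 1))
registerDoParallel(cl)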
set.seed(867)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
forest <- train(classe ~ ., data = sTrain, method = 'rf', trControl = control)
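stopCluster(cl)  #release the workers once training finishes
registerDoSEQ()  #return foreach to sequential mode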
print(forest)
## Random Forest
##
## 14718 samples
## 57 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 13247, 13247, 13247, 13245, 13248, 13245, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9902388 0.9876513
## 40 0.9991619 0.9989399
## 79 0.9987088 0.9983668
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 40.
forestFit <- predict(forest, sTest)
forestResults <- confusionMatrix(forestFit, testing$classe)
forestResults$overall[1]
## Accuracy
## 0.9991843
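#Which predictors drive the random forest? varImp() on the caret fit gives a quick
#ranking (an added diagnostic, not part of the original write-up)
plot(varImp(forest), top = 10)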
#Display confusion matrix results
fcm <- as.data.frame(forestResults$table)
ggplot(fcm, aes(Prediction, Reference)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = digits(Freq, 0))) +
  theme_hc() +
  scale_fill_gradient(low = "lemonchiffon1", high = "chartreuse3") +
  ggtitle(label = paste("Random Forest Accuracy:", round(forestResults$overall['Accuracy'], 4)),
          subtitle = "Confusion Matrix Plot") +
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.subtitle = element_text(hjust = 0.5, size = 10))
The random forest is the better fit, with 99.92% accuracy on the held-out validation partition versus 87.11% for the decision tree.
#Peek at 50 of the random forest's predictions on the validation partition
sample(predict(forest, testing), 50)
## [1] C B C E A A E C E C C A A A B B B B D B A A A A C C A A C D A D B B A
## [36] A C D D B C A B E A E D C A D
## Levels: A B C D E
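#Finally, score the 20-row graded holdout loaded into test at the top (a minimal
#sketch; the fitted caret model selects the columns it needs by name)
predict(forest, test)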