Introduction

The goal of this project is to predict the manner in which participants performed weight lifting exercises. The outcome variable is classe.

Load packages

library(caret)
library(randomForest)
# Training data: the strings "NA", "" and "#DIV/0!" are all read as missing.
training_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
training <- read.csv(
  file = training_url,
  na.strings = c("NA", "", "#DIV/0!")
)

# Testing data, read with the same missing-value markers as the training data.
testing_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
testing <- read.csv(
  file = testing_url,
  na.strings = c("NA", "", "#DIV/0!")
)

dim(training)
## [1] 19622   160
dim(testing)
## [1]  20 160
# Keep only columns with no missing value in the training set. The mask is
# computed from `training` alone and applied by position to both data frames,
# so they retain the same column positions.
keep_cols <- vapply(training, function(column) !anyNA(column), logical(1))
training_clean <- training[, keep_cols]
testing_clean <- testing[, keep_cols]

# Drop the first seven remaining columns, which are not used as predictors.
training_clean <- training_clean[, -seq_len(7)]
testing_clean <- testing_clean[, -seq_len(7)]

# The outcome must be a factor so the model treats it as a class label.
training_clean[["classe"]] <- as.factor(training_clean[["classe"]])

dim(training_clean)
## [1] 19622    53
dim(testing_clean)
## [1] 20 53
# Fix the RNG seed so the split below is reproducible.
set.seed(123)

# Row indices for a 70% sample of the cleaned training data, drawn on classe.
inTrain <- createDataPartition(
  y = training_clean[["classe"]],
  p = 0.7,
  list = FALSE
)

# Rows not selected in inTrain are held out for validation.
validData <- training_clean[-inTrain, ]
trainData <- training_clean[inTrain, ]

dim(trainData)
## [1] 13737    53
dim(validData)
## [1] 5885   53
# Re-seed so the fitted forest is reproducible.
set.seed(123)

# Random forest of 100 trees predicting classe from every other column.
model_rf <- randomForest(
  classe ~ .,
  data = trainData,
  ntree = 100
)
print(model_rf)
## 
## Call:
##  randomForest(formula = classe ~ ., data = trainData, ntree = 100) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 7
## 
##         OOB estimate of  error rate: 0.66%
## Confusion matrix:
##      A    B    C    D    E class.error
## A 3902    3    0    0    1 0.001024066
## B   17 2636    5    0    0 0.008276900
## C    1   17 2373    5    0 0.009599332
## D    0    0   32 2218    2 0.015097691
## E    0    1    2    4 2518 0.002772277
# Predict classe for the held-out validation rows.
pred_valid <- predict(model_rf, newdata = validData)

# Cross-tabulate the predictions against the true validation labels.
conf_matrix <- confusionMatrix(
  data = pred_valid,
  reference = validData[["classe"]]
)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1673    4    0    0    0
##          B    1 1132    3    0    1
##          C    0    3 1023   10    4
##          D    0    0    0  954    4
##          E    0    0    0    0 1073
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9949          
##                  95% CI : (0.9927, 0.9966)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9936          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9994   0.9939   0.9971   0.9896   0.9917
## Specificity            0.9991   0.9989   0.9965   0.9992   1.0000
## Pos Pred Value         0.9976   0.9956   0.9837   0.9958   1.0000
## Neg Pred Value         0.9998   0.9985   0.9994   0.9980   0.9981
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2843   0.1924   0.1738   0.1621   0.1823
## Detection Prevalence   0.2850   0.1932   0.1767   0.1628   0.1823
## Balanced Accuracy      0.9992   0.9964   0.9968   0.9944   0.9958
# Overall accuracy on the validation set, taken from the confusion matrix.
accuracy <- conf_matrix$overall["Accuracy"]

# Estimated out-of-sample error is the complement of validation accuracy.
# Subtracting from the named element directly would carry the "Accuracy"
# name over and print the error rate under the wrong label, so the name is
# stripped first.
out_of_sample_error <- 1 - unname(accuracy)

accuracy
##  Accuracy 
## 0.9949023
out_of_sample_error
## [1] 0.005097706
# Classify the 20 rows of the cleaned testing set with the fitted forest.
final_predictions <- predict(model_rf, newdata = testing_clean)
print(final_predictions)
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E