Project Summary

This project analyzes how well 6 participants performed barbell lifts, using data from accelerometers placed on the belt, forearm, arm and dumbbell. The participants performed the barbell lifts correctly and incorrectly in 5 different ways.

Summary

Based on the investigation code below, the following steps were taken and conclusions reached.

  1. The data is cleaned by removing every column that contains NA values, to avoid coercion problems.
  2. The 19,622 observations in the training data are split 60/40 into a training part and a validation part, used to build the models and to measure their accuracy.
  3. With the classification tree model, the accuracy on the validation part is roughly 50% (0.4987), which is insufficient to predict the data at a 95% confidence level.
  4. With the random forest model trained using k-fold cross validation (k = 5), the accuracy on the validation part is roughly 99% (0.9883) at a 95% confidence level; this model is then used to predict the test data.

Load Libraries and Read Data

Library Loading & Setup

library(caret)
library(rpart)
library(rattle)
library(parallel)
library(doParallel)
library(randomForest)

Obtain & Examine Data

# URL setup and download

train_URL <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_URL <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
path <- getwd()
train_file <- file.path(path, "train.csv")
test_file <- file.path(path, "test.csv")

# Download each file only when no local copy exists yet, so re-running the
# analysis does not fetch the same data again. Delete the local files to force
# a fresh download.
if (!file.exists(train_file)) {
  download.file(train_URL, train_file)
}
if (!file.exists(test_file)) {
  download.file(test_URL, test_file)
}

# Load the data from the same paths that were written above, treating "NA",
# "#DIV/0!" and empty cells as missing values

train_set <- read.csv(train_file, na.strings = c("NA", "#DIV/0!", ""), header = TRUE)
test_set <- read.csv(test_file, na.strings = c("NA", "#DIV/0!", ""), header = TRUE)

# Verify the dimensions of both data sets
dim(train_set)
## [1] 19622   160
dim(test_set)
## [1]  20 160

Data Filtering & Subsetting

# Keep only the columns that contain no NA value at all

train_check <- train_set[, colSums(is.na(train_set)) == 0, drop = FALSE]
test_check <- test_set[, colSums(is.na(test_set)) == 0, drop = FALSE]

# Remove the first seven remaining columns of the training data (participant
# and time stamp information). The same columns are removed from the test data
# BY NAME rather than by position, so both sets stay aligned even if the NA
# filter above kept a different set of columns in each of them.

dropped_cols <- head(names(train_check), 7)
train_clean <- train_check[, !(names(train_check) %in% dropped_cols), drop = FALSE]
test_clean <- test_check[, !(names(test_check) %in% dropped_cols), drop = FALSE]

# Store the outcome as a factor so that it is modelled as a class label
train_clean$classe <- as.factor(train_clean$classe)

# Fail early, with a readable message, if the test data lacks a column the
# models will be trained on
missing_predictors <- setdiff(
  setdiff(names(train_clean), "classe"),
  names(test_clean)
)
if (length(missing_predictors) > 0) {
  stop(
    "Test data is missing predictor column(s): ",
    paste(missing_predictors, collapse = ", "),
    call. = FALSE
  )
}

# Re-check the dimensions for the filtered dataset 

dim(train_clean)
## [1] 19622    53
dim(test_clean)
## [1] 20 53
# validating the samples for each classe
table(train_clean$classe)
## 
##    A    B    C    D    E 
## 5580 3797 3422 3216 3607

Data Partitioning

# Use the training set for training & validation with 60/40 proportion.
# The split is random, so the seed is fixed first: without it every run
# produces a different partition and the reported accuracies cannot be
# reproduced. The seed value itself is arbitrary.
set.seed(12345)
train_div <- createDataPartition(train_clean$classe, p = 0.60)[[1]]
train_partial <- train_clean[train_div, ]
valid_partial <- train_clean[-train_div, ]

Prediction_Classification Tree Model

# Fit a classification tree (rpart) on the training part and draw the tree
ct_model <- train(
  classe ~ .,
  data = train_partial,
  method = "rpart"
)
fancyRpartPlot(ct_model$finalModel)

# Predict the validation part and compare against the true classes
predict_val <- predict(ct_model, valid_partial)
cm_ct <- confusionMatrix(
  data = predict_val,
  reference = as.factor(valid_partial$classe)
)
cm_ct$overall["Accuracy"]
##  Accuracy 
## 0.4987255

Prediction_Random Forest Model

# Use k_fold = 5 for cross validation, with the resamples fitted in parallel.
# Leave one core free, but never ask for fewer than one worker:
# detectCores() can return 1 (or NA when the count cannot be determined).
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(n_workers)
cross_val <- trainControl(method = "cv", number = 5, allowParallel = TRUE)

# Fix the seed so that the fold assignment and the forest are reproducible
# (the seed value itself is arbitrary)
set.seed(12345)

# Terminate parallel computing in `finally`, so the worker processes are shut
# down and the sequential backend is restored even when training fails
rf_model <- tryCatch(
  {
    registerDoParallel(cluster)
    train(classe ~ ., data = train_partial, method = "rf", trControl = cross_val)
  },
  finally = {
    stopCluster(cluster)
    registerDoSEQ()
  }
)
# Predict the validation part with the random forest and compare against the
# true classes
predict_val_rf <- predict(rf_model, valid_partial)
cm_rf <- confusionMatrix(
  data = predict_val_rf,
  reference = as.factor(valid_partial$classe)
)
cm_rf$overall["Accuracy"]
##  Accuracy 
## 0.9882743
# Plot the final forest and list the variable importance
plot(rf_model$finalModel, main = "Random Forest Model Error Rate")

varImp(rf_model)
## rf variable importance
## 
##   only 20 most important variables shown (out of 52)
## 
##                      Overall
## roll_belt             100.00
## pitch_forearm          60.58
## yaw_belt               54.10
## magnet_dumbbell_y      46.02
## magnet_dumbbell_z      44.21
## pitch_belt             42.95
## roll_forearm           39.22
## accel_dumbbell_y       23.89
## magnet_dumbbell_x      18.45
## roll_dumbbell          18.28
## accel_forearm_x        16.88
## magnet_belt_z          15.98
## magnet_forearm_z       15.00
## accel_dumbbell_z       14.04
## magnet_belt_y          13.97
## accel_belt_z           13.87
## total_accel_dumbbell   13.14
## gyros_belt_z           10.98
## yaw_arm                10.86
## magnet_belt_x          10.24

Predict with Test Data

# Apply the random forest model to the cleaned test data and show the
# predicted classes
test_predict <- predict(rf_model, newdata = test_clean)
print(test_predict)
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Write the predictions out to results.csv in the working directory
write.csv(test_predict, file = file.path(path, "results.csv"))