This project analyzes how well 6 participants performed barbell lifts, using data from accelerometers placed on the belt, forearm, arm, and dumbbell. The participants performed the barbell lifts correctly and incorrectly in 5 different ways.
Based on the investigation code below, the following rule of thumb has been adopted: columns that contain NA values are removed before modeling.
library(caret)
library(rpart)
library(rattle)
library(parallel)
library(doParallel)
library(randomForest)
# Data acquisition ----
# Source locations of the training and testing CSV files.
train_URL <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_URL <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
# The files are stored in, and read back from, the current working directory.
path <- getwd()
train_file <- file.path(path, "train.csv")
test_file <- file.path(path, "test.csv")
# Download only when no local copy exists, so re-running the script does not
# fetch the same data over the network again.
if (!file.exists(train_file)) {
  download.file(train_URL, train_file)
}
if (!file.exists(test_file)) {
  download.file(test_URL, test_file)
}
# Load the data, treating "NA", "#DIV/0!" and empty strings as missing values.
# Reading through the same path variables used for the download keeps the two
# steps pointing at the same files.
train_set <- read.csv(train_file, na.strings = c("NA", "#DIV/0!", ""), header = TRUE)
test_set <- read.csv(test_file, na.strings = c("NA", "#DIV/0!", ""), header = TRUE)
# Verify the dimensions of the raw data sets
dim(train_set)
## [1] 19622 160
dim(test_set)
## [1] 20 160
# Keep only the columns that contain no missing values at all
train_check <- train_set[, colSums(is.na(train_set)) == 0]
test_check <- test_set[, colSums(is.na(test_set)) == 0]
# Drop the first seven columns (participant and time-stamp information),
# which are not used as predictors
train_clean <- train_check[, -c(1:7)]
test_clean <- test_check[, -c(1:7)]
# Store the outcome as a factor so every later step sees the same class levels
train_clean$classe <- factor(train_clean$classe)
# The two sets are filtered independently above, so fail early (instead of at
# predict() time) if a training predictor did not survive in the test set
missing_predictors <- setdiff(
  setdiff(names(train_clean), "classe"),
  names(test_clean)
)
if (length(missing_predictors) > 0) {
  stop(
    "Test set is missing predictor columns: ",
    paste(missing_predictors, collapse = ", "),
    call. = FALSE
  )
}
# Re-check the dimensions of the filtered data sets
dim(train_clean)
## [1] 19622 53
dim(test_clean)
## [1] 20 53
# Count the samples available for each classe
table(train_clean$classe)
##
## A B C D E
## 5580 3797 3422 3216 3607
# Fix the RNG state so the random split and the resampling done inside
# train() can be repeated from run to run
set.seed(12345)
# Split the training data 60/40 into a training part and a validation part
train_div <- createDataPartition(train_clean$classe, p = 0.60)[[1]]
train_partial <- train_clean[train_div, ]
valid_partial <- train_clean[-train_div, ]
# Fit a classification tree and plot the final tree
ct_model <- train(classe ~ ., data = train_partial, method = "rpart")
fancyRpartPlot(ct_model$finalModel)
# Evaluate on the validation part; the reference is given the same levels as
# the predictions so both factors line up in the confusion matrix
predict_val <- predict(ct_model, newdata = valid_partial)
cm_ct <- confusionMatrix(
  predict_val,
  factor(valid_partial$classe, levels = levels(predict_val))
)
cm_ct$overall["Accuracy"]
# NOTE: the value below was recorded from an earlier run without a fixed seed;
# the exact figure from a seeded run may differ.
## Accuracy
## 0.4987255
# Random forest tuned with 5-fold cross-validation, trained in parallel
cross_val <- trainControl(method = "cv", number = 5, allowParallel = TRUE)
# Leave one core free, but never ask for fewer than one worker: detectCores()
# can return 1 (giving 0 workers) or NA when the core count is unknown
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(n_workers)
rf_model <- tryCatch(
  {
    registerDoParallel(cluster)
    # Fix the RNG state so the resampling can be repeated from run to run
    set.seed(12345)
    train(classe ~ ., data = train_partial, method = "rf", trControl = cross_val)
  },
  finally = {
    # Terminate parallel computing even when training fails, so no worker
    # processes are left behind, and fall back to sequential execution
    stopCluster(cluster)
    registerDoSEQ()
  }
)
# Evaluate on the validation part; the reference is given the same levels as
# the predictions so both factors line up in the confusion matrix
predict_val_rf <- predict(rf_model, newdata = valid_partial)
cm_rf <- confusionMatrix(
  predict_val_rf,
  factor(valid_partial$classe, levels = levels(predict_val_rf))
)
cm_rf$overall["Accuracy"]
# NOTE: the outputs below were recorded from an earlier run without a fixed
# seed; exact figures from a seeded run may differ.
## Accuracy
## 0.9882743
plot(rf_model$finalModel, main = "Random Forest Model Error Rate")
varImp(rf_model)
## rf variable importance
##
## only 20 most important variables shown (out of 52)
##
## Overall
## roll_belt 100.00
## pitch_forearm 60.58
## yaw_belt 54.10
## magnet_dumbbell_y 46.02
## magnet_dumbbell_z 44.21
## pitch_belt 42.95
## roll_forearm 39.22
## accel_dumbbell_y 23.89
## magnet_dumbbell_x 18.45
## roll_dumbbell 18.28
## accel_forearm_x 16.88
## magnet_belt_z 15.98
## magnet_forearm_z 15.00
## accel_dumbbell_z 14.04
## magnet_belt_y 13.97
## accel_belt_z 13.87
## total_accel_dumbbell 13.14
## gyros_belt_z 10.98
## yaw_arm 10.86
## magnet_belt_x 10.24
# Score the cleaned test cases with the random forest model
test_predict <- predict(rf_model, newdata = test_clean)
print(test_predict)
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Save the predictions as results.csv in the working directory
write.csv(test_predict, file = file.path(path, "results.csv"))