library(caret)
library(gbm) # For Gradient Boosted Machines (decision tree-based)
library(dplyr)
library(corrplot) # For correlation analysis
# Create data folder
if(!file.exists("data")) dir.create("data")
# Download files
train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"
download.file(train_url, "./data/pml-training.csv")
download.file(test_url, "./data/pml-testing.csv")
# Load data (treat empty strings, #DIV/0!, and NA as NA)
training <- read.csv("./data/pml-training.csv", na.strings = c("NA", "#DIV/0!", ""))
testing <- read.csv("./data/pml-testing.csv", na.strings = c("NA", "#DIV/0!", ""))
# Remove columns with >95% NA
mostlyNA <- sapply(training, function(x) mean(is.na(x))) > 0.95
training <- training[, mostlyNA == FALSE]
testing <- testing[, mostlyNA == FALSE]
# Remove near-zero variance variables
nzv <- nearZeroVar(training)
training <- training[, -nzv]
testing <- testing[, -nzv]
# Remove identification columns (first 5 columns: timestamps, user names, etc.)
training <- training[, -(1:5)]
testing <- testing[, -(1:5)]
dim(training) # Should be 19622 x 54
## [1] 19622 54
# Select numeric variables (exclude classe)
numeric_vars <- sapply(training, is.numeric)
corr_matrix <- cor(training[, numeric_vars])
# Plot correlation matrix (figure 1 of <5)
corrplot(corr_matrix, method = "color", type = "lower", tl.cex = 0.4, tl.srt = 45,
title = "Correlation Matrix of Numeric Predictors", mar = c(0,0,1,0))

set.seed(12345)
inTrain <- createDataPartition(training$classe, p = 0.7, list = FALSE)
train_set <- training[inTrain, ]
valid_set <- training[-inTrain, ]
control <- trainControl(method = "repeatedcv", number = 3, repeats = 1, verboseIter = FALSE)
model_gbm <- train(classe ~ .,
data = train_set,
method = "gbm",
trControl = control,
verbose = FALSE)
model_gbm
## Stochastic Gradient Boosting
##
## 13737 samples
## 53 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times)
## Summary of sample sizes: 9159, 9157, 9158
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.7574451 0.6920818
## 1 100 0.8311869 0.7862225
## 1 150 0.8724616 0.8385883
## 2 50 0.8854921 0.8550078
## 2 100 0.9399436 0.9240078
## 2 150 0.9610542 0.9507155
## 3 50 0.9337557 0.9161528
## 3 100 0.9697895 0.9617725
## 3 150 0.9823106 0.9776217
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
## 3, shrinkage = 0.1 and n.minobsinnode = 10.
# Plot top 10 important variables (figure 2 of <5)
varImp_gbm <- varImp(model_gbm)
plot(varImp_gbm, top = 10, main = "Top 10 Important Variables in GBM")

final_predictions <- predict(model_gbm, testing)
final_predictions
## [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
pml_write_files <- function(x) {
n <- length(x)
path <- "./predictions"
if(!file.exists(path)) dir.create(path)
for(i in 1:n){
filename <- paste0("problem_id_", i, ".txt")
write.table(x[i], file = file.path(path, filename), quote = FALSE, row.names = FALSE, col.names = FALSE)
}
}
pml_write_files(final_predictions)