library(caret)
library(gbm)  # For Gradient Boosted Machines (decision tree-based)
library(dplyr)
library(corrplot)  # For correlation analysis
# Create data folder
if(!file.exists("data")) dir.create("data")
# Download files
train_url <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
test_url  <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv"

# Skip the download if the files are already present
if(!file.exists("./data/pml-training.csv")) download.file(train_url, "./data/pml-training.csv")
if(!file.exists("./data/pml-testing.csv"))  download.file(test_url,  "./data/pml-testing.csv")

# Load data (treat empty strings, #DIV/0!, and NA as NA)
training <- read.csv("./data/pml-training.csv", na.strings = c("NA", "#DIV/0!", ""))
testing  <- read.csv("./data/pml-testing.csv",  na.strings = c("NA", "#DIV/0!", ""))

# Since R 4.0, read.csv() returns strings as character, so convert the
# outcome to a factor explicitly (caret's confusionMatrix() expects factors)
training$classe <- factor(training$classe)
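
# Quick sanity check of the raw dimensions before any cleaning; both raw
# files ship with 160 columns
dim(training)  # Should be 19622 x 160
dim(testing)   # Should be 20 x 160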
# Remove columns that are more than 95% NA (mask computed on the training
# data, then applied to both sets so the columns stay aligned)
mostlyNA <- sapply(training, function(x) mean(is.na(x))) > 0.95
training <- training[, !mostlyNA]
testing  <- testing[, !mostlyNA]
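
# Sanity check: how many columns were dropped as mostly-NA, and how many remain
sum(mostlyNA)
ncol(training)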

# Remove near-zero variance variables
nzv <- nearZeroVar(training)
training <- training[, -nzv]
testing  <- testing[, -nzv]

# Remove identification columns (first 5: row index X, user_name, and the
# three timestamp columns), which carry no sensor information
training <- training[, -(1:5)]
testing  <- testing[, -(1:5)]

dim(training)  # Should be 19622 x 54
## [1] 19622    54
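# Training and testing should now share the same columns except the last one
# (classe in training vs. problem_id in testing, assuming the raw test file
# mirrors the training file); confirm the predictors line up
identical(head(names(training), -1), head(names(testing), -1))
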
# Select numeric predictors (classe is a factor, so is.numeric() drops it)
numeric_vars <- sapply(training, is.numeric)
corr_matrix <- cor(training[, numeric_vars])

# Plot correlation matrix (Figure 1)
corrplot(corr_matrix, method = "color", type = "lower", tl.cex = 0.4, tl.srt = 45,
         title = "Correlation Matrix of Numeric Predictors", mar = c(0,0,1,0))

set.seed(12345)
inTrain <- createDataPartition(training$classe, p = 0.7, list = FALSE)
train_set <- training[inTrain, ]
valid_set <- training[-inTrain, ]
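
# createDataPartition() stratifies on classe, so the class proportions
# should be nearly identical across the two splits
round(rbind(train = prop.table(table(train_set$classe)),
            valid = prop.table(table(valid_set$classe))), 3)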
# 3-fold cross-validation (with repeats = 1, repeatedcv is equivalent to plain CV)
control <- trainControl(method = "repeatedcv", number = 3, repeats = 1, verboseIter = FALSE)

model_gbm <- train(classe ~ ., 
                   data = train_set, 
                   method = "gbm", 
                   trControl = control,
                   verbose = FALSE)

model_gbm
## Stochastic Gradient Boosting 
## 
## 13737 samples
##    53 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 1 times) 
## Summary of sample sizes: 9159, 9157, 9158 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.7574451  0.6920818
##   1                  100      0.8311869  0.7862225
##   1                  150      0.8724616  0.8385883
##   2                   50      0.8854921  0.8550078
##   2                  100      0.9399436  0.9240078
##   2                  150      0.9610542  0.9507155
##   3                   50      0.9337557  0.9161528
##   3                  100      0.9697895  0.9617725
##   3                  150      0.9823106  0.9776217
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150, interaction.depth =
##  3, shrinkage = 0.1 and n.minobsinnode = 10.
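# The selected hyperparameters are also available programmatically
model_gbm$bestTune
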
# Plot top 10 important variables (Figure 2)
varImp_gbm <- varImp(model_gbm)
plot(varImp_gbm, top = 10, main = "Top 10 Important Variables in GBM")
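
# Estimate out-of-sample performance on the held-out validation set before
# predicting on the 20 test cases (exact figures vary with seed and package
# versions, but accuracy should be close to the resampling estimate above)
valid_pred <- predict(model_gbm, valid_set)
confusionMatrix(valid_pred, valid_set$classe)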

final_predictions <- predict(model_gbm, testing)
final_predictions
##  [1] B A B A A E D B A A B C B A E E A B B B
## Levels: A B C D E
# Write each prediction to its own problem_id_<i>.txt file
pml_write_files <- function(x) {
  path <- "./predictions"
  if(!dir.exists(path)) dir.create(path)
  for(i in seq_along(x)){
    filename <- paste0("problem_id_", i, ".txt")
    write.table(x[i], file = file.path(path, filename),
                quote = FALSE, row.names = FALSE, col.names = FALSE)
  }
}

pml_write_files(final_predictions)
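
# Verify that one file per test case was written
list.files("./predictions")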