1 Classification Modeling

1.0.1 Case Study

This case study uses the data provided in the 2022 final exam (UAS).

LINK

library(readxl)
data <- read_excel("C:/Users/Legion/Downloads/data_STA1581.xlsx")
print(data)
## # A tibble: 300 × 6
##       AP  XRay Stage Grade   Age     Y
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1   128     0     1     1    54     1
##  2    46     0     1     1    55     0
##  3   165     0     0     1    53     1
##  4   102     0     1     1    50     1
##  5    42     0     1     0    61     0
##  6    52     0     1     0    55     0
##  7   112     0     0     0    52     0
##  8   146     0     1     0    55     1
##  9   157     0     1     1    59     1
## 10   174     0     1     0    60     1
## # ℹ 290 more rows
set.seed(2024)
bootstrap_indices <- sample(1:nrow(data), size = 300, replace = TRUE)  # seed already set above
data <- data[bootstrap_indices, ]

cat("\nData setelah resampling (bootstrap n=300):\n")
## 
## Data after resampling (bootstrap n=300):
print(head(data))
## # A tibble: 6 × 6
##      AP  XRay Stage Grade   Age     Y
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1   114     0     1     1    53     1
## 2    82     1     1     1    62     1
## 3   141     0     0     1    51     1
## 4   122     0     1     0    67     0
## 5    73     1     0     1    58     0
## 6   188     0     1     1    53     1
cat("\nDimensi data resampled:", dim(data), "\n")
## 
## Dimensions of resampled data: 300 6
cat("Proporsi kelas Y setelah resampling:\n")
## Class proportions of Y after resampling:
print(prop.table(table(data$Y)))
## 
##    0    1 
## 0.34 0.66
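
Sampling 300 rows with replacement leaves some rows out and repeats others: on average only about 1 - (1 - 1/n)^n ≈ 63.2% of the distinct rows appear in a bootstrap sample. A quick sanity check (sketch, not run):

# 1 - (1 - 1/300)^300                        # expected share of distinct rows, ~0.632
# length(unique(bootstrap_indices)) / 300    # observed share in this sample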

1.0.2 Modeling with Bootstrapping

1.0.3 Function, Library, and Data Frame Definitions

library(tidyverse)
library(caret)
library(pROC)
library(e1071)      
library(randomForest)
library(gbm)        
library(rpart)      
library(rpart.plot)
library(nnet)       
library(kknn)       

data$Y <- factor(data$Y, levels = c("0", "1"))
set.seed(2024)
train_index <- createDataPartition(data$Y, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

train_data$Y <- factor(train_data$Y, levels = levels(data$Y))
test_data$Y <- factor(test_data$Y, levels = levels(data$Y))

ctrl_cv5 <- trainControl(method = "cv", 
                         number = 5,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary,
                         savePredictions = TRUE)

train_data_cv <- train_data
test_data_cv <- test_data
levels(train_data_cv$Y) <- c("No", "Yes")
levels(test_data_cv$Y) <- c("No", "Yes")
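
# Note (sketch): classProbs = TRUE with twoClassSummary requires class levels
# that are valid R variable names, which is why "0"/"1" are relabeled above;
# make.names(levels(data$Y)) ("X0", "X1") is a generic alternative.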

evaluate_model <- function(predictions, actual, model_name, data_type, plot_cm = TRUE) {
  # Ensure predictions and actual share the same factor levels
  if(!is.factor(predictions)) {
    predictions <- factor(predictions, levels = levels(actual))
  }
  
  cm <- confusionMatrix(predictions, actual)
  
  # Additional metrics. Note: confusionMatrix() treats the first factor level
  # ("0") as the positive class, so precision/recall below refer to class 0
  precision <- cm$byClass["Pos Pred Value"]
  recall <- cm$byClass["Sensitivity"]
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # Print the evaluation
  cat("\n", strrep("=", 50), "\n")
  cat(model_name, "-", data_type, "\n")
  cat(strrep("=", 50), "\n")
  
  cat("Confusion Matrix:\n")
  print(cm$table)
  
  cat("\nEvaluation Metrics:\n")
  cat("Accuracy    :", round(cm$overall["Accuracy"], 4), "\n")
  cat("Sensitivity :", round(cm$byClass["Sensitivity"], 4), "\n")
  cat("Specificity :", round(cm$byClass["Specificity"], 4), "\n")
  cat("Precision   :", round(precision, 4), "\n")
  cat("F1-Score    :", round(f1_score, 4), "\n")
  
  # Plot the confusion matrix if requested
  if(plot_cm) {
    plot_confusion_matrix(cm, model_name, data_type)
  }
  
  return(list(
    confusion = cm$table,
    accuracy = cm$overall["Accuracy"],
    sensitivity = cm$byClass["Sensitivity"],
    specificity = cm$byClass["Specificity"],
    precision = precision,
    f1_score = f1_score
  ))
}

plot_confusion_matrix <- function(cm, model_name, data_type) {
  cm_matrix <- as.matrix(cm$table)
  
  # Percentages relative to the grand total
  total <- sum(cm_matrix)
  cm_percent <- round((cm_matrix / total) * 100, 1)
  
  # Single panel with generous margins
  par(mfrow = c(1, 1), mar = c(5, 5, 5, 5))
  
  # Contrasting color ramp
  colors <- colorRampPalette(c("#F6F6F6", "#FF6B6B", "#4ECDC4"))(100)
  
  # Heatmap: with z = t(cm_matrix), the x-axis indexes the Reference (actual)
  # class and the y-axis the Prediction, hence the axis labels below
  image(1:ncol(cm_matrix), 1:nrow(cm_matrix), 
        t(cm_matrix), 
        col = colors,
        xlab = "Actual", ylab = "Predicted",
        xaxt = "n", yaxt = "n",
        main = paste("Confusion Matrix\n", model_name, "-", data_type),
        cex.main = 1.2)
  
  # Axis tick labels
  axis(1, at = 1:ncol(cm_matrix), labels = c("0", "1"))
  axis(2, at = 1:nrow(cm_matrix), labels = c("0", "1"), las = 2)
  
  # Grid lines
  abline(h = seq(0.5, nrow(cm_matrix)+0.5, 1), col = "gray", lty = 3)
  abline(v = seq(0.5, ncol(cm_matrix)+0.5, 1), col = "gray", lty = 3)
  
  # Write count and percentage in each cell
  for(i in 1:nrow(cm_matrix)) {
    for(j in 1:ncol(cm_matrix)) {
      text(j, i, 
           paste(cm_matrix[i, j], "\n(", cm_percent[i, j], "%)", sep = ""),
           cex = 1, 
           col = ifelse(cm_matrix[i, j] > max(cm_matrix)/2, "white", "black"),
           font = 2)
    }
  }
  
  # Accuracy in the corner
  mtext(paste("Accuracy:", round(cm$overall["Accuracy"], 4)), 
        side = 3, line = 0.5, adj = 0.05, cex = 0.9, col = "darkblue")
}

plot_roc_curve <- function(probs, actual, model_name, color) {
  roc_obj <- roc(actual, probs)
  auc_val <- auc(roc_obj)
  
  # Plot the ROC curve
  plot(roc_obj, 
       main = paste("ROC Curve -", model_name),
       col = color, 
       lwd = 3,
       print.auc = TRUE, 
       auc.polygon = TRUE,
       auc.polygon.col = adjustcolor(color, 0.2),
       print.auc.col = "black",  # Warna teks AUC hitam
       print.auc.cex = 1.1,      # Ukuran teks AUC
       legacy.axes = TRUE,       # False Positive Rate di sumbu x
       xlab = "False Positive Rate",
       ylab = "True Positive Rate",
       grid = TRUE,              # add a grid
       grid.col = "gray90",
       xlim = c(0, 1),
       ylim = c(0, 1))
  
  # Reference diagonal
  abline(a = 0, b = 1, col = "gray", lty = 2, lwd = 2)
  
  # AUC label in a boxed legend (bty = "o" so bg and box.col take effect)
  legend("bottomright", 
         legend = paste("AUC =", round(auc_val, 4)),
         bty = "o",
         text.col = "darkblue",
         cex = 1.1,
         bg = "white",
         box.col = "black")
  
  return(auc_val)
}

results <- list()
cv_results <- list()

1.0.4 Logistic Regression

# Train with 5-fold CV (seed set beforehand so the folds are reproducible)
set.seed(2024)
logit_cv_model <- train(Y ~ ., 
                        data = train_data_cv,
                        method = "glm",
                        family = binomial,
                        trControl = ctrl_cv5,
                        metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(logit_cv_model$results)
##   parameter       ROC      Sens      Spec      ROCSD    SensSD     SpecSD
## 1      none 0.9058675 0.6705882 0.8364919 0.04109693 0.1798955 0.02594077
# Predictions for the train and test data (from the CV model)
logit_train_pred <- predict(logit_cv_model, train_data_cv)
logit_test_pred <- predict(logit_cv_model, test_data_cv)

# Convert back to 0/1 for evaluation
logit_train_pred_01 <- factor(ifelse(logit_train_pred == "Yes", "1", "0"), 
                              levels = c("0", "1"))
logit_test_pred_01 <- factor(ifelse(logit_test_pred == "Yes", "1", "0"), 
                             levels = c("0", "1"))

# Evaluate with plots
results$logit$train <- evaluate_model(logit_train_pred_01, train_data$Y, 
                                      "Logistic Regression", "Train Data (CV)")
## 
##  ================================================== 
## Logistic Regression - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  53  25
##          1  29 134
## 
## Evaluation Metrics:
## Accuracy    : 0.7759 
## Sensitivity : 0.6463 
## Specificity : 0.8428 
## Precision   : 0.6795 
## F1-Score    : 0.6625
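# Hand check against the printed train confusion matrix (positive class "0"):
# precision = 53/(53+25) = 0.6795, recall = 53/(53+29) = 0.6463, and
# F1 = 2(0.6795)(0.6463)/(0.6795+0.6463) = 0.6625, matching the output above.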

results$logit$test <- evaluate_model(logit_test_pred_01, test_data$Y, 
                                     "Logistic Regression", "Test Data")
## 
##  ================================================== 
## Logistic Regression - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 16  9
##          1  4 30
## 
## Evaluation Metrics:
## Accuracy    : 0.7797 
## Sensitivity : 0.8 
## Specificity : 0.7692 
## Precision   : 0.64 
## F1-Score    : 0.7111

# Store probabilities for the ROC curve
logit_test_prob <- predict(logit_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$logit <- logit_cv_model

1.0.5 SVM

# Train the SVM (radial kernel) with 5-fold CV
set.seed(2024)
svm_cv_model <- train(Y ~ ., 
                      data = train_data_cv,
                      method = "svmRadial",
                      trControl = ctrl_cv5,
                      tuneLength = 5,
                      metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(svm_cv_model$results)
##       sigma    C       ROC      Sens      Spec      ROCSD     SensSD     SpecSD
## 1 0.1484624 0.25 0.8776743 0.6102941 0.8872984 0.04972337 0.07915092 0.06058208
## 2 0.1484624 0.50 0.8858649 0.5852941 0.8872984 0.04122446 0.02565639 0.06058208
## 3 0.1484624 1.00 0.8806407 0.5360294 0.8808468 0.04520850 0.03894283 0.05080185
## 4 0.1484624 2.00 0.8767048 0.5860294 0.8995968 0.04411341 0.09753393 0.05980211
## 5 0.1484624 4.00 0.8798254 0.5492647 0.8997984 0.03714262 0.06333781 0.07101884
# Predict
svm_train_pred <- predict(svm_cv_model, train_data_cv)
svm_test_pred <- predict(svm_cv_model, test_data_cv)

# Convert back to 0/1
svm_train_pred_01 <- factor(ifelse(svm_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
svm_test_pred_01 <- factor(ifelse(svm_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluate
results$svm$train <- evaluate_model(svm_train_pred_01, train_data$Y, "SVM", "Train Data (CV)")
## 
##  ================================================== 
## SVM - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  58  16
##          1  24 143
## 
## Evaluation Metrics:
## Accuracy    : 0.834 
## Sensitivity : 0.7073 
## Specificity : 0.8994 
## Precision   : 0.7838 
## F1-Score    : 0.7436

results$svm$test <- evaluate_model(svm_test_pred_01, test_data$Y, "SVM", "Test Data")
## 
##  ================================================== 
## SVM - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 14  6
##          1  6 33
## 
## Evaluation Metrics:
## Accuracy    : 0.7966 
## Sensitivity : 0.7 
## Specificity : 0.8462 
## Precision   : 0.7 
## F1-Score    : 0.7

# Probabilities for ROC
svm_test_prob <- predict(svm_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$svm <- svm_cv_model
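
With method = "svmRadial", tuneLength = 5 varies only the cost C; caret fixes sigma once via kernlab's sigest() heuristic (0.1485 in the table above). A sketch of a grid that tunes both, with illustrative values (not run):

# svm_grid <- expand.grid(sigma = c(0.05, 0.1, 0.15), C = c(0.25, 0.5, 1, 2, 4))
# train(Y ~ ., data = train_data_cv, method = "svmRadial",
#       trControl = ctrl_cv5, tuneGrid = svm_grid, metric = "ROC")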

1.0.6 Naive Bayes

# Train Naive Bayes with 5-fold CV
set.seed(2024)
nb_cv_model <- train(Y ~ ., 
                     data = train_data_cv,
                     method = "naive_bayes",
                     trControl = ctrl_cv5,
                     tuneLength = 5,
                     metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(nb_cv_model$results)
##   usekernel laplace adjust       ROC      Sens      Spec      ROCSD    SensSD
## 1     FALSE       0      1 0.8800381 0.6345588 0.8556452 0.05872925 0.0932114
## 2      TRUE       0      1 0.8796000 0.5250000 0.9622984 0.07430989 0.1312608
##       SpecSD
## 1 0.07142832
## 2 0.02608921
# Predict
nb_train_pred <- predict(nb_cv_model, train_data_cv)
nb_test_pred <- predict(nb_cv_model, test_data_cv)

# Convert back to 0/1
nb_train_pred_01 <- factor(ifelse(nb_train_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))
nb_test_pred_01 <- factor(ifelse(nb_test_pred == "Yes", "1", "0"), 
                          levels = c("0", "1"))

# Evaluate
results$nb$train <- evaluate_model(nb_train_pred_01, train_data$Y, "Naive Bayes", "Train Data (CV)")
## 
##  ================================================== 
## Naive Bayes - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  52  21
##          1  30 138
## 
## Evaluation Metrics:
## Accuracy    : 0.7884 
## Sensitivity : 0.6341 
## Specificity : 0.8679 
## Precision   : 0.7123 
## F1-Score    : 0.671

results$nb$test <- evaluate_model(nb_test_pred_01, test_data$Y, "Naive Bayes", "Test Data")
## 
##  ================================================== 
## Naive Bayes - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 16  8
##          1  4 31
## 
## Evaluation Metrics:
## Accuracy    : 0.7966 
## Sensitivity : 0.8 
## Specificity : 0.7949 
## Precision   : 0.6667 
## F1-Score    : 0.7273

# Probabilities for ROC
nb_test_prob <- predict(nb_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$nb <- nb_cv_model

1.0.7 KNN

# Train KNN with 5-fold CV
set.seed(2024)
knn_cv_model <- train(Y ~ ., 
                      data = train_data_cv,
                      method = "knn",
                      trControl = ctrl_cv5,
                      tuneGrid = expand.grid(k = c(3, 5, 7, 9, 11)),
                      metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(knn_cv_model$results)
##    k       ROC      Sens      Spec      ROCSD    SensSD     SpecSD
## 1  3 0.8095814 0.5992647 0.8118952 0.03394265 0.1555456 0.08182914
## 2  5 0.8211812 0.6338235 0.8429435 0.03811643 0.1468887 0.03748455
## 3  7 0.8228393 0.5250000 0.8743952 0.04595390 0.1620861 0.07624927
## 4  9 0.8049340 0.5375000 0.8431452 0.05703219 0.1335829 0.08786252
## 5 11 0.8070149 0.5125000 0.8429435 0.05013775 0.1482397 0.11021470
cat("\nBest K value:", knn_cv_model$bestTune$k, "\n")
## 
## Best K value: 7
# Predict
knn_train_pred <- predict(knn_cv_model, train_data_cv)
knn_test_pred <- predict(knn_cv_model, test_data_cv)

# Convert back to 0/1
knn_train_pred_01 <- factor(ifelse(knn_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
knn_test_pred_01 <- factor(ifelse(knn_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluate
results$knn$train <- evaluate_model(knn_train_pred_01, train_data$Y, "KNN", "Train Data (CV)")
## 
##  ================================================== 
## KNN - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  57  20
##          1  25 139
## 
## Evaluation Metrics:
## Accuracy    : 0.8133 
## Sensitivity : 0.6951 
## Specificity : 0.8742 
## Precision   : 0.7403 
## F1-Score    : 0.717

results$knn$test <- evaluate_model(knn_test_pred_01, test_data$Y, "KNN", "Test Data")
## 
##  ================================================== 
## KNN - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 15  9
##          1  5 30
## 
## Evaluation Metrics:
## Accuracy    : 0.7627 
## Sensitivity : 0.75 
## Specificity : 0.7692 
## Precision   : 0.625 
## F1-Score    : 0.6818

# Probabilities for ROC
knn_test_prob <- predict(knn_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$knn <- knn_cv_model
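
KNN is distance-based, and the predictors here sit on very different scales (AP and Age span tens of units while XRay, Stage, and Grade are 0/1), so the large-scale variables dominate the distance. A standardized variant is worth trying (sketch, not run):

# set.seed(2024)
# knn_scaled <- train(Y ~ ., data = train_data_cv, method = "knn",
#                     trControl = ctrl_cv5, metric = "ROC",
#                     tuneGrid = expand.grid(k = c(3, 5, 7, 9, 11)),
#                     preProcess = c("center", "scale"))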

1.0.8 Decision Tree

# Train the decision tree with 5-fold CV
set.seed(2024)
dt_cv_model <- train(Y ~ ., 
                     data = train_data_cv,
                     method = "rpart",
                     trControl = ctrl_cv5,
                     tuneLength = 10,
                     metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(dt_cv_model$results)
##            cp       ROC      Sens      Spec      ROCSD    SensSD     SpecSD
## 1  0.00000000 0.7937404 0.6654412 0.7739919 0.11028534 0.2605111 0.09645971
## 2  0.03387534 0.7685699 0.7227941 0.8046371 0.10308187 0.1447335 0.04358874
## 3  0.06775068 0.7586089 0.6852941 0.7796371 0.09374303 0.1467045 0.06733162
## 4  0.10162602 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 5  0.13550136 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 6  0.16937669 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 7  0.20325203 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 8  0.23712737 0.6823648 0.5852941 0.7794355 0.12375579 0.3599507 0.12619422
## 9  0.27100271 0.6823648 0.5852941 0.7794355 0.12375579 0.3599507 0.12619422
## 10 0.30487805 0.5636148 0.2352941 0.8919355 0.08739931 0.3221897 0.14865859
# Plot decision tree
par(mfrow = c(1, 1))
rpart.plot(dt_cv_model$finalModel, 
           main = "Decision Tree (CV Optimized)",
           type = 4, 
           extra = 104, 
           box.palette = "GnBu")

rpart.plot(dt_cv_model$finalModel, 
           main = "Decision Tree Structure",
           type = 0,  
           extra = 101,
           box.palette = "RdYlGn", # Red-Yellow-Green palette
           shadow.col = "gray",
           branch.lty = 3,
           fallen.leaves = TRUE,
           cex = 0.8)

# Plot 2: decision tree with class probabilities
rpart.plot(dt_cv_model$finalModel, 
           main = "Decision Tree with Class Probabilities",
           type = 2,  # Display split variable names
           extra = 104, # Display probability per class and percentage
           box.palette = "GnBu",
           shadow.col = "gray",
           branch = 0.3,
           under = TRUE,
           cex = 0.7)

# Predict
dt_train_pred <- predict(dt_cv_model, train_data_cv)
dt_test_pred <- predict(dt_cv_model, test_data_cv)

# Convert back to 0/1
dt_train_pred_01 <- factor(ifelse(dt_train_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))
dt_test_pred_01 <- factor(ifelse(dt_test_pred == "Yes", "1", "0"), 
                          levels = c("0", "1"))

# Evaluate
results$dt$train <- evaluate_model(dt_train_pred_01, train_data$Y, "Decision Tree", "Train Data (CV)")
## 
##  ================================================== 
## Decision Tree - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  58  13
##          1  24 146
## 
## Evaluation Metrics:
## Accuracy    : 0.8465 
## Sensitivity : 0.7073 
## Specificity : 0.9182 
## Precision   : 0.8169 
## F1-Score    : 0.7582

results$dt$test <- evaluate_model(dt_test_pred_01, test_data$Y, "Decision Tree", "Test Data")
## 
##  ================================================== 
## Decision Tree - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 12  5
##          1  8 34
## 
## Evaluation Metrics:
## Accuracy    : 0.7797 
## Sensitivity : 0.6 
## Specificity : 0.8718 
## Precision   : 0.7059 
## F1-Score    : 0.6486

# Probabilities for ROC
dt_test_prob <- predict(dt_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$dt <- dt_cv_model

1.0.9 Random Forest

# Train the random forest with 5-fold CV
set.seed(2024)
rf_cv_model <- train(Y ~ ., 
                     data = train_data_cv,
                     method = "rf",
                     trControl = ctrl_cv5,
                     tuneLength = 3,
                     ntree = 500,
                     metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(rf_cv_model$results)
##   mtry       ROC      Sens      Spec      ROCSD     SensSD     SpecSD
## 1    2 0.9443122 0.7801471 0.9179435 0.02359530 0.05656271 0.06166356
## 2    3 0.9380793 0.8051471 0.9054435 0.02137653 0.02310626 0.05902732
## 3    5 0.9219940 0.7566176 0.9118952 0.02140848 0.09630669 0.05600250
# Plot variable importance
par(mfrow = c(1, 1))
rf_imp <- varImp(rf_cv_model, scale = FALSE)  # renamed to avoid masking randomForest::varImpPlot
plot(rf_imp, 
     main = "Random Forest - Variable Importance (CV)",
     top = 5)

# Predict
rf_train_pred <- predict(rf_cv_model, train_data_cv)
rf_test_pred <- predict(rf_cv_model, test_data_cv)

# Convert back to 0/1
rf_train_pred_01 <- factor(ifelse(rf_train_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))
rf_test_pred_01 <- factor(ifelse(rf_test_pred == "Yes", "1", "0"), 
                          levels = c("0", "1"))

# Evaluate
results$rf$train <- evaluate_model(rf_train_pred_01, train_data$Y, "Random Forest", "Train Data (CV)")
## 
##  ================================================== 
## Random Forest - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  80   0
##          1   2 159
## 
## Evaluation Metrics:
## Accuracy    : 0.9917 
## Sensitivity : 0.9756 
## Specificity : 1 
## Precision   : 1 
## F1-Score    : 0.9877

results$rf$test <- evaluate_model(rf_test_pred_01, test_data$Y, "Random Forest", "Test Data")
## 
##  ================================================== 
## Random Forest - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 17  3
##          1  3 36
## 
## Evaluation Metrics:
## Accuracy    : 0.8983 
## Sensitivity : 0.85 
## Specificity : 0.9231 
## Precision   : 0.85 
## F1-Score    : 0.85

# Probabilities for ROC
rf_test_prob <- predict(rf_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$rf <- rf_cv_model
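
Random forests also carry a built-in out-of-bag (OOB) error estimate that needs no separate validation split; it can be read off the final fit (sketch, output not shown):

# print(rf_cv_model$finalModel)  # reports the OOB error rate and confusion matrix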

1.0.10 Gradient Boosting

# Train gradient boosting with 5-fold CV
set.seed(2024)
gbm_cv_model <- train(Y ~ ., 
                      data = train_data_cv,
                      method = "gbm",
                      trControl = ctrl_cv5,
                      verbose = FALSE,
                      tuneGrid = expand.grid(
                        n.trees = c(100, 200, 300),
                        interaction.depth = c(3, 5, 7),
                        shrinkage = c(0.01, 0.1, 0.3),
                        n.minobsinnode = 10
                      ),
                      metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(gbm_cv_model$results)
##    shrinkage interaction.depth n.minobsinnode n.trees       ROC      Sens
## 1       0.01                 3             10     100 0.8618470 0.5117647
## 10      0.10                 3             10     100 0.9173335 0.6845588
## 19      0.30                 3             10     100 0.9171705 0.7683824
## 4       0.01                 5             10     100 0.8648615 0.4867647
## 13      0.10                 5             10     100 0.9261571 0.7816176
## 22      0.30                 5             10     100 0.9117158 0.7566176
## 7       0.01                 7             10     100 0.8733486 0.5227941
## 16      0.10                 7             10     100 0.9153596 0.7102941
## 25      0.30                 7             10     100 0.9268627 0.7713235
## 2       0.01                 3             10     200 0.8743885 0.5367647
## 11      0.10                 3             10     200 0.9305866 0.7455882
## 20      0.30                 3             10     200 0.9236302 0.7566176
## 5       0.01                 5             10     200 0.8867981 0.5735294
## 14      0.10                 5             10     200 0.9350073 0.7933824
## 23      0.30                 5             10     200 0.9223049 0.7558824
## 8       0.01                 7             10     200 0.8872131 0.5852941
## 17      0.10                 7             10     200 0.9324093 0.7816176
## 26      0.30                 7             10     200 0.9312485 0.7227941
## 3       0.01                 3             10     300 0.8853623 0.5727941
## 12      0.10                 3             10     300 0.9276647 0.7595588
## 21      0.30                 3             10     300 0.9183082 0.7448529
## 6       0.01                 5             10     300 0.8938197 0.6095588
## 15      0.10                 5             10     300 0.9388572 0.7926471
## 24      0.30                 5             10     300 0.9142715 0.7823529
## 9       0.01                 7             10     300 0.8961760 0.6227941
## 18      0.10                 7             10     300 0.9351696 0.7698529
## 27      0.30                 7             10     300 0.9248703 0.7580882
##         Spec      ROCSD     SensSD     SpecSD
## 1  0.8806452 0.03405207 0.13335507 0.09459599
## 10 0.9058468 0.03148248 0.13095153 0.02139140
## 19 0.9181452 0.04631778 0.09765165 0.01771155
## 4  0.8993952 0.03908242 0.14711855 0.03412116
## 13 0.9247984 0.03494813 0.15790858 0.05208192
## 22 0.9245968 0.02960686 0.11285028 0.04726845
## 7  0.9245968 0.03678181 0.18399624 0.04726845
## 16 0.9308468 0.03273220 0.11430809 0.03415242
## 25 0.9308468 0.03299204 0.13115780 0.03415242
## 2  0.8679435 0.03798668 0.14203267 0.02596622
## 11 0.9372984 0.02841765 0.07116606 0.03100024
## 20 0.9241935 0.03401371 0.05844316 0.04847837
## 5  0.8806452 0.04172968 0.14840833 0.02544839
## 14 0.9245968 0.03170012 0.08721066 0.02773918
## 23 0.9243952 0.04607032 0.07696917 0.02848563
## 8  0.8741935 0.03754756 0.16834462 0.02217055
## 17 0.9435484 0.03563974 0.11225584 0.02590745
## 26 0.9306452 0.04161968 0.12283484 0.02689285
## 3  0.8679435 0.04221019 0.14706801 0.02596622
## 12 0.9372984 0.03541226 0.14078050 0.03100024
## 21 0.9308468 0.03956110 0.10387575 0.04629201
## 6  0.8679435 0.04026280 0.10732524 0.02596622
## 15 0.9183468 0.02993190 0.10707938 0.04720175
## 24 0.9118952 0.04953136 0.08517201 0.04088320
## 9  0.8741935 0.03797336 0.17125409 0.02217055
## 18 0.9435484 0.03275198 0.10163962 0.02590745
## 27 0.9243952 0.04648541 0.09717990 0.03605152
# Predict
gbm_train_pred <- predict(gbm_cv_model, train_data_cv)
gbm_test_pred <- predict(gbm_cv_model, test_data_cv)

# Convert back to 0/1
gbm_train_pred_01 <- factor(ifelse(gbm_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
gbm_test_pred_01 <- factor(ifelse(gbm_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluate
results$gbm$train <- evaluate_model(gbm_train_pred_01, train_data$Y, "Gradient Boosting", "Train Data (CV)")
## 
##  ================================================== 
## Gradient Boosting - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  81   0
##          1   1 159
## 
## Evaluation Metrics:
## Accuracy    : 0.9959 
## Sensitivity : 0.9878 
## Specificity : 1 
## Precision   : 1 
## F1-Score    : 0.9939

results$gbm$test <- evaluate_model(gbm_test_pred_01, test_data$Y, "Gradient Boosting", "Test Data")
## 
##  ================================================== 
## Gradient Boosting - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 17  3
##          1  3 36
## 
## Evaluation Metrics:
## Accuracy    : 0.8983 
## Sensitivity : 0.85 
## Specificity : 0.9231 
## Precision   : 0.85 
## F1-Score    : 0.85

# Probabilities for ROC
gbm_test_prob <- predict(gbm_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$gbm <- gbm_cv_model
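
Shrinkage and the number of trees trade off: smaller shrinkage needs more trees, which matches the table above, where the 0.01 rows improve steadily with n.trees but still trail the 0.1 rows at 300 trees. The selected combination can be inspected directly (sketch, output not shown):

# print(gbm_cv_model$bestTune)  # n.trees, interaction.depth, shrinkage, n.minobsinnode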

1.0.11 Artificial Neural Network

# Min-max normalization for the ANN. The test set is scaled with the
# *training* minima and maxima so that no test information leaks into scaling
x_cols <- setdiff(names(train_data_cv), "Y")
train_min <- sapply(train_data_cv[x_cols], min)
train_max <- sapply(train_data_cv[x_cols], max)
normalize <- function(x, lo, hi) (x - lo) / (hi - lo)

train_norm_cv <- as.data.frame(Map(normalize, train_data_cv[x_cols], train_min, train_max))
test_norm_cv <- as.data.frame(Map(normalize, test_data_cv[x_cols], train_min, train_max))
train_norm_cv$Y <- train_data_cv$Y
test_norm_cv$Y <- test_data_cv$Y

# Train the neural network with 5-fold CV
set.seed(2024)
ann_cv_model <- train(Y ~ ., 
                      data = train_norm_cv,
                      method = "nnet",
                      trControl = ctrl_cv5,
                      tuneGrid = expand.grid(
                        size = c(5, 10, 15),
                        decay = c(0.001, 0.01, 0.1)
                      ),
                      maxit = 200,
                      trace = FALSE,
                      metric = "ROC")

# Show the CV results
cat("\nCross-Validation Results:\n")
## 
## Cross-Validation Results:
print(ann_cv_model$results)
##   size decay       ROC      Sens      Spec      ROCSD     SensSD     SpecSD
## 1    5 0.001 0.8951861 0.7808824 0.8683468 0.02674271 0.08065644 0.07077876
## 2    5 0.010 0.8819264 0.7220588 0.8497984 0.06482386 0.17411106 0.08051905
## 3    5 0.100 0.9021395 0.6602941 0.8491935 0.04501207 0.20117720 0.09191813
## 4   10 0.001 0.8667954 0.7933824 0.8685484 0.02826682 0.05136544 0.09452076
## 5   10 0.010 0.8978501 0.7580882 0.8685484 0.03534105 0.12370654 0.06737688
## 6   10 0.100 0.8986580 0.6602941 0.8491935 0.04605225 0.20117720 0.09191813
## 7   15 0.001 0.8939542 0.7948529 0.9062500 0.05036092 0.10455019 0.06250000
## 8   15 0.010 0.8837757 0.7338235 0.8560484 0.04038894 0.15365712 0.06436395
## 9   15 0.100 0.8997839 0.6602941 0.8491935 0.04601113 0.20117720 0.09191813
# Predict
ann_train_pred <- predict(ann_cv_model, train_norm_cv)
ann_test_pred <- predict(ann_cv_model, test_norm_cv)

# Convert back to 0/1
ann_train_pred_01 <- factor(ifelse(ann_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
ann_test_pred_01 <- factor(ifelse(ann_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluate
results$ann$train <- evaluate_model(ann_train_pred_01, train_data$Y, "Neural Network", "Train Data (CV)")
## 
##  ================================================== 
## Neural Network - Train Data (CV) 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction   0   1
##          0  53  21
##          1  29 138
## 
## Evaluation Metrics:
## Accuracy    : 0.7925 
## Sensitivity : 0.6463 
## Specificity : 0.8679 
## Precision   : 0.7162 
## F1-Score    : 0.6795

results$ann$test <- evaluate_model(ann_test_pred_01, test_data$Y, "Neural Network", "Test Data")
## 
##  ================================================== 
## Neural Network - Test Data 
## ================================================== 
## Confusion Matrix:
##           Reference
## Prediction  0  1
##          0 16  9
##          1  4 30
## 
## Evaluation Metrics:
## Accuracy    : 0.7797 
## Sensitivity : 0.8 
## Specificity : 0.7692 
## Precision   : 0.64 
## F1-Score    : 0.7111

# Probabilities for ROC
ann_test_prob <- predict(ann_cv_model, test_norm_cv, type = "prob")[, "Yes"]
cv_results$ann <- ann_cv_model
library(NeuralNetTools) 
# plotnet() is called on the underlying nnet fit; a two-class nnet has a
# single output unit, the predicted probability of the second level ("Yes")
plotnet(ann_cv_model$finalModel,
        alpha_val = 0.6,
        circle_col = 'lightblue',
        bord_col = 'black',
        max_sp = TRUE,
        rel_rsc = c(2, 8),
        x_names = names(train_norm_cv)[-ncol(train_norm_cv)],
        y_names = "Yes",
        main = "Neural Network Architecture")

1.0.12 Model Comparison

cat("\n", strrep("=", 60))
## 
##  ============================================================
cat("\nROC CURVES COMPARISON (ALL MODELS WITH CV)")
## 
## ROC CURVES COMPARISON (ALL MODELS WITH CV)
cat("\n", strrep("=", 60), "\n")
## 
##  ============================================================
# Set up the ROC plot
par(mfrow = c(1, 1), mar = c(5, 5, 4, 2) + 0.1)

# Contrasting, easily distinguished colors
colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", 
            "#FF7F00", "#FFFF33", "#A65628", "#F781BF")

# Start with an empty plotting canvas
plot(NULL, xlim = c(0, 1), ylim = c(0, 1),
     xlab = "False Positive Rate", 
     ylab = "True Positive Rate",
     main = "ROC Curves - All Models (5-Fold CV)",
     cex.main = 1.5,
     cex.lab = 1.2)
grid(col = "gray90", lty = 2)
abline(a = 0, b = 1, col = "gray50", lty = 2, lwd = 2)

# List to store AUC values
auc_values <- list()

# Plot a ROC curve for each model
model_names <- c("Logistic Regression", "SVM", "Naive Bayes", "KNN",
                 "Decision Tree", "Random Forest", "Gradient Boosting", "Neural Network")
probs_list <- list(logit_test_prob, svm_test_prob, nb_test_prob, knn_test_prob,
                   dt_test_prob, rf_test_prob, gbm_test_prob, ann_test_prob)

# Use the same actual values for all models
actual_numeric <- as.numeric(test_data$Y) - 1

for(i in 1:length(model_names)) {
  # Compute the ROC object and AUC
  roc_obj <- roc(actual_numeric, probs_list[[i]])
  auc_val <- auc(roc_obj)
  auc_values[[model_names[i]]] <- auc_val
  
  # Plot the ROC curve (the x-axis is the false positive rate = 1 - specificity)
  lines(1 - roc_obj$specificities, roc_obj$sensitivities, 
        col = colors[i], lwd = 3)
  
  # Mark the optimal cut-off (Youden index = sensitivity + specificity - 1)
  youden_index <- roc_obj$sensitivities + roc_obj$specificities - 1
  optimal_idx <- which.max(youden_index)
  points(1 - roc_obj$specificities[optimal_idx], 
         roc_obj$sensitivities[optimal_idx], 
         col = colors[i], pch = 19, cex = 1.5)
}
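
# Note (sketch): the Youden-optimal cut-off marked above can also be obtained
# directly from pROC, e.g. coords(roc_obj, x = "best", best.method = "youden"),
# which returns the threshold together with its specificity and sensitivity.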

# Add a legend with matching colors
legend("bottomright", 
       legend = paste(model_names, " (AUC=", 
                      round(unlist(auc_values), 4), ")", sep = ""),
       col = colors, 
       lwd = 3, 
       cex = 0.8,
       bg = "white",
       box.col = "black",
       text.col = "black")

cat("\n", strrep("=", 60))
## 
##  ============================================================
cat("\nACCURACY COMPARISON TABLE (5-FOLD CV)")
## 
## ACCURACY COMPARISON TABLE (5-FOLD CV)
cat("\n", strrep("=", 60), "\n")
## 
##  ============================================================
# Build a comparison data frame
comparison_df <- data.frame(
  Model = model_names,
  Train_Accuracy = c(
    results$logit$train$accuracy,
    results$svm$train$accuracy,
    results$nb$train$accuracy,
    results$knn$train$accuracy,
    results$dt$train$accuracy,
    results$rf$train$accuracy,
    results$gbm$train$accuracy,
    results$ann$train$accuracy
  ),
  Test_Accuracy = c(
    results$logit$test$accuracy,
    results$svm$test$accuracy,
    results$nb$test$accuracy,
    results$knn$test$accuracy,
    results$dt$test$accuracy,
    results$rf$test$accuracy,
    results$gbm$test$accuracy,
    results$ann$test$accuracy
  ),
  AUC = unlist(auc_values)
)

# Add an overfitting column (train accuracy minus test accuracy)
comparison_df$Overfitting <- comparison_df$Train_Accuracy - comparison_df$Test_Accuracy

# Add the test F1-score column
comparison_df$F1_Score <- c(
  results$logit$test$f1_score,
  results$svm$test$f1_score,
  results$nb$test$f1_score,
  results$knn$test$f1_score,
  results$dt$test$f1_score,
  results$rf$test$f1_score,
  results$gbm$test$f1_score,
  results$ann$test$f1_score
)

# Sort by AUC, highest first
comparison_df <- comparison_df[order(-comparison_df$AUC), ]

# Round for tidier printing
print_df <- comparison_df
print_df[, 2:6] <- round(print_df[, 2:6], 4)
print(print_df)
##                                   Model Train_Accuracy Test_Accuracy    AUC
## Random Forest             Random Forest         0.9917        0.8983 0.9679
## Gradient Boosting     Gradient Boosting         0.9959        0.8983 0.9410
## Naive Bayes                 Naive Bayes         0.7884        0.7966 0.8897
## Neural Network           Neural Network         0.7925        0.7797 0.8885
## SVM                                 SVM         0.8340        0.7966 0.8833
## Logistic Regression Logistic Regression         0.7759        0.7797 0.8705
## Decision Tree             Decision Tree         0.8465        0.7797 0.8596
## KNN                                 KNN         0.8133        0.7627 0.8532
##                     Overfitting F1_Score
## Random Forest            0.0934   0.8500
## Gradient Boosting        0.0975   0.8500
## Naive Bayes             -0.0082   0.7273
## Neural Network           0.0129   0.7111
## SVM                      0.0374   0.7000
## Logistic Regression     -0.0037   0.7111
## Decision Tree            0.0668   0.6486
## KNN                      0.0506   0.6818
barplot(height = t(as.matrix(comparison_df[, c("Train_Accuracy", "Test_Accuracy")])),
        beside = TRUE,
        names.arg = comparison_df$Model,
        col = c("#66C2A5", "#FC8D62"),
        las = 2,
        ylim = c(0, 1),
        main = "Train vs Test Accuracy (5-Fold CV)",
        ylab = "Accuracy",
        cex.names = 0.8,
        border = NA)
legend("topright", 
       legend = c("Train (CV)", "Test"), 
       fill = c("#66C2A5", "#FC8D62"),
       bg = "white",
       box.col = "black")
grid(nx = NA, ny = NULL, col = "gray90")

bp_auc <- barplot(comparison_df$AUC,
        names.arg = comparison_df$Model,
        col = "#8DA0CB",
        las = 2,
        ylim = c(0, 1),
        main = "AUC Values Comparison (5-Fold CV)",
        ylab = "AUC",
        cex.names = 0.8,
        border = NA)
abline(h = 0.5, col = "red", lty = 2, lwd = 2)
# barplot() returns the bar midpoints; use them to place the value labels
text(x = bp_auc, 
     y = comparison_df$AUC,
     labels = round(comparison_df$AUC, 3),
     pos = 3, cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")

bp_f1 <- barplot(comparison_df$F1_Score,
        names.arg = comparison_df$Model,
        col = "#E78AC3",
        las = 2,
        ylim = c(0, 1),
        main = "F1-Score Comparison (5-Fold CV)",
        ylab = "F1-Score",
        cex.names = 0.8,
        border = NA)
text(x = bp_f1, 
     y = comparison_df$F1_Score,
     labels = round(comparison_df$F1_Score, 3),
     pos = 3, cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")

bp_of <- barplot(comparison_df$Overfitting,
        names.arg = comparison_df$Model,
        col = ifelse(comparison_df$Overfitting > 0.1, "#FB8072", "#B3DE69"),
        las = 2,
        ylim = c(min(comparison_df$Overfitting) - 0.05, 
                 max(comparison_df$Overfitting) + 0.05),
        main = "Overfitting (5-Fold CV)",
        ylab = "Train Acc - Test Acc",
        cex.names = 0.8,
        border = NA)
abline(h = 0, col = "black", lwd = 1)
abline(h = 0.05, col = "orange", lty = 2, lwd = 1)
abline(h = 0.1, col = "red", lty = 2, lwd = 1)
text(x = bp_of, 
     y = comparison_df$Overfitting,
     labels = round(comparison_df$Overfitting, 3),
     pos = ifelse(comparison_df$Overfitting >= 0, 3, 1), 
     cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")

# CV summary: with summaryFunction = twoClassSummary the resampling results
# contain ROC/Sens/Spec (there is no Accuracy column), so ROC is summarized
cv_summary <- data.frame(
  Model = model_names,
  CV_ROC = sapply(cv_results, function(x) max(x$results$ROC, na.rm = TRUE)),
  Best_Params = sapply(cv_results, function(x) {
    params <- x$bestTune
    paste(names(params), "=", params, collapse = ", ")
  })
)

2 Regression Modeling

library(AER)
data("CreditCard")
data <- CreditCard
glimpse(data)
## Rows: 1,319
## Columns: 12
## $ card        <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, no,…
## $ reports     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 3…
## $ age         <dbl> 37.66667, 33.25000, 33.66667, 30.50000, 32.16667, 23.25000…
## $ income      <dbl> 4.5200, 2.4200, 4.5000, 2.5400, 9.7867, 2.5000, 3.9600, 2.…
## $ share       <dbl> 0.0332699100, 0.0052169420, 0.0041555560, 0.0652137800, 0.…
## $ expenditure <dbl> 124.983300, 9.854167, 15.000000, 137.869200, 546.503300, 9…
## $ owner       <fct> yes, no, yes, no, yes, no, no, yes, yes, no, yes, yes, yes…
## $ selfemp     <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no…
## $ dependents  <dbl> 3, 3, 4, 0, 2, 0, 2, 0, 0, 0, 1, 2, 1, 0, 2, 1, 2, 2, 0, 0…
## $ months      <dbl> 54, 34, 58, 25, 64, 54, 7, 77, 97, 65, 24, 36, 42, 26, 120…
## $ majorcards  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1…
## $ active      <dbl> 12, 13, 5, 7, 5, 1, 5, 3, 6, 18, 20, 0, 12, 3, 5, 22, 0, 8…
# Load required libraries
library(tidyverse)
library(caret)
library(Metrics)
library(ggplot2)
library(splines)
library(pROC)
library(e1071)
library(randomForest)
library(gbm)
library(rpart)
library(rpart.plot)
library(nnet)
library(kknn)
library(glmnet)
library(patchwork)
library(NeuralNetTools)

cat("DATA PREPARATION\n")
## DATA PREPARATION
cat(strrep("=", 60), "\n")
## ============================================================
# Inspect the data structure
cat("Data structure:\n")
## Data structure:
print(str(data))
## 'data.frame':    1319 obs. of  12 variables:
##  $ card       : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ reports    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ age        : num  37.7 33.2 33.7 30.5 32.2 ...
##  $ income     : num  4.52 2.42 4.5 2.54 9.79 ...
##  $ share      : num  0.03327 0.00522 0.00416 0.06521 0.06705 ...
##  $ expenditure: num  124.98 9.85 15 137.87 546.5 ...
##  $ owner      : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 1 2 2 1 ...
##  $ selfemp    : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ dependents : num  3 3 4 0 2 0 2 0 0 0 ...
##  $ months     : num  54 34 58 25 64 54 7 77 97 65 ...
##  $ majorcards : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ active     : num  12 13 5 7 5 1 5 3 6 18 ...
## NULL
# Check that 'expenditure' exists in the data
if(!"expenditure" %in% names(data)) {
  stop("Column 'expenditure' not found in the data.")
}

# Ensure expenditure is numeric
if(is.factor(data$expenditure)) {
  data$expenditure <- as.numeric(as.character(data$expenditure))
  cat("\nExpenditure converted from factor to numeric\n")
}

cat("\n", strrep("=", 60))
## 
##  ============================================================
cat("\nSPLIT DATA (80% TRAIN - 20% TEST)")
## 
## SPLIT DATA (80% TRAIN - 20% TEST)
cat("\n", strrep("=", 60), "\n")
## 
##  ============================================================
set.seed(2024)
train_index <- createDataPartition(data$expenditure, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

cat("\nData split results:\n")
## 
## Data split results:
cat("Total data  :", nrow(data), "observations\n")
## Total data  : 1319 observations
cat("Train data  :", nrow(train_data), "observations (", round(nrow(train_data)/nrow(data)*100, 1), "%)\n")
## Train data  : 1057 observations ( 80.1 %)
cat("Test data   :", nrow(test_data), "observations (", round(nrow(test_data)/nrow(data)*100, 1), "%)\n")
## Test data   : 262 observations ( 19.9 %)
# Summarize the expenditure distribution
cat("\nExpenditure statistics:\n")
## 
## Expenditure statistics:
summary_stats <- data.frame(
  Dataset = c("Train", "Test", "Full"),
  N = c(nrow(train_data), nrow(test_data), nrow(data)),
  Mean = c(mean(train_data$expenditure), mean(test_data$expenditure), mean(data$expenditure)),
  SD = c(sd(train_data$expenditure), sd(test_data$expenditure), sd(data$expenditure)),
  Min = c(min(train_data$expenditure), min(test_data$expenditure), min(data$expenditure)),
  Max = c(max(train_data$expenditure), max(test_data$expenditure), max(data$expenditure))
)
print(summary_stats)
##   Dataset    N     Mean       SD Min      Max
## 1   Train 1057 183.6665 269.3513   0 3099.505
## 2    Test  262 190.6672 283.9563   0 2001.547
## 3    Full 1319 185.0571 272.2189   0 3099.505
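
For a numeric outcome, createDataPartition() stratifies the split on quantile groups of expenditure, which is why the train and test summaries above are close. A quick check of the quantiles (sketch, not run):

# rbind(train = quantile(train_data$expenditure),
#       test  = quantile(test_data$expenditure))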
ctrl_cv5 <- trainControl(method = "cv", 
                         number = 5,
                         savePredictions = TRUE,
                         verboseIter = FALSE)

cat("\nCross-Validation setup: 5-fold CV\n")
## 
## Cross-Validation setup: 5-fold CV
evaluate_regression <- function(predictions, actual, model_name, data_type, plot_residuals = TRUE) {
  # Compute the error metrics
  rmse_val <- rmse(actual, predictions)
  mae_val <- mae(actual, predictions)
  mse_val <- mse(actual, predictions)
  r2_val <- cor(actual, predictions)^2  # squared correlation; see the note below
  
  # Print the results
  cat("\n", strrep("=", 50), "\n")
  cat(model_name, "-", data_type, "\n")
  cat(strrep("=", 50), "\n")
  
  cat("RMSE:", round(rmse_val, 4), "\n")
  cat("MAE :", round(mae_val, 4), "\n")
  cat("MSE :", round(mse_val, 4), "\n")
  cat("R²  :", round(r2_val, 4), "\n")
  
  # Plot if requested
  if(plot_residuals) {
    plot_regression_results(predictions, actual, model_name, data_type)
  }
  
  return(list(
    rmse = rmse_val,
    mae = mae_val,
    mse = mse_val,
    r2 = r2_val,
    predictions = predictions
  ))
}
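
# Note (sketch): cor(actual, predictions)^2 measures linear association and
# ignores systematic bias; an SSE-based alternative that penalizes bias is
# 1 - sum((actual - predictions)^2) / sum((actual - mean(actual))^2).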

plot_regression_results <- function(predicted, actual, model_name, data_type) {
  # Setup layout
  par(mfrow = c(2, 2), mar = c(4, 4, 3, 2))
  
  plot(actual, predicted,
       main = paste("Predicted vs Actual -", model_name),
       xlab = "Actual Expenditure",
       ylab = "Predicted Expenditure",
       pch = 19, col = "blue", cex = 0.7)
  abline(0, 1, col = "red", lwd = 2)
  grid()
  
  residuals <- actual - predicted
  fitted <- predicted
  plot(fitted, residuals,
       main = "Residuals vs Fitted",
       xlab = "Fitted Values",
       ylab = "Residuals",
       pch = 19, col = "darkgreen", cex = 0.7)
  abline(h = 0, col = "red", lwd = 2)
  grid()
  
  hist(residuals,
       main = "Distribution of Residuals",
       xlab = "Residuals",
       col = "lightblue",
       border = "black",
       breaks = 30)
  abline(v = 0, col = "red", lwd = 2)
  
  qqnorm(residuals, main = "Q-Q Plot of Residuals")
  qqline(residuals, col = "red", lwd = 2)
  
  # Reset layout
  par(mfrow = c(1, 1))
}

results <- list()
cv_results <- list()

2.0.1 Polynomial Regression

predictors <- names(train_data)[names(train_data) != "expenditure"]

if(length(predictors) == 0) {
  stop("No predictors found in the data.")
}

# Keep only numeric predictors for polynomial regression
numeric_predictors <- predictors[sapply(train_data[predictors], is.numeric)]
if(length(numeric_predictors) == 0) {
  cat("No numeric predictors available for polynomial regression.\n")
  best_predictor <- predictors[1]
} else {
  # Correlate each numeric predictor with expenditure
  correlations <- sapply(numeric_predictors, function(x) {
    cor(train_data[[x]], train_data$expenditure, use = "complete.obs")
  })
  best_predictor <- names(which.max(abs(correlations)))
  cat("Using the numeric predictor with the highest correlation:", best_predictor, 
      "(correlation =", round(correlations[best_predictor], 3), ")\n")
}
## Using the numeric predictor with the highest correlation: share (correlation = 0.835 )
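
poly(x, degree, raw = TRUE) fits raw powers of share, whose columns are highly correlated; the default orthogonal polynomials give identical fitted values with a better-conditioned design matrix. Sketch (not run):

# lm(expenditure ~ poly(share, 3), data = train_data)  # orthogonal basis, same fit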
# Fit polynomials of increasing degree and track the best test RMSE
polynomial_results <- data.frame()
best_poly_degree <- 2
best_poly_rmse <- Inf

for(degree in 2:5) {
  cat("\nPolynomial Degree:", degree, "\n")
  
  # Build the model formula
  formula_poly <- as.formula(paste("expenditure ~ poly(", best_predictor, ", ", degree, ", raw=TRUE)"))
  
  tryCatch({
    # Train with 5-fold CV
    poly_model <- train(formula_poly,
                        data = train_data,
                        method = "lm",
                        trControl = ctrl_cv5)
    
    # Predict
    poly_train_pred <- predict(poly_model, train_data)
    poly_test_pred <- predict(poly_model, test_data)
    
    # Evaluate
    train_eval <- evaluate_regression(poly_train_pred, train_data$expenditure, 
                                     paste("Poly Degree", degree), "Train Data", plot_residuals = FALSE)
    test_eval <- evaluate_regression(poly_test_pred, test_data$expenditure, 
                                    paste("Poly Degree", degree), "Test Data", plot_residuals = FALSE)
    
    # Store the results
    poly_result <- data.frame(
      Degree = degree,
      Train_RMSE = train_eval$rmse,
      Test_RMSE = test_eval$rmse,
      Overfitting = test_eval$rmse - train_eval$rmse,  # positive = worse on test
      R2 = test_eval$r2
    )
    polynomial_results <- rbind(polynomial_results, poly_result)
    
    # Update best model
    if(test_eval$rmse < best_poly_rmse) {
      best_poly_rmse <- test_eval$rmse
      best_poly_degree <- degree
      best_poly_model <- poly_model
    }
  }, error = function(e) {
    cat("Error untuk degree", degree, ":", e$message, "\n")
  })
}
## 
## Polynomial Degree: 2 
## 
##  ================================================== 
## Poly Degree 2 - Train Data 
## ================================================== 
## RMSE: 145.9464 
## MAE : 64.5504 
## MSE : 21300.35 
## R²  : 0.7061 
## 
##  ================================================== 
## Poly Degree 2 - Test Data 
## ================================================== 
## RMSE: 146.3213 
## MAE : 67.0184 
## MSE : 21409.92 
## R²  : 0.7447 
## 
## Polynomial Degree: 3 
## 
##  ================================================== 
## Poly Degree 3 - Train Data 
## ================================================== 
## RMSE: 145.9035 
## MAE : 65.0803 
## MSE : 21287.83 
## R²  : 0.7063 
## 
##  ================================================== 
## Poly Degree 3 - Test Data 
## ================================================== 
## RMSE: 146.7144 
## MAE : 67.5132 
## MSE : 21525.11 
## R²  : 0.7442 
## 
## Polynomial Degree: 4 
## 
##  ================================================== 
## Poly Degree 4 - Train Data 
## ================================================== 
## RMSE: 145.8176 
## MAE : 64.1362 
## MSE : 21262.78 
## R²  : 0.7066 
## 
##  ================================================== 
## Poly Degree 4 - Test Data 
## ================================================== 
## RMSE: 146.4865 
## MAE : 67.0277 
## MSE : 21458.28 
## R²  : 0.7446 
## 
## Polynomial Degree: 5 
## 
##  ================================================== 
## Poly Degree 5 - Train Data 
## ================================================== 
## RMSE: 145.525 
## MAE : 64.0101 
## MSE : 21177.53 
## R²  : 0.7078 
## 
##  ================================================== 
## Poly Degree 5 - Test Data 
## ================================================== 
## RMSE: 148.2246 
## MAE : 67.3913 
## MSE : 21970.53 
## R²  : 0.7393
# Show results for all degrees
if(nrow(polynomial_results) > 0) {
  cat("\nPolynomial Regression Results:\n")
  print(polynomial_results)
  
  # Report the best degree
  cat("\nBest Polynomial Model: Degree", best_poly_degree, "with Test RMSE =", 
      round(best_poly_rmse, 4), "\n")
  
  # Plot the comparison
  if(require(ggplot2)) {
    p <- ggplot(polynomial_results, aes(x = Degree)) +
      geom_line(aes(y = Train_RMSE, color = "Train"), size = 1.5) +
      geom_line(aes(y = Test_RMSE, color = "Test"), size = 1.5) +
      geom_point(aes(y = Train_RMSE), size = 3) +
      geom_point(aes(y = Test_RMSE), size = 3) +
      labs(title = "Polynomial Regression: RMSE vs Degree",
           x = "Polynomial Degree",
           y = "RMSE",
           color = "Dataset") +
      theme_minimal() +
      scale_x_continuous(breaks = 2:5)
    print(p)
  }
  
  # Evaluate and plot the best model
  best_poly_test_pred <- predict(best_poly_model, test_data)
  results$poly <- evaluate_regression(best_poly_test_pred, test_data$expenditure, 
                                     paste("Best Poly (Degree", best_poly_degree, ")"), 
                                     "Test Data")
} else {
  cat("\nTidak ada model polynomial yang berhasil di-train.\n")
}
## 
## Polynomial Regression Results:
##   Degree Train_RMSE Test_RMSE Overfitting        R2
## 1      2   145.9464  146.3213   0.3748947 0.7447015
## 2      3   145.9035  146.7144   0.8109065 0.7441506
## 3      4   145.8176  146.4865   0.6688267 0.7445540
## 4      5   145.5250  148.2246   2.6995783 0.7393477
## 
## Best Polynomial Model: Degree 2 with Test RMSE = 146.3213

## 
##  ================================================== 
## Best Poly (Degree 2 ) - Test Data 
## ================================================== 
## RMSE: 146.3213 
## MAE : 67.0184 
## MSE : 21409.92 
## R²  : 0.7447

2.0.2 Piecewise Constant via Decision Tree

tryCatch({
  # Use a depth-1 decision tree (a stump) for a piecewise-constant fit
  pc_model <- train(expenditure ~ .,
                    data = train_data,
                    method = "rpart",
                    trControl = ctrl_cv5,
                    tuneGrid = data.frame(cp = 0.01),
                    control = rpart.control(maxdepth = 1))
  
  # Plot tree
  rpart.plot(pc_model$finalModel,
             main = "Piecewise Constant (Regression Stump)",
             type = 2,
             extra = 101,
             box.palette = "RdYlGn")
  
  # Predict
  pc_train_pred <- predict(pc_model, train_data)
  pc_test_pred <- predict(pc_model, test_data)
  
  # Evaluate
  results$piecewise <- evaluate_regression(pc_test_pred, test_data$expenditure, 
                                          "Piecewise Constant", "Test Data")
  cv_results$piecewise <- pc_model
}, error = function(e) {
  cat("Error dalam Piecewise Constant Regression:", e$message, "\n")
})

## 
##  ================================================== 
## Piecewise Constant - Test Data 
## ================================================== 
## RMSE: 217.2666 
## MAE : 126.2558 
## MSE : 47204.78 
## R²  : 0.4126
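
A depth-1 tree makes a single split, so the fit is a step function with only two constant pieces, which explains the modest R² above; deeper trees yield more pieces. Sketch (not run):

# train(expenditure ~ ., data = train_data, method = "rpart",
#       trControl = ctrl_cv5, tuneGrid = data.frame(cp = 0.01),
#       control = rpart.control(maxdepth = 3))  # up to 2^3 = 8 segments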

2.0.3 Cubic Spline

tryCatch({
  # Choose knots for the cubic spline
  n_knots <- min(3, floor(nrow(train_data) / 20))
  if(n_knots < 1) n_knots <- 1
  
  knots <- quantile(train_data[[best_predictor]], 
                    probs = seq(0, 1, length.out = n_knots + 2)[-c(1, n_knots + 2)])
  
  # Build the spline basis; fix the boundary knots to the training range so
  # that the train and test bases use identical basis functions
  bknots <- range(train_data[[best_predictor]])
  cubic_spline_train <- bs(train_data[[best_predictor]], 
                           knots = knots, 
                           degree = 3,
                           intercept = TRUE,
                           Boundary.knots = bknots)
  cubic_spline_test <- bs(test_data[[best_predictor]], 
                          knots = knots,
                          degree = 3,
                          intercept = TRUE,
                          Boundary.knots = bknots)
  
  # Combine with the remaining predictors
  other_predictors <- setdiff(predictors, best_predictor)
  if(length(other_predictors) > 0) {
    train_spline <- cbind(as.data.frame(cubic_spline_train), 
                         train_data[, other_predictors, drop = FALSE])
    test_spline <- cbind(as.data.frame(cubic_spline_test), 
                        test_data[, other_predictors, drop = FALSE])
  } else {
    train_spline <- as.data.frame(cubic_spline_train)
    test_spline <- as.data.frame(cubic_spline_test)
  }
  
  train_spline$expenditure <- train_data$expenditure
  test_spline$expenditure <- test_data$expenditure
  
  # Fit the model
  cs_model <- train(expenditure ~ .,
                    data = train_spline,
                    method = "lm",
                    trControl = ctrl_cv5)
  
  # Predict
  cs_train_pred <- predict(cs_model, train_spline)
  cs_test_pred <- predict(cs_model, test_spline)
  
  # Evaluate
  results$cubic_spline <- evaluate_regression(cs_test_pred, test_data$expenditure, 
                                             "Cubic Spline", "Test Data")
  
  # Plot the spline fit. The prediction grid must carry the same column
  # names as train_spline; other predictors are held at median/modal values
  if(require(ggplot2)) {
    x_grid <- seq(min(train_data[[best_predictor]]), 
                  max(train_data[[best_predictor]]), length.out = 100)
    grid_spline <- as.data.frame(bs(x_grid, knots = knots, degree = 3, 
                                    intercept = TRUE, Boundary.knots = bknots))
    for(p in other_predictors) {
      grid_spline[[p]] <- if(is.numeric(train_data[[p]])) {
        median(train_data[[p]])
      } else {
        factor(names(which.max(table(train_data[[p]]))), 
               levels = levels(train_data[[p]]))
      }
    }
    plot_data <- data.frame(x = x_grid, y = predict(cs_model, grid_spline))
    
    p <- ggplot() +
      geom_point(data = train_data, aes(x = .data[[best_predictor]], y = expenditure), 
                 color = "blue", alpha = 0.5) +
      geom_line(data = plot_data, aes(x = x, y = y), 
                color = "red", size = 1.5) +
      geom_vline(xintercept = knots, linetype = "dashed", color = "gray") +
      labs(title = "Cubic Spline Regression",
           x = best_predictor,
           y = "Expenditure") +
      theme_minimal()
    print(p)
  }
}, error = function(e) {
  cat("Error dalam Cubic Spline Regression:", e$message, "\n")
})
## 
##  ================================================== 
## Cubic Spline - Test Data 
## ================================================== 
## RMSE: 113.0013 
## MAE : 61.8996 
## MSE : 12769.3 
## R²  : 0.8422
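
One caveat with bs(): test values that fall outside the boundary knots of the training basis are extrapolated with a warning. Natural cubic splines, which are constrained to be linear beyond the boundary knots, are a common safer alternative. Sketch (not run):

# ns(train_data[[best_predictor]], knots = knots)  # natural cubic spline basis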


2.0.4 KNN Regression

tryCatch({
  # Train KNN with 5-fold CV
  knn_reg_model <- train(expenditure ~ .,
                         data = train_data,
                         method = "knn",
                         trControl = ctrl_cv5,
                         tuneGrid = expand.grid(k = seq(3, min(21, nrow(train_data)-1), by = 2)),
                         preProcess = c("center", "scale"))
  
  # Show the tuning results
  cat("\nKNN Tuning Results:\n")
  print(knn_reg_model$results)
  cat("\nBest K:", knn_reg_model$bestTune$k, "\n")
  
  # Plot tuning results
  plot(knn_reg_model, main = "KNN Regression: RMSE vs K")
  
  # Predictions
  knn_reg_train_pred <- predict(knn_reg_model, train_data)
  knn_reg_test_pred <- predict(knn_reg_model, test_data)
  
  # Evaluation
  results$knn_reg <- evaluate_regression(knn_reg_test_pred, test_data$expenditure, 
                                        "KNN Regression", "Test Data")
  cv_results$knn_reg <- knn_reg_model
}, error = function(e) {
  cat("Error in KNN Regression:", e$message, "\n")
})
## 
## KNN Tuning Results:
##     k     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1   3 150.7975 0.7166484 74.83276 39.19433 0.04195406 6.243915
## 2   5 144.2537 0.7608913 72.08577 47.65963 0.06896944 8.682050
## 3   7 141.2630 0.7859317 71.31523 46.64292 0.05344641 9.383293
## 4   9 143.3205 0.7857083 71.85967 49.14845 0.06454640 9.974117
## 5  11 146.8377 0.7802253 73.50562 47.92362 0.05980393 9.622740
## 6  13 148.4843 0.7840720 74.80977 48.13834 0.06233632 9.782057
## 7  15 152.5194 0.7726519 76.83856 47.97431 0.05828648 9.341220
## 8  17 155.8848 0.7626555 78.44253 48.03739 0.05914034 9.952061
## 9  19 158.4333 0.7561882 79.63346 47.67147 0.05703458 9.879802
## 10 21 159.8502 0.7577948 80.70819 47.52791 0.05391118 9.890727
## 
## Best K: 7 
## 
##  ================================================== 
## KNN Regression - Test Data 
## ================================================== 
## RMSE: 168.7605 
## MAE : 77.3134 
## MSE : 28480.11 
## R²  : 0.7396
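
For reference, the tuning-grid row that caret selected can be read straight off the fitted object; a small illustrative snippet (not part of the original run), reusing knn_reg_model from above:

# Merges bestTune (k = 7) with the results table on the shared k column
merge(knn_reg_model$bestTune, knn_reg_model$results)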

2.0.5 Regression Tree

tryCatch({
  # Train the regression tree with CV
  tree_model <- train(expenditure ~ .,
                      data = train_data,
                      method = "rpart",
                      trControl = ctrl_cv5,
                      tuneLength = 5)
  
  # Show the tuning results
  cat("\nRegression Tree Tuning Results:\n")
  print(tree_model$results)
  
  # Plot tree
  rpart.plot(tree_model$finalModel,
             main = "Regression Tree (CV Optimized)",
             type = 4,
             extra = 101,
             box.palette = "RdYlGn",
             fallen.leaves = TRUE)
  
  # Predictions
  tree_train_pred <- predict(tree_model, train_data)
  tree_test_pred <- predict(tree_model, test_data)
  
  # Evaluation
  results$tree <- evaluate_regression(tree_test_pred, test_data$expenditure, 
                                     "Regression Tree", "Test Data")
  cv_results$tree <- tree_model
}, error = function(e) {
  cat("Error in Regression Tree:", e$message, "\n")
})
## 
## Regression Tree Tuning Results:
##           cp     RMSE  Rsquared       MAE   RMSESD RsquaredSD    MAESD
## 1 0.04636867 150.3884 0.6750795  78.21845 41.46525 0.15450680 10.58495
## 2 0.06183555 176.3621 0.5677563 100.19437 44.24339 0.11221869 11.12011
## 3 0.09300498 177.6830 0.5621279 104.65656 46.95235 0.11943023 15.15254
## 4 0.17183649 185.8391 0.5143367 116.44504 56.01800 0.20629713 17.18961
## 5 0.44851012 249.6979 0.2926773 158.68752 36.34515 0.05997516 22.99496

## 
##  ================================================== 
## Regression Tree - Test Data 
## ================================================== 
## RMSE: 177.0062 
## MAE : 90.3364 
## MSE : 31331.18 
## R²  : 0.6141

2.0.6 Random Forest

tryCatch({
  # Train Random Forest with CV
  rf_reg_model <- train(expenditure ~ .,
                        data = train_data,
                        method = "rf",
                        trControl = ctrl_cv5,
                        tuneLength = 2,
                        ntree = 100,
                        importance = TRUE)
  
  # Show the tuning results
  cat("\nRandom Forest Tuning Results:\n")
  print(rf_reg_model$results)
  
  # Plot variable importance
  varImpPlot <- varImp(rf_reg_model, scale = FALSE)
  plot(varImpPlot, 
       main = "Random Forest - Variable Importance",
       top = min(10, ncol(train_data) - 1))
  
  # Plot OOB error
  if("finalModel" %in% names(rf_reg_model)) {
    oob_error <- rf_reg_model$finalModel$mse
    plot(1:length(oob_error), oob_error,
         type = "l", col = "red", lwd = 2,
         main = "Random Forest OOB Error",
         xlab = "Number of Trees",
         ylab = "OOB MSE")
    grid()
  }
  
  # Predictions
  rf_reg_train_pred <- predict(rf_reg_model, train_data)
  rf_reg_test_pred <- predict(rf_reg_model, test_data)
  
  # Evaluation
  results$rf_reg <- evaluate_regression(rf_reg_test_pred, test_data$expenditure, 
                                       "Random Forest", "Test Data")
  cv_results$rf_reg <- rf_reg_model
}, error = function(e) {
  cat("Error in Random Forest Regression:", e$message, "\n")
})
## 
## Random Forest Tuning Results:
##   mtry      RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1    2 138.55246 0.8059870 60.76234 38.21001 0.07113424 5.056425
## 2   11  65.55562 0.9438179 15.36185 39.38930 0.04863151 4.665513

## 
##  ================================================== 
## Random Forest - Test Data 
## ================================================== 
## RMSE: 79.0744 
## MAE : 16.9195 
## MSE : 6252.764 
## R²  : 0.9445
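
Only two mtry values were compared above because of tuneLength = 2; a denser grid can be supplied explicitly. A sketch under the same setup (ctrl_cv5 and train_data as defined earlier), not part of the original run:

set.seed(2024)
rf_tuned <- train(expenditure ~ .,
                  data = train_data,
                  method = "rf",
                  trControl = ctrl_cv5,
                  tuneGrid = expand.grid(mtry = c(2, 4, 6, 8, 11)),
                  ntree = 100,
                  importance = TRUE)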

2.0.7 Neural Network

tryCatch({
  # Normalize the data
  normalize <- function(x) {
    if(is.numeric(x)) {
      if(sd(x, na.rm = TRUE) == 0) return(x)
      return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
    }
    return(x)
  }
  
  # Separate factor and numeric variables
  factor_vars <- names(train_data)[sapply(train_data, is.factor)]
  numeric_vars <- names(train_data)[sapply(train_data, is.numeric) & names(train_data) != "expenditure"]
  
  # Normalize only the numeric variables
  train_norm <- train_data
  test_norm <- test_data
  
  for(var in numeric_vars) {
    train_norm[[var]] <- normalize(train_data[[var]])
    test_norm[[var]] <- normalize(test_data[[var]])
  }
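  # Note: each split above is rescaled with its own min/max; strictly, the
  # test set should reuse the train min/max so both splits share one scale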
  
  # Train the neural network with CV
  nn_reg_model <- train(expenditure ~ .,
                        data = train_norm,
                        method = "nnet",
                        trControl = ctrl_cv5,
                        tuneGrid = expand.grid(
                          size = c(3, 5, 7),
                          decay = c(0.001, 0.01)
                        ),
                        linout = TRUE,  # linear output for regression
                        maxit = 100,
                        trace = FALSE)
  
  # Show the tuning results
  cat("\nNeural Network Tuning Results:\n")
  print(nn_reg_model$results)
  
  # Plot neural network architecture
  if(require(NeuralNetTools) && max(nn_reg_model$results$size) <= 10) {
    plotnet(nn_reg_model$finalModel,
            alpha = 0.6,
            circle_col = "lightblue",
            bord_col = "black",
            main = "Neural Network Architecture")
  }
  
  # Predictions
  nn_reg_train_pred <- predict(nn_reg_model, train_norm)
  nn_reg_test_pred <- predict(nn_reg_model, test_norm)
  
  # Evaluation
  results$nn_reg <- evaluate_regression(nn_reg_test_pred, test_data$expenditure, 
                                       "Neural Network", "Test Data")
  cv_results$nn_reg <- nn_reg_model
}, error = function(e) {
  cat("Error in Neural Network Regression:", e$message, "\n")
})
## 
## Neural Network Tuning Results:
##   size decay     RMSE  Rsquared       MAE   RMSESD RsquaredSD    MAESD
## 1    3 0.001 210.4160 0.3469014 126.61956 28.18749 0.32848915 36.38809
## 2    3 0.010 137.4261 0.7611434  75.54134 35.11794 0.07487461 21.85270
## 3    5 0.001 172.3432 0.6067911 103.68488 65.23254 0.39971504 59.36173
## 4    5 0.010 206.1157 0.6129911 119.55365 84.88204 0.27371472 49.73585
## 5    7 0.001 221.3524 0.4677079 138.00686 29.32972 0.27789740 29.56603
## 6    7 0.010 162.6287 0.6432729  91.01524 81.47856 0.18997924 34.05221

## 
##  ================================================== 
## Neural Network - Test Data 
## ================================================== 
## RMSE: 248.7369 
## MAE : 129.2936 
## MSE : 61870.06 
## R²  : 0.2337

2.0.8 Gradient Boosting

tryCatch({
  # Train Gradient Boosting with CV
  gbm_reg_model <- train(expenditure ~ .,
                         data = train_data,
                         method = "gbm",
                         trControl = ctrl_cv5,
                         verbose = FALSE,
                         tuneGrid = expand.grid(
                           n.trees = c(50, 100),
                           interaction.depth = c(2, 3),
                           shrinkage = c(0.01, 0.1),
                           n.minobsinnode = 5
                         ))
  
  # Show the tuning results
  cat("\nGradient Boosting Tuning Results:\n")
  print(gbm_reg_model$results)
  
  # Plot variable importance
  varImpPlot_gbm <- varImp(gbm_reg_model, scale = FALSE)
  plot(varImpPlot_gbm, 
       main = "Gradient Boosting - Variable Importance",
       top = min(10, ncol(train_data) - 1))
  
  # Predictions
  gbm_reg_train_pred <- predict(gbm_reg_model, train_data)
  gbm_reg_test_pred <- predict(gbm_reg_model, test_data)
  
  # Evaluation
  results$gbm_reg <- evaluate_regression(gbm_reg_test_pred, test_data$expenditure, 
                                        "Gradient Boosting", "Test Data")
  cv_results$gbm_reg <- gbm_reg_model
}, error = function(e) {
  cat("Error in Gradient Boosting Regression:", e$message, "\n")
})
## 
## Gradient Boosting Tuning Results:
##   shrinkage interaction.depth n.minobsinnode n.trees      RMSE  Rsquared
## 1      0.01                 2              5      50 205.67317 0.7734641
## 5      0.10                 2              5      50  75.67444 0.9311453
## 3      0.01                 3              5      50 196.54572 0.8336064
## 7      0.10                 3              5      50  71.88018 0.9372471
## 2      0.01                 2              5     100 167.47345 0.8236467
## 6      0.10                 2              5     100  64.93840 0.9461651
## 4      0.01                 3              5     100 152.93270 0.8626378
## 8      0.10                 3              5     100  65.68809 0.9456612
##         MAE   RMSESD RsquaredSD     MAESD
## 1 127.58591 43.37049 0.06469733 12.130833
## 5  33.87627 31.84996 0.03556314  5.612493
## 3 121.68726 43.91787 0.05757927 11.694968
## 7  28.29045 27.72399 0.02496331  3.808863
## 2 101.27320 40.75475 0.04535695 10.640828
## 6  26.75595 27.26843 0.02863940  3.861846
## 4  91.34489 41.25138 0.04314469  9.894588
## 8  24.08445 25.18381 0.02286334  3.773035
## 
##  ================================================== 
## Gradient Boosting - Test Data 
## ================================================== 
## RMSE: 62.5461 
## MAE : 27.2084 
## MSE : 3912.009 
## R²  : 0.9611

2.0.9 SVM

tryCatch({
  # Convert factor variables to numeric for SVM
  train_svm <- train_data
  test_svm <- test_data
  
  for(var in names(train_svm)) {
    if(is.factor(train_svm[[var]])) {
      train_svm[[var]] <- as.numeric(train_svm[[var]])
      test_svm[[var]] <- as.numeric(test_svm[[var]])
    }
  }
  
  # Train SVM regression with CV
  svm_reg_model <- train(expenditure ~ .,
                         data = train_svm,
                         method = "svmRadial",
                         trControl = ctrl_cv5,
                         tuneLength = 3,
                         preProcess = c("center", "scale"))
  
  # Show the tuning results
  cat("\nSVM Regression Tuning Results:\n")
  print(svm_reg_model$results)
  
  # Predictions
  svm_reg_train_pred <- predict(svm_reg_model, train_svm)
  svm_reg_test_pred <- predict(svm_reg_model, test_svm)
  
  # Evaluation
  results$svm_reg <- evaluate_regression(svm_reg_test_pred, test_data$expenditure, 
                                        "SVM Regression", "Test Data")
  cv_results$svm_reg <- svm_reg_model
}, error = function(e) {
  cat("Error in SVM Regression:", e$message, "\n")
})
## 
## SVM Regression Tuning Results:
##        sigma    C     RMSE  Rsquared      MAE   RMSESD RsquaredSD    MAESD
## 1 0.08338795 0.25 147.7661 0.7450056 50.51453 56.68765  0.1437049 8.655255
## 2 0.08338795 0.50 127.2987 0.8052488 41.65658 57.39252  0.1377684 8.161104
## 3 0.08338795 1.00 108.9882 0.8521085 36.02953 54.88329  0.1225144 6.315507
## 
##  ================================================== 
## SVM Regression - Test Data 
## ================================================== 
## RMSE: 135.3141 
## MAE : 40.5069 
## MSE : 18309.9 
## R²  : 0.8101
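
With tuneLength, caret's svmRadial fixes a single sigma estimated by kernlab::sigest() and tunes only C, which is why one sigma value appears in the table above. Both parameters can be tuned by passing a grid; a sketch, not part of the original run (the grid values are illustrative):

svm_grid <- expand.grid(sigma = c(0.02, 0.08, 0.2), C = c(0.5, 1, 2, 4))
# then: train(expenditure ~ ., data = train_svm, method = "svmRadial",
#             trControl = ctrl_cv5, tuneGrid = svm_grid,
#             preProcess = c("center", "scale"))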

2.0.10 Model Comparison

model_names <- c("Polynomial", "Piecewise Constant", "Cubic Spline", 
                 "KNN", "Regression Tree", "Random Forest", "Neural Network",
                 "Gradient Boosting", "SVM")

# Initialize the comparison dataframe
comparison_df <- data.frame(
  Model = model_names,
  Test_RMSE = rep(NA, length(model_names)),
  Test_R2 = rep(NA, length(model_names)),
  MAE = rep(NA, length(model_names))
)

# Fill in each model's results
if(!is.null(results$poly)) {
  comparison_df[comparison_df$Model == "Polynomial", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$poly$rmse, results$poly$r2, results$poly$mae)
}
if(!is.null(results$piecewise)) {
  comparison_df[comparison_df$Model == "Piecewise Constant", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$piecewise$rmse, results$piecewise$r2, results$piecewise$mae)
}
if(!is.null(results$cubic_spline)) {
  comparison_df[comparison_df$Model == "Cubic Spline", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$cubic_spline$rmse, results$cubic_spline$r2, results$cubic_spline$mae)
}
if(!is.null(results$knn_reg)) {
  comparison_df[comparison_df$Model == "KNN", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$knn_reg$rmse, results$knn_reg$r2, results$knn_reg$mae)
}
if(!is.null(results$tree)) {
  comparison_df[comparison_df$Model == "Regression Tree", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$tree$rmse, results$tree$r2, results$tree$mae)
}
if(!is.null(results$rf_reg)) {
  comparison_df[comparison_df$Model == "Random Forest", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$rf_reg$rmse, results$rf_reg$r2, results$rf_reg$mae)
}
if(!is.null(results$nn_reg)) {
  comparison_df[comparison_df$Model == "Neural Network", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$nn_reg$rmse, results$nn_reg$r2, results$nn_reg$mae)
}
if(!is.null(results$gbm_reg)) {
  comparison_df[comparison_df$Model == "Gradient Boosting", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$gbm_reg$rmse, results$gbm_reg$r2, results$gbm_reg$mae)
}
if(!is.null(results$svm_reg)) {
  comparison_df[comparison_df$Model == "SVM", c("Test_RMSE", "Test_R2", "MAE")] <- 
    c(results$svm_reg$rmse, results$svm_reg$r2, results$svm_reg$mae)
}

# Drop rows with missing results
comparison_df <- comparison_df[complete.cases(comparison_df), ]

# Sort by best (smallest) test RMSE
if(nrow(comparison_df) > 0) {
  comparison_df <- comparison_df[order(comparison_df$Test_RMSE), ]
  comparison_df$Rank <- 1:nrow(comparison_df)
  
  cat("\nModel Performance Comparison (sorted by Test RMSE):\n")
  print(comparison_df)
  
  barplot(comparison_df$Test_RMSE,
          names.arg = comparison_df$Model,
          las = 2,
          col = ifelse(comparison_df$Rank == 1, "gold", "steelblue"),
          ylim = c(0, max(comparison_df$Test_RMSE) * 1.1),
          main = "Test RMSE Comparison",
          ylab = "RMSE",
          cex.names = 0.8)
  abline(h = min(comparison_df$Test_RMSE), col = "red", lty = 2, lwd = 2)
  
  barplot(comparison_df$Test_R2,
          names.arg = comparison_df$Model,
          las = 2,
          col = ifelse(comparison_df$Rank == 1, "gold", "darkgreen"),
          ylim = c(0, 1),
          main = "Test R² Comparison",
          ylab = "R²",
          cex.names = 0.8)
  abline(h = max(comparison_df$Test_R2), col = "red", lty = 2, lwd = 2)
  
  barplot(comparison_df$MAE,
          names.arg = comparison_df$Model,
          las = 2,
          col = ifelse(comparison_df$Rank == 1, "gold", "darkorange"),
          ylim = c(0, max(comparison_df$MAE) * 1.1),
          main = "Test MAE Comparison",
          ylab = "MAE",
          cex.names = 0.8)
  abline(h = min(comparison_df$MAE), col = "red", lty = 2, lwd = 2)
  
  plot(comparison_df$Rank, comparison_df$Test_RMSE,
       type = "b", pch = 19, col = "blue", lwd = 2,
       xlab = "Rank (1 = Best)", ylab = "Test RMSE",
       main = "Model Ranking by RMSE",
       xlim = c(0.5, nrow(comparison_df) + 0.5))
  text(comparison_df$Rank, comparison_df$Test_RMSE,
       labels = comparison_df$Model,
       pos = 3, cex = 0.7, col = "darkred")
  grid()
  
  cat("\n", strrep("=", 60))
  cat("\nOVERFITTING ANALYSIS")
  cat("\n", strrep("=", 60), "\n")
  
  # Build a dataframe for the overfitting analysis
  overfitting_df <- data.frame(
    Model = comparison_df$Model,
    Train_RMSE = NA,
    Test_RMSE = comparison_df$Test_RMSE,
    Overfitting_Index = NA
  )
  
  # Fill in train RMSE per model; each prediction vector is guarded with
  # exists() below, since any model wrapped in tryCatch may have failed
  
  for(i in 1:nrow(overfitting_df)) {
    model_name <- overfitting_df$Model[i]
    
    if(model_name == "Polynomial" && exists("poly_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, poly_train_pred)
    } else if(model_name == "Piecewise Constant" && exists("pc_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, pc_train_pred)
    } else if(model_name == "Cubic Spline" && exists("cs_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, cs_train_pred)
    } else if(model_name == "Natural Spline" && exists("ns_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, ns_train_pred)
    } else if(model_name == "KNN" && exists("knn_reg_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, knn_reg_train_pred)
    } else if(model_name == "Regression Tree" && exists("tree_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, tree_train_pred)
    } else if(model_name == "Random Forest" && exists("rf_reg_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, rf_reg_train_pred)
    } else if(model_name == "Neural Network" && exists("nn_reg_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, nn_reg_train_pred)
    } else if(model_name == "Gradient Boosting" && exists("gbm_reg_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, gbm_reg_train_pred)
    } else if(model_name == "SVM" && exists("svm_reg_train_pred")) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, svm_reg_train_pred)
    }
  }
  
  # Compute the overfitting index: (train RMSE - test RMSE) / test RMSE
  overfitting_df$Overfitting_Index <- (overfitting_df$Train_RMSE - overfitting_df$Test_RMSE) / overfitting_df$Test_RMSE
  
  cat("\nOverfitting Analysis (a strongly negative index = train RMSE far below test RMSE, i.e., overfitting):\n")
  print(overfitting_df)
  
  # Plot overfitting
  if(require(ggplot2)) {
    p <- ggplot(overfitting_df, aes(x = Model)) +
      geom_bar(aes(y = Train_RMSE, fill = "Train"), stat = "identity", alpha = 0.7) +
      geom_bar(aes(y = Test_RMSE, fill = "Test"), stat = "identity", alpha = 0.7) +
      labs(title = "Train vs Test RMSE (Overfitting Analysis)",
           x = "Model",
           y = "RMSE",
           fill = "Dataset") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      scale_fill_manual(values = c("Train" = "blue", "Test" = "red"))
    print(p)
  }
  
} else {
  cat("\nNo model was trained successfully for comparison.\n")
}
## 
## Model Performance Comparison (sorted by Test RMSE):
##                Model Test_RMSE   Test_R2       MAE Rank
## 8  Gradient Boosting  62.54606 0.9611186  27.20840    1
## 6      Random Forest  79.07442 0.9445186  16.91945    2
## 3       Cubic Spline 113.00134 0.8421841  61.89962    3
## 9                SVM 135.31408 0.8100652  40.50692    4
## 1         Polynomial 146.32130 0.7447015  67.01839    5
## 4                KNN 168.76051 0.7395614  77.31341    6
## 5    Regression Tree 177.00617 0.6141311  90.33640    7
## 2 Piecewise Constant 217.26661 0.4126403 126.25576    8
## 7     Neural Network 248.73694 0.2337293 129.29362    9

## 
##  ============================================================
## OVERFITTING ANALYSIS
##  ============================================================ 
## 
## Overfitting Analysis (a strongly negative index = train RMSE far below test RMSE, i.e., overfitting):
##                Model Train_RMSE Test_RMSE Overfitting_Index
## 1  Gradient Boosting   46.01191  62.54606      -0.264351606
## 2      Random Forest   30.93598  79.07442      -0.608773807
## 3       Cubic Spline  111.34691 113.00134      -0.014640856
## 4                SVM   92.57636 135.31408      -0.315840887
## 5         Polynomial  145.52501 146.32130      -0.005442083
## 6                KNN  125.20119 168.76051      -0.258113248
## 7    Regression Tree  127.65096 177.00617      -0.278833287
## 8 Piecewise Constant  199.93198 217.26661      -0.079785059
## 9     Neural Network  238.28377 248.73694      -0.042024977
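
As an aside, the repetitive if(!is.null(...)) blocks used to fill comparison_df above could be collapsed by iterating over a named list. A minimal sketch, assuming purrr (loaded with the tidyverse) and the rmse/r2/mae structure that evaluate_regression() returns; model_labels is an illustrative name:

library(purrr)

model_labels <- c(poly = "Polynomial", piecewise = "Piecewise Constant",
                  cubic_spline = "Cubic Spline", knn_reg = "KNN",
                  tree = "Regression Tree", rf_reg = "Random Forest",
                  nn_reg = "Neural Network", gbm_reg = "Gradient Boosting",
                  svm_reg = "SVM")

# compact() drops models that failed (NULL entries) before binding rows
comparison_df <- imap_dfr(compact(results[names(model_labels)]),
                          function(res, key) data.frame(Model = model_labels[[key]],
                                                        Test_RMSE = res$rmse,
                                                        Test_R2 = res$r2,
                                                        MAE = res$mae))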

---
title: "Sains Data"
author: "Ngurah Sentana"
date: "`r format(Sys.Date(), '%d %B %Y')`"
output: 
  html_document:
    theme: 
      version: 4
      bootswatch: flatly
    highlight: pygments
    toc: true
    toc_depth: 3
    toc_float:
      collapsed: true
      smooth_scroll: true
    code_folding: show
    code_download: true
    number_sections: true
    fig_width: 10
    fig_height: 6
    fig_align: 'center'
    df_print: paged
---

```{=html}
<style>
/* Custom CSS for additional styling */
body {
  font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
  line-height: 1.6;
}

h1, h2, h3 {
  color: #2c3e50;
  border-bottom: 2px solid #3498db;
  padding-bottom: 10px;
}

h1 {
  color: #2980b9;
}

.main-container {
  max-width: 1200px;
  margin: 0 auto;
  padding: 20px;
}

.navbar {
  background-color: #2c3e50 !important;
}

.navbar-brand {
  color: #ecf0f1 !important;
  font-weight: bold;
}

.btn-primary {
  background-color: #3498db;
  border-color: #2980b9;
}

.btn-primary:hover {
  background-color: #2980b9;
  border-color: #1c6ea4;
}

.card {
  border: 1px solid #ddd;
  border-radius: 8px;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1);
  margin-bottom: 20px;
  overflow: hidden;
}

.card-header {
  background-color: #f8f9fa;
  padding: 15px;
  border-bottom: 1px solid #ddd;
  font-weight: bold;
  color: #2c3e50;
}

.card-body {
  padding: 20px;
}

.alert-info {
  background-color: #d9edf7;
  border-color: #bce8f1;
  color: #31708f;
}

.alert-success {
  background-color: #dff0d8;
  border-color: #d6e9c6;
  color: #3c763d;
}

.alert-warning {
  background-color: #fcf8e3;
  border-color: #faebcc;
  color: #8a6d3b;
}

.plot-container {
  background-color: white;
  padding: 15px;
  border-radius: 5px;
  margin: 15px 0;
  box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}

.download-btn {
  background-color: #27ae60;
  color: white;
  padding: 8px 15px;
  border-radius: 4px;
  text-decoration: none;
  display: inline-block;
  margin: 5px;
  border: none;
  cursor: pointer;
}

.download-btn:hover {
  background-color: #219653;
  color: white;
  text-decoration: none;
}

.table-styled {
  width: 100%;
  border-collapse: collapse;
  margin: 20px 0;
}

.table-styled th {
  background-color: #3498db;
  color: white;
  padding: 12px;
  text-align: left;
}

.table-styled td {
  padding: 10px;
  border-bottom: 1px solid #ddd;
}

.table-styled tr:nth-child(even) {
  background-color: #f8f9fa;
}

.table-styled tr:hover {
  background-color: #e8f4fc;
}

.code-header {
  background-color: #2c3e50;
  color: white;
  padding: 10px;
  border-radius: 5px 5px 0 0;
  margin-top: 20px;
  font-family: monospace;
}

.footer {
  margin-top: 40px;
  padding: 20px;
  background-color: #f8f9fa;
  border-top: 1px solid #ddd;
  text-align: center;
  color: #7f8c8d;
}
</style>
```

::: main-container

------------------------------------------------------------------------

```{r setup, include=FALSE}
# Setup global options
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.align = 'center',
  fig.width = 10,
  fig.height = 6,
  out.width = '90%',
  cache = TRUE
)
```

# Classification Modeling

### Case Study

This case study uses the dataset provided in the 2022 final exam (UAS).

[LINK](https://docs.google.com/spreadsheets/d/1-R_0QT_xzrEsIkhdgXZP0yZRk_gzb7mr/edit?rtpof=true)

```{r}
library(readxl)
data <- read_excel("C:/Users/Legion/Downloads/data_STA1581.xlsx")
```

```{r}
print(data)
```

```{r}
set.seed(2024)
bootstrap_indices <- sample(1:nrow(data), size = 300, replace = TRUE)
data <- data[bootstrap_indices, ]

cat("\nData after resampling (bootstrap n=300):\n")
print(head(data))
cat("\nDimensions of the resampled data:", dim(data), "\n")
cat("Class proportions of Y after resampling:\n")
print(prop.table(table(data$Y)))
```
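
Sampling `size = 300` with replacement from the 300 original rows is a standard nonparametric bootstrap, so roughly 1 − 1/e ≈ 63.2% of the distinct rows are expected to appear at least once. This can be checked directly (illustrative snippet, not part of the original analysis):

```{r, eval=FALSE}
# Fraction of distinct original rows that made it into the resample
length(unique(bootstrap_indices)) / 300  # typically around 0.632
```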

### Modeling with Bootstrapping

### Function, Library, and Dataframe Definitions

```{r}
library(tidyverse)
library(caret)
library(pROC)
library(e1071)      
library(randomForest)
library(gbm)        
library(rpart)      
library(rpart.plot)
library(nnet)       
library(kknn)       

data$Y <- factor(data$Y, levels = c("0", "1"))
set.seed(2024)
train_index <- createDataPartition(data$Y, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

train_data$Y <- factor(train_data$Y, levels = levels(data$Y))
test_data$Y <- factor(test_data$Y, levels = levels(data$Y))

ctrl_cv5 <- trainControl(method = "cv", 
                         number = 5,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary,
                         savePredictions = TRUE)

train_data_cv <- train_data
test_data_cv <- test_data
levels(train_data_cv$Y) <- c("No", "Yes")
levels(test_data_cv$Y) <- c("No", "Yes")

evaluate_model <- function(predictions, actual, model_name, data_type, plot_cm = TRUE) {
  # Make sure predictions and actual share the same factor levels
  if(!is.factor(predictions)) {
    predictions <- factor(predictions, levels = levels(actual))
  }
  
  cm <- confusionMatrix(predictions, actual)
  
  # Compute additional metrics
  precision <- cm$byClass["Pos Pred Value"]
  recall <- cm$byClass["Sensitivity"]
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # Print the evaluation
  cat("\n", strrep("=", 50), "\n")
  cat(model_name, "-", data_type, "\n")
  cat(strrep("=", 50), "\n")
  
  cat("Confusion Matrix:\n")
  print(cm$table)
  
  cat("\nEvaluation Metrics:\n")
  cat("Accuracy    :", round(cm$overall["Accuracy"], 4), "\n")
  cat("Sensitivity :", round(cm$byClass["Sensitivity"], 4), "\n")
  cat("Specificity :", round(cm$byClass["Specificity"], 4), "\n")
  cat("Precision   :", round(precision, 4), "\n")
  cat("F1-Score    :", round(f1_score, 4), "\n")
  
  # Plot the confusion matrix if requested
  if(plot_cm) {
    plot_confusion_matrix(cm, model_name, data_type)
  }
  
  return(list(
    confusion = cm$table,
    accuracy = cm$overall["Accuracy"],
    sensitivity = cm$byClass["Sensitivity"],
    specificity = cm$byClass["Specificity"],
    precision = precision,
    f1_score = f1_score
  ))
}

plot_confusion_matrix <- function(cm, model_name, data_type) {
  cm_matrix <- as.matrix(cm$table)
  
  # Percentages relative to the grand total
  total <- sum(cm_matrix)
  cm_percent <- round((cm_matrix / total) * 100, 1)
  
  # Set up a clearer plot
  par(mfrow = c(1, 1), mar = c(5, 5, 5, 5))
  
  # Contrasting color palette
  colors <- colorRampPalette(c("#F6F6F6", "#FF6B6B", "#4ECDC4"))(100)
  
  # Plot heatmap
  image(1:ncol(cm_matrix), 1:nrow(cm_matrix), 
        t(cm_matrix), 
        col = colors,
        xlab = "Predicted", ylab = "Actual",
        xaxt = "n", yaxt = "n",
        main = paste("Confusion Matrix\n", model_name, "-", data_type),
        cex.main = 1.2)
  
  # Add axis labels
  axis(1, at = 1:ncol(cm_matrix), labels = c("0", "1"))
  axis(2, at = 1:nrow(cm_matrix), labels = c("0", "1"), las = 2)
  
  # Add grid lines
  abline(h = seq(0.5, nrow(cm_matrix)+0.5, 1), col = "gray", lty = 3)
  abline(v = seq(0.5, ncol(cm_matrix)+0.5, 1), col = "gray", lty = 3)
  
  # Add the count and percentage in each cell
  for(i in 1:nrow(cm_matrix)) {
    for(j in 1:ncol(cm_matrix)) {
      text(j, i, 
           paste(cm_matrix[i, j], "\n(", cm_percent[i, j], "%)", sep = ""),
           cex = 1, 
           col = ifelse(cm_matrix[i, j] > max(cm_matrix)/2, "white", "black"),
           font = 2)
    }
  }
  
  # Add the accuracy in the corner
  mtext(paste("Accuracy:", round(cm$overall["Accuracy"], 4)), 
        side = 3, line = 0.5, adj = 0.05, cex = 0.9, col = "darkblue")
}

plot_roc_curve <- function(probs, actual, model_name, color) {
  roc_obj <- roc(actual, probs)
  auc_val <- auc(roc_obj)
  
  # Plot the ROC curve with better settings
  plot(roc_obj, 
       main = paste("ROC Curve -", model_name),
       col = color, 
       lwd = 3,
       print.auc = TRUE, 
       auc.polygon = TRUE,
       auc.polygon.col = adjustcolor(color, 0.2),
       print.auc.col = "black",  # AUC text in black
       print.auc.cex = 1.1,      # AUC text size
       legacy.axes = TRUE,       # false positive rate on the x-axis
       xlab = "False Positive Rate",
       ylab = "True Positive Rate",
       grid = TRUE,              # add a background grid
       grid.col = "gray90",
       xlim = c(0, 1),
       ylim = c(0, 1))
  
  # Add the diagonal reference line
  abline(a = 0, b = 1, col = "gray", lty = 2, lwd = 2)
  
  # Add the AUC label in a white box
  legend("bottomright", 
         legend = paste("AUC =", round(auc_val, 4)),
         text.col = "darkblue",
         cex = 1.1,
         bg = "white",
         box.col = "black")
  
  return(auc_val)
}

results <- list()
cv_results <- list()

```

### Logistic Regression

```{r}
# Train logistic regression with CV (seed set beforehand for reproducible folds)
set.seed(2024)
logit_cv_model <- train(Y ~ ., 
                        data = train_data_cv,
                        method = "glm",
                        family = binomial,
                        trControl = ctrl_cv5,
                        metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(logit_cv_model$results)

# Predictions for train and test data (from the CV model)
logit_train_pred <- predict(logit_cv_model, train_data_cv)
logit_test_pred <- predict(logit_cv_model, test_data_cv)

# Convert back to 0/1 for evaluation
logit_train_pred_01 <- factor(ifelse(logit_train_pred == "Yes", "1", "0"), 
                              levels = c("0", "1"))
logit_test_pred_01 <- factor(ifelse(logit_test_pred == "Yes", "1", "0"), 
                             levels = c("0", "1"))

# Evaluate with plots
results$logit$train <- evaluate_model(logit_train_pred_01, train_data$Y, 
                                      "Logistic Regression", "Train Data (CV)")
results$logit$test <- evaluate_model(logit_test_pred_01, test_data$Y, 
                                     "Logistic Regression", "Test Data")

# Store probabilities for ROC
logit_test_prob <- predict(logit_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$logit <- logit_cv_model
```

### SVM

```{r}
# Train SVM with CV (seed set beforehand for reproducible folds)
set.seed(2024)
svm_cv_model <- train(Y ~ ., 
                      data = train_data_cv,
                      method = "svmRadial",
                      trControl = ctrl_cv5,
                      tuneLength = 5,
                      metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(svm_cv_model$results)

# Predictions
svm_train_pred <- predict(svm_cv_model, train_data_cv)
svm_test_pred <- predict(svm_cv_model, test_data_cv)

# Convert back to 0/1
svm_train_pred_01 <- factor(ifelse(svm_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
svm_test_pred_01 <- factor(ifelse(svm_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluation
results$svm$train <- evaluate_model(svm_train_pred_01, train_data$Y, "SVM", "Train Data (CV)")
results$svm$test <- evaluate_model(svm_test_pred_01, test_data$Y, "SVM", "Test Data")

# Probabilities for ROC
svm_test_prob <- predict(svm_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$svm <- svm_cv_model
```

### Naive Bayes

```{r}
# Train Naive Bayes with CV (seed set beforehand for reproducible folds)
set.seed(2024)
nb_cv_model <- train(Y ~ ., 
                     data = train_data_cv,
                     method = "naive_bayes",
                     trControl = ctrl_cv5,
                     tuneLength = 5,
                     metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(nb_cv_model$results)

# Predictions
nb_train_pred <- predict(nb_cv_model, train_data_cv)
nb_test_pred <- predict(nb_cv_model, test_data_cv)

# Convert back to 0/1
nb_train_pred_01 <- factor(ifelse(nb_train_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))
nb_test_pred_01 <- factor(ifelse(nb_test_pred == "Yes", "1", "0"), 
                          levels = c("0", "1"))

# Evaluation
results$nb$train <- evaluate_model(nb_train_pred_01, train_data$Y, "Naive Bayes", "Train Data (CV)")
results$nb$test <- evaluate_model(nb_test_pred_01, test_data$Y, "Naive Bayes", "Test Data")

# Probabilities for ROC
nb_test_prob <- predict(nb_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$nb <- nb_cv_model
```

### KNN

```{r}
# Train KNN with CV (seed set beforehand for reproducible folds)
set.seed(2024)
knn_cv_model <- train(Y ~ ., 
                      data = train_data_cv,
                      method = "knn",
                      trControl = ctrl_cv5,
                      tuneGrid = expand.grid(k = c(3, 5, 7, 9, 11)),
                      metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(knn_cv_model$results)
cat("\nBest K value:", knn_cv_model$bestTune$k, "\n")

# Predictions
knn_train_pred <- predict(knn_cv_model, train_data_cv)
knn_test_pred <- predict(knn_cv_model, test_data_cv)

# Convert back to 0/1
knn_train_pred_01 <- factor(ifelse(knn_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
knn_test_pred_01 <- factor(ifelse(knn_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluation
results$knn$train <- evaluate_model(knn_train_pred_01, train_data$Y, "KNN", "Train Data (CV)")
results$knn$test <- evaluate_model(knn_test_pred_01, test_data$Y, "KNN", "Test Data")

# Probabilities for ROC
knn_test_prob <- predict(knn_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$knn <- knn_cv_model
```

### Decision Tree

```{r}
# Train the decision tree with CV (seed set beforehand for reproducible folds)
set.seed(2024)
dt_cv_model <- train(Y ~ ., 
                     data = train_data_cv,
                     method = "rpart",
                     trControl = ctrl_cv5,
                     tuneLength = 10,
                     metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(dt_cv_model$results)

# Plot decision tree
par(mfrow = c(1, 1))
rpart.plot(dt_cv_model$finalModel, 
           main = "Decision Tree (CV Optimized)",
           type = 4, 
           extra = 104, 
           box.palette = "GnBu")

rpart.plot(dt_cv_model$finalModel, 
           main = "Decision Tree Structure",
           type = 0,  
           extra = 101,
           box.palette = "RdYlGn", # Red-Yellow-Green palette
           shadow.col = "gray",
           branch.lty = 3,
           fallen.leaves = TRUE,
           cex = 0.8)

# Plot 2: decision tree with class probabilities
rpart.plot(dt_cv_model$finalModel, 
           main = "Decision Tree with Class Probabilities",
           type = 2,  # Display split variable names
           extra = 104, # Display probability per class and percentage
           box.palette = "GnBu",
           shadow.col = "gray",
           branch = 0.3,
           under = TRUE,
           cex = 0.7)

# Predictions
dt_train_pred <- predict(dt_cv_model, train_data_cv)
dt_test_pred <- predict(dt_cv_model, test_data_cv)

# Convert back to 0/1
dt_train_pred_01 <- factor(ifelse(dt_train_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))
dt_test_pred_01 <- factor(ifelse(dt_test_pred == "Yes", "1", "0"), 
                          levels = c("0", "1"))

# Evaluation
results$dt$train <- evaluate_model(dt_train_pred_01, train_data$Y, "Decision Tree", "Train Data (CV)")
results$dt$test <- evaluate_model(dt_test_pred_01, test_data$Y, "Decision Tree", "Test Data")

# Probabilities for ROC
dt_test_prob <- predict(dt_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$dt <- dt_cv_model
```

### Random Forest

```{r}
# Train Random Forest with CV (seed set beforehand for reproducible folds)
set.seed(2024)
rf_cv_model <- train(Y ~ ., 
                     data = train_data_cv,
                     method = "rf",
                     trControl = ctrl_cv5,
                     tuneLength = 3,
                     ntree = 500,
                     metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(rf_cv_model$results)

# Plot variable importance
par(mfrow = c(1, 1))
varImpPlot <- varImp(rf_cv_model, scale = FALSE)
plot(varImpPlot, 
     main = "Random Forest - Variable Importance (CV)",
     top = 5)

# Predictions
rf_train_pred <- predict(rf_cv_model, train_data_cv)
rf_test_pred <- predict(rf_cv_model, test_data_cv)

# Convert back to 0/1
rf_train_pred_01 <- factor(ifelse(rf_train_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))
rf_test_pred_01 <- factor(ifelse(rf_test_pred == "Yes", "1", "0"), 
                          levels = c("0", "1"))

# Evaluation
results$rf$train <- evaluate_model(rf_train_pred_01, train_data$Y, "Random Forest", "Train Data (CV)")
results$rf$test <- evaluate_model(rf_test_pred_01, test_data$Y, "Random Forest", "Test Data")

# Probabilities for ROC
rf_test_prob <- predict(rf_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$rf <- rf_cv_model
```

### Gradient Boosting

```{r}
# Train Gradient Boosting with CV (seed set beforehand for reproducible folds)
set.seed(2024)
gbm_cv_model <- train(Y ~ ., 
                      data = train_data_cv,
                      method = "gbm",
                      trControl = ctrl_cv5,
                      verbose = FALSE,
                      tuneGrid = expand.grid(
                        n.trees = c(100, 200, 300),
                        interaction.depth = c(3, 5, 7),
                        shrinkage = c(0.01, 0.1, 0.3),
                        n.minobsinnode = 10
                      ),
                      metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(gbm_cv_model$results)

# Predictions
gbm_train_pred <- predict(gbm_cv_model, train_data_cv)
gbm_test_pred <- predict(gbm_cv_model, test_data_cv)

# Convert back to 0/1
gbm_train_pred_01 <- factor(ifelse(gbm_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
gbm_test_pred_01 <- factor(ifelse(gbm_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluation
results$gbm$train <- evaluate_model(gbm_train_pred_01, train_data$Y, "Gradient Boosting", "Train Data (CV)")
results$gbm$test <- evaluate_model(gbm_test_pred_01, test_data$Y, "Gradient Boosting", "Test Data")

# Probabilities for ROC
gbm_test_prob <- predict(gbm_cv_model, test_data_cv, type = "prob")[, "Yes"]
cv_results$gbm <- gbm_cv_model
```

### Artificial Neural Network

```{r}
# Normalize the data for the ANN
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}

train_norm_cv <- as.data.frame(lapply(train_data_cv[, -ncol(train_data_cv)], normalize))
test_norm_cv <- as.data.frame(lapply(test_data_cv[, -ncol(test_data_cv)], normalize))
train_norm_cv$Y <- train_data_cv$Y
test_norm_cv$Y <- test_data_cv$Y
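# Note: train and test are each rescaled with their own min/max above; strictly,
# the test set should be rescaled using the train min/max (e.g., caret's
# preProcess = c("range")) so both splits share one scale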

# Train the neural network with CV (seed set beforehand for reproducible folds)
set.seed(2024)
ann_cv_model <- train(Y ~ ., 
                      data = train_norm_cv,
                      method = "nnet",
                      trControl = ctrl_cv5,
                      tuneGrid = expand.grid(
                        size = c(5, 10, 15),
                        decay = c(0.001, 0.01, 0.1)
                      ),
                      maxit = 200,
                      trace = FALSE,
                      metric = "ROC")

# Show CV results
cat("\nCross-Validation Results:\n")
print(ann_cv_model$results)

# Predictions
ann_train_pred <- predict(ann_cv_model, train_norm_cv)
ann_test_pred <- predict(ann_cv_model, test_norm_cv)

# Convert back to 0/1
ann_train_pred_01 <- factor(ifelse(ann_train_pred == "Yes", "1", "0"), 
                            levels = c("0", "1"))
ann_test_pred_01 <- factor(ifelse(ann_test_pred == "Yes", "1", "0"), 
                           levels = c("0", "1"))

# Evaluation
results$ann$train <- evaluate_model(ann_train_pred_01, train_data$Y, "Neural Network", "Train Data (CV)")
results$ann$test <- evaluate_model(ann_test_pred_01, test_data$Y, "Neural Network", "Test Data")

# Probabilities for ROC
ann_test_prob <- predict(ann_cv_model, test_norm_cv, type = "prob")[, "Yes"]
cv_results$ann <- ann_cv_model
```

```{r}
library(NeuralNetTools) 
plotnet(ann_cv_model,
        alpha = 0.6,
        circle_col = list('lightblue', 'lightgreen', 'lightpink'),
        bord_col = 'black',
        max_sp = TRUE,
        rel_rsc = c(2, 8),
        x_names = names(train_norm_cv)[-ncol(train_norm_cv)],
        y_names = c("No", "Yes"),
        main = "Neural Network Architecture")
```

### Model Comparison

```{r}
cat("\n", strrep("=", 60))
cat("\nROC CURVES COMPARISON (ALL MODELS WITH CV)")
cat("\n", strrep("=", 60), "\n")

# Set up the ROC plot
par(mfrow = c(1, 1), mar = c(5, 5, 4, 2) + 0.1)

# Distinct, high-contrast colors
colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", 
            "#FF7F00", "#FFFF33", "#A65628", "#F781BF")

# Start with an empty plot
plot(NULL, xlim = c(0, 1), ylim = c(0, 1),
     xlab = "False Positive Rate", 
     ylab = "True Positive Rate",
     main = "ROC Curves - All Models (5-Fold CV)",
     cex.main = 1.5,
     cex.lab = 1.2)
grid(col = "gray90", lty = 2)
abline(a = 0, b = 1, col = "gray50", lty = 2, lwd = 2)

# List to store the AUC values
auc_values <- list()

# Models and their test-set probabilities for the ROC curves
model_names <- c("Logistic Regression", "SVM", "Naive Bayes", "KNN",
                 "Decision Tree", "Random Forest", "Gradient Boosting", "Neural Network")
probs_list <- list(logit_test_prob, svm_test_prob, nb_test_prob, knn_test_prob,
                   dt_test_prob, rf_test_prob, gbm_test_prob, ann_test_prob)

# Same actual labels for every model
actual_numeric <- as.numeric(test_data$Y) - 1

for(i in 1:length(model_names)) {
  # Compute the ROC curve
  roc_obj <- roc(actual_numeric, probs_list[[i]])
  auc_val <- auc(roc_obj)
  auc_values[[model_names[i]]] <- auc_val
  
  # Plot the ROC curve (x-axis is FPR = 1 - specificity)
  lines(1 - roc_obj$specificities, roc_obj$sensitivities, 
        col = colors[i], lwd = 3)
  
  # Mark the optimal cut-off (Youden index)
  youden_index <- roc_obj$sensitivities + roc_obj$specificities - 1
  optimal_idx <- which.max(youden_index)
  points(1 - roc_obj$specificities[optimal_idx], 
         roc_obj$sensitivities[optimal_idx], 
         col = colors[i], pch = 19, cex = 1.5)
}

# Add a legend with the AUC values
legend("bottomright", 
       legend = paste(model_names, " (AUC=", 
                      round(unlist(auc_values), 4), ")", sep = ""),
       col = colors, 
       lwd = 3, 
       cex = 0.8,
       bg = "white",
       box.col = "black",
       text.col = "black")

cat("\n", strrep("=", 60))
cat("\nACCURACY COMPARISON TABLE (5-FOLD CV)")
cat("\n", strrep("=", 60), "\n")

# Build the comparison dataframe
comparison_df <- data.frame(
  Model = model_names,
  Train_Accuracy = c(
    results$logit$train$accuracy,
    results$svm$train$accuracy,
    results$nb$train$accuracy,
    results$knn$train$accuracy,
    results$dt$train$accuracy,
    results$rf$train$accuracy,
    results$gbm$train$accuracy,
    results$ann$train$accuracy
  ),
  Test_Accuracy = c(
    results$logit$test$accuracy,
    results$svm$test$accuracy,
    results$nb$test$accuracy,
    results$knn$test$accuracy,
    results$dt$test$accuracy,
    results$rf$test$accuracy,
    results$gbm$test$accuracy,
    results$ann$test$accuracy
  ),
  AUC = unlist(auc_values)
)

# Add an overfitting column (train accuracy minus test accuracy)
comparison_df$Overfitting <- comparison_df$Train_Accuracy - comparison_df$Test_Accuracy

# Add the F1-score column
comparison_df$F1_Score <- c(
  results$logit$test$f1_score,
  results$svm$test$f1_score,
  results$nb$test$f1_score,
  results$knn$test$f1_score,
  results$dt$test$f1_score,
  results$rf$test$f1_score,
  results$gbm$test$f1_score,
  results$ann$test$f1_score
)

# Sort by highest AUC
comparison_df <- comparison_df[order(-comparison_df$AUC), ]

# Round for tidier output
print_df <- comparison_df
print_df[, 2:6] <- round(print_df[, 2:6], 4)
print(print_df)

barplot(height = t(as.matrix(comparison_df[, c("Train_Accuracy", "Test_Accuracy")])),
        beside = TRUE,
        names.arg = comparison_df$Model,
        col = c("#66C2A5", "#FC8D62"),
        las = 2,
        ylim = c(0, 1),
        main = "Train vs Test Accuracy (5-Fold CV)",
        ylab = "Accuracy",
        cex.names = 0.8,
        border = NA)
legend("topright", 
       legend = c("Train (CV)", "Test"), 
       fill = c("#66C2A5", "#FC8D62"),
       bg = "white",
       box.col = "black")
grid(nx = NA, ny = NULL, col = "gray90")

# Keep the bar midpoints returned by barplot() so the labels line up
bp_auc <- barplot(comparison_df$AUC,
        names.arg = comparison_df$Model,
        col = "#8DA0CB",
        las = 2,
        ylim = c(0, 1),
        main = "AUC Values Comparison (5-Fold CV)",
        ylab = "AUC",
        cex.names = 0.8,
        border = NA)
abline(h = 0.5, col = "red", lty = 2, lwd = 2)
text(x = bp_auc, 
     y = comparison_df$AUC,
     labels = round(comparison_df$AUC, 3),
     pos = 3, cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")

bp_f1 <- barplot(comparison_df$F1_Score,
        names.arg = comparison_df$Model,
        col = "#E78AC3",
        las = 2,
        ylim = c(0, 1),
        main = "F1-Score Comparison (5-Fold CV)",
        ylab = "F1-Score",
        cex.names = 0.8,
        border = NA)
text(x = bp_f1, 
     y = comparison_df$F1_Score,
     labels = round(comparison_df$F1_Score, 3),
     pos = 3, cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")

bp_of <- barplot(comparison_df$Overfitting,
        names.arg = comparison_df$Model,
        col = ifelse(comparison_df$Overfitting > 0.1, "#FB8072", "#B3DE69"),
        las = 2,
        ylim = c(min(comparison_df$Overfitting) - 0.05, 
                 max(comparison_df$Overfitting) + 0.05),
        main = "Overfitting (5-Fold CV)",
        ylab = "Train Acc - Test Acc",
        cex.names = 0.8,
        border = NA)
abline(h = 0, col = "black", lwd = 1)
abline(h = 0.05, col = "orange", lty = 2, lwd = 1)
abline(h = 0.1, col = "red", lty = 2, lwd = 1)
text(x = bp_of, 
     y = comparison_df$Overfitting,
     labels = round(comparison_df$Overfitting, 3),
     pos = ifelse(comparison_df$Overfitting >= 0, 3, 1), 
     cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")


# Summarize CV performance; trainControl used twoClassSummary, so the results
# tables contain ROC/Sens/Spec (there is no Accuracy column to extract)
cv_summary <- data.frame(
  Model = model_names,
  CV_ROC = sapply(cv_results, function(x) max(x$results$ROC, na.rm = TRUE)),
  Best_Params = sapply(cv_results, function(x) {
    params <- x$bestTune
    paste(names(params), "=", params, collapse = ", ")
  })
)
print(cv_summary)
```
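
The Youden-optimal cut-offs marked on the ROC curves can also be used as classification thresholds in place of the default 0.5. A minimal sketch for one model, reusing `actual_numeric`, `rf_test_prob`, and `test_data` from above (illustrative, not part of the original run):

```{r, eval=FALSE}
roc_rf <- roc(actual_numeric, rf_test_prob)
# Threshold maximizing sensitivity + specificity - 1 (Youden index)
thr <- coords(roc_rf, "best", best.method = "youden")$threshold
rf_pred_youden <- factor(ifelse(rf_test_prob >= thr, "1", "0"), levels = c("0", "1"))
confusionMatrix(rf_pred_youden, test_data$Y)
```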

# Regression Modeling

```{r}
library(AER)
data("CreditCard")
data <- CreditCard
```

```{r}
glimpse(data)
```

```{r}
# Load the required libraries
library(tidyverse)
library(caret)
library(Metrics)
library(ggplot2)
library(splines)
library(pROC)
library(e1071)
library(randomForest)
library(gbm)
library(rpart)
library(rpart.plot)
library(nnet)
library(kknn)
library(glmnet)
library(patchwork)
library(NeuralNetTools)

cat("DATA PREPARATION\n")
cat(strrep("=", 60), "\n")

# Inspect the data
cat("Data structure:\n")
str(data)

# Check that 'expenditure' exists in the data
if(!"expenditure" %in% names(data)) {
  stop("Column 'expenditure' not found in the data.")
}

# Make sure expenditure is numeric
if(is.factor(data$expenditure)) {
  data$expenditure <- as.numeric(as.character(data$expenditure))
  cat("\nExpenditure converted from factor to numeric\n")
}

cat("\n", strrep("=", 60))
cat("\nSPLIT DATA (80% TRAIN - 20% TEST)")
cat("\n", strrep("=", 60), "\n")

set.seed(2024)
train_index <- createDataPartition(data$expenditure, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

cat("\nData split results:\n")
cat("Total data  :", nrow(data), "observations\n")
cat("Train data  :", nrow(train_data), "observations (", round(nrow(train_data)/nrow(data)*100, 1), "%)\n")
cat("Test data   :", nrow(test_data), "observations (", round(nrow(test_data)/nrow(data)*100, 1), "%)\n")

# Check the distribution of expenditure
cat("\nExpenditure statistics:\n")
summary_stats <- data.frame(
  Dataset = c("Train", "Test", "Full"),
  N = c(nrow(train_data), nrow(test_data), nrow(data)),
  Mean = c(mean(train_data$expenditure), mean(test_data$expenditure), mean(data$expenditure)),
  SD = c(sd(train_data$expenditure), sd(test_data$expenditure), sd(data$expenditure)),
  Min = c(min(train_data$expenditure), min(test_data$expenditure), min(data$expenditure)),
  Max = c(max(train_data$expenditure), max(test_data$expenditure), max(data$expenditure))
)
print(summary_stats)

ctrl_cv5 <- trainControl(method = "cv", 
                         number = 5,
                         savePredictions = TRUE,
                         verboseIter = FALSE)

cat("\nCross-Validation setup: 5-fold CV\n")

evaluate_regression <- function(predictions, actual, model_name, data_type, plot_residuals = TRUE) {
  # Compute the metrics
  rmse_val <- rmse(actual, predictions)
  mae_val <- mae(actual, predictions)
  mse_val <- mse(actual, predictions)
  r2_val <- cor(actual, predictions)^2
  
  # Print the results
  cat("\n", strrep("=", 50), "\n")
  cat(model_name, "-", data_type, "\n")
  cat(strrep("=", 50), "\n")
  
  cat("RMSE:", round(rmse_val, 4), "\n")
  cat("MAE :", round(mae_val, 4), "\n")
  cat("MSE :", round(mse_val, 4), "\n")
  cat("R²  :", round(r2_val, 4), "\n")
  
  # Plot if requested
  if(plot_residuals) {
    plot_regression_results(predictions, actual, model_name, data_type)
  }
  
  return(list(
    rmse = rmse_val,
    mae = mae_val,
    mse = mse_val,
    r2 = r2_val,
    predictions = predictions
  ))
}

plot_regression_results <- function(predicted, actual, model_name, data_type) {
  # Setup layout
  par(mfrow = c(2, 2), mar = c(4, 4, 3, 2))
  
  plot(actual, predicted,
       main = paste("Predicted vs Actual -", model_name),
       xlab = "Actual Expenditure",
       ylab = "Predicted Expenditure",
       pch = 19, col = "blue", cex = 0.7)
  abline(0, 1, col = "red", lwd = 2)
  grid()
  
  residuals <- actual - predicted
  fitted <- predicted
  plot(fitted, residuals,
       main = "Residuals vs Fitted",
       xlab = "Fitted Values",
       ylab = "Residuals",
       pch = 19, col = "darkgreen", cex = 0.7)
  abline(h = 0, col = "red", lwd = 2)
  grid()
  
  hist(residuals,
       main = "Distribution of Residuals",
       xlab = "Residuals",
       col = "lightblue",
       border = "black",
       breaks = 30)
  abline(v = 0, col = "red", lwd = 2)
  
  qqnorm(residuals, main = "Q-Q Plot of Residuals")
  qqline(residuals, col = "red", lwd = 2)
  
  # Reset layout
  par(mfrow = c(1, 1))
}

results <- list()
cv_results <- list()
```
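
One caveat about `evaluate_regression()`: it reports R² as the squared correlation between actual and predicted values, which can differ from the 1 − SSE/SST definition when predictions are biased. A tiny illustration (not part of the analysis):

```{r, eval=FALSE}
actual <- c(10, 20, 30, 40)
pred   <- actual + 15    # perfectly correlated but shifted upward
cor(actual, pred)^2      # squared correlation: 1
1 - sum((actual - pred)^2) / sum((actual - mean(actual))^2)  # 1 - SSE/SST: -0.8
```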

### Polynomial Regression

```{r}
predictors <- names(train_data)[names(train_data) != "expenditure"]

if(length(predictors) == 0) {
  stop("No predictors found in the data.")
}

# Use only numeric predictors for polynomial regression
numeric_predictors <- predictors[sapply(train_data[predictors], is.numeric)]
if(length(numeric_predictors) == 0) {
  cat("No numeric predictors available for polynomial regression.\n")
  best_predictor <- predictors[1]
} else {
  # Correlation of each numeric predictor with expenditure
  correlations <- sapply(numeric_predictors, function(x) {
    cor(train_data[[x]], train_data$expenditure, use = "complete.obs")
  })
  best_predictor <- names(which.max(abs(correlations)))
  cat("Using the numeric predictor with the highest correlation:", best_predictor, 
      "(correlation =", round(correlations[best_predictor], 3), ")\n")
}

# Fit polynomials of several degrees and compare
polynomial_results <- data.frame()
best_poly_degree <- 2
best_poly_rmse <- Inf

for(degree in 2:5) {
  cat("\nPolynomial Degree:", degree, "\n")
  
  # Build the formula
  formula_poly <- as.formula(paste("expenditure ~ poly(", best_predictor, ", ", degree, ", raw=TRUE)"))
  
  tryCatch({
    # Train with CV
    poly_model <- train(formula_poly,
                        data = train_data,
                        method = "lm",
                        trControl = ctrl_cv5)
    
    # Predictions
    poly_train_pred <- predict(poly_model, train_data)
    poly_test_pred <- predict(poly_model, test_data)
    
    # Evaluation
    train_eval <- evaluate_regression(poly_train_pred, train_data$expenditure, 
                                     paste("Poly Degree", degree), "Train Data", plot_residuals = FALSE)
    test_eval <- evaluate_regression(poly_test_pred, test_data$expenditure, 
                                    paste("Poly Degree", degree), "Test Data", plot_residuals = FALSE)
    
    # Store the result
    poly_result <- data.frame(
      Degree = degree,
      Train_RMSE = train_eval$rmse,
      Test_RMSE = test_eval$rmse,
      Overfitting = train_eval$rmse - test_eval$rmse,
      R2 = test_eval$r2
    )
    polynomial_results <- rbind(polynomial_results, poly_result)
    
    # Update best model
    if(test_eval$rmse < best_poly_rmse) {
      best_poly_rmse <- test_eval$rmse
      best_poly_degree <- degree
      best_poly_model <- poly_model
    }
  }, error = function(e) {
    cat("Error for degree", degree, ":", e$message, "\n")
  })
}

# Show the results across all degrees
if(nrow(polynomial_results) > 0) {
  cat("\nPolynomial Regression Results:\n")
  print(polynomial_results)
  
  # Report the best model
  cat("\nBest Polynomial Model: Degree", best_poly_degree, "with Test RMSE =", 
      round(best_poly_rmse, 4), "\n")
  
  # Plot the comparison
  if(require(ggplot2)) {
    p <- ggplot(polynomial_results, aes(x = Degree)) +
      geom_line(aes(y = Train_RMSE, color = "Train"), size = 1.5) +
      geom_line(aes(y = Test_RMSE, color = "Test"), size = 1.5) +
      geom_point(aes(y = Train_RMSE), size = 3) +
      geom_point(aes(y = Test_RMSE), size = 3) +
      labs(title = "Polynomial Regression: RMSE vs Degree",
           x = "Polynomial Degree",
           y = "RMSE",
           color = "Dataset") +
      theme_minimal() +
      scale_x_continuous(breaks = 2:5)
    print(p)
  }
  
  # Evaluate and plot the best model
  best_poly_test_pred <- predict(best_poly_model, test_data)
  results$poly <- evaluate_regression(best_poly_test_pred, test_data$expenditure, 
                                     paste("Best Poly (Degree", best_poly_degree, ")"), 
                                     "Test Data")
} else {
  cat("\nNo polynomial model was trained successfully.\n")
}
```
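
A note on `raw = TRUE`: raw powers of a predictor are often highly collinear, whereas `poly()` without `raw = TRUE` builds orthogonal polynomial columns that fit exactly the same curve with better-conditioned coefficients. A minimal sketch reusing `best_predictor` and `train_data` from above (illustrative only):

```{r, eval=FALSE}
x <- train_data[[best_predictor]]
cor(poly(x, 3, raw = TRUE))  # off-diagonal correlations typically near 1
cor(poly(x, 3))              # off-diagonal correlations essentially 0

# Both parameterizations produce identical fitted values
f_raw  <- lm(expenditure ~ poly(x, 3, raw = TRUE), data = train_data)
f_orth <- lm(expenditure ~ poly(x, 3), data = train_data)
all.equal(unname(fitted(f_raw)), unname(fitted(f_orth)))
```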

### Piecewise Constant Via Decision Tree

```{r}
tryCatch({
  # Use a depth-1 decision tree for a piecewise-constant fit
  pc_model <- train(expenditure ~ .,
                    data = train_data,
                    method = "rpart",
                    trControl = ctrl_cv5,
                    tuneGrid = data.frame(cp = 0.01),
                    control = rpart.control(maxdepth = 1))
  
  # Plot tree
  rpart.plot(pc_model$finalModel,
             main = "Piecewise Constant (Regression Stump)",
             type = 2,
             extra = 101,
             box.palette = "RdYlGn")
  
  # Predictions
  pc_train_pred <- predict(pc_model, train_data)
  pc_test_pred <- predict(pc_model, test_data)
  
  # Evaluation
  results$piecewise <- evaluate_regression(pc_test_pred, test_data$expenditure, 
                                          "Piecewise Constant", "Test Data")
  cv_results$piecewise <- pc_model
}, error = function(e) {
  cat("Error dalam Piecewise Constant Regression:", e$message, "\n")
})

```
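
To see why a depth-1 tree yields a piecewise-constant fit, here is a minimal hand-rolled sketch: one split point, and the mean response on each side. The `split_point` below is just a placeholder (the median); rpart instead searches for the split that minimizes the residual sum of squares.

```{r}
# Illustrative sketch only -- not part of the modeling pipeline
stump_x <- train_data[[best_predictor]]
stump_y <- train_data$expenditure
split_point <- median(stump_x)                      # placeholder; rpart optimizes this
fit_left <- mean(stump_y[stump_x <= split_point])   # constant fit, left segment
fit_right <- mean(stump_y[stump_x > split_point])   # constant fit, right segment
stump_pred <- ifelse(stump_x <= split_point, fit_left, fit_right)
```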

### Cubic Spline

```{r}
tryCatch({
  library(splines)  # provides bs() for the B-spline basis
  
  # Place interior knots at evenly spaced quantiles of the chosen predictor
  n_knots <- min(3, floor(nrow(train_data) / 20))
  if(n_knots < 1) n_knots <- 1
  
  knots <- quantile(train_data[[best_predictor]], 
                    probs = seq(0, 1, length.out = n_knots + 2)[-c(1, n_knots + 2)])
  
  # Build the B-spline basis; the test basis must use the *training* boundary
  # knots so both bases describe the same functions
  bknots <- range(train_data[[best_predictor]])
  cubic_spline_train <- bs(train_data[[best_predictor]], 
                           knots = knots, 
                           degree = 3,
                           intercept = TRUE,
                           Boundary.knots = bknots)
  cubic_spline_test <- bs(test_data[[best_predictor]], 
                          knots = knots,
                          degree = 3,
                          intercept = TRUE,
                          Boundary.knots = bknots)
  
  # Give the basis columns syntactic names so predict() can match them later
  colnames(cubic_spline_train) <- paste0("bs", seq_len(ncol(cubic_spline_train)))
  colnames(cubic_spline_test) <- paste0("bs", seq_len(ncol(cubic_spline_test)))
  
  # Combine the spline basis with the remaining predictors
  other_predictors <- setdiff(predictors, best_predictor)
  if(length(other_predictors) > 0) {
    train_spline <- cbind(as.data.frame(cubic_spline_train), 
                         train_data[, other_predictors, drop = FALSE])
    test_spline <- cbind(as.data.frame(cubic_spline_test), 
                        test_data[, other_predictors, drop = FALSE])
  } else {
    train_spline <- as.data.frame(cubic_spline_train)
    test_spline <- as.data.frame(cubic_spline_test)
  }
  
  train_spline$expenditure <- train_data$expenditure
  test_spline$expenditure <- test_data$expenditure
  
  # Train the linear model on the spline basis
  cs_model <- train(expenditure ~ .,
                    data = train_spline,
                    method = "lm",
                    trControl = ctrl_cv5)
  
  # Predictions
  cs_train_pred <- predict(cs_model, train_spline)
  cs_test_pred <- predict(cs_model, test_spline)
  
  # Evaluation
  results$cubic_spline <- evaluate_regression(cs_test_pred, test_data$expenditure, 
                                             "Cubic Spline", "Test Data")
  
  # Plot the fitted spline over a grid of the chosen predictor, holding any
  # remaining predictors fixed at their mean (numeric) or first value (factor);
  # the grid basis reuses the training knot names so predict() can match columns
  if(require(ggplot2)) {
    x_grid <- seq(min(train_data[[best_predictor]]), 
                  max(train_data[[best_predictor]]), length.out = 100)
    grid_newdata <- as.data.frame(bs(x_grid, knots = knots, degree = 3, 
                                     intercept = TRUE, Boundary.knots = bknots))
    colnames(grid_newdata) <- paste0("bs", seq_len(ncol(grid_newdata)))
    for(var in other_predictors) {
      grid_newdata[[var]] <- if(is.numeric(train_data[[var]])) {
        mean(train_data[[var]], na.rm = TRUE)
      } else {
        train_data[[var]][1]
      }
    }
    plot_data <- data.frame(x = x_grid, 
                            y = predict(cs_model, newdata = grid_newdata))
    
    p <- ggplot() +
      geom_point(data = train_data, aes(x = .data[[best_predictor]], y = expenditure), 
                 color = "blue", alpha = 0.5) +
      geom_line(data = plot_data, aes(x = x, y = y), 
                color = "red", linewidth = 1.5) +
      geom_vline(xintercept = knots, linetype = "dashed", color = "gray") +
      labs(title = "Cubic Spline Regression",
           x = best_predictor,
           y = "Expenditure") +
      theme_minimal()
    print(p)
  }
}, error = function(e) {
  cat("Error dalam Cubic Spline Regression:", e$message, "\n")
})
```
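
A closely related alternative is the natural cubic spline (`splines::ns()`), which constrains the fit to be linear beyond the boundary knots and therefore extrapolates more stably than `bs()`. A minimal sketch on the same predictor and knots, not included in the comparison below:

```{r}
# Sketch only: natural cubic spline on the same predictor and knots
library(splines)
ns_train <- as.data.frame(ns(train_data[[best_predictor]], knots = knots))
colnames(ns_train) <- paste0("ns", seq_len(ncol(ns_train)))
ns_train$expenditure <- train_data$expenditure
ns_model <- train(expenditure ~ ., data = ns_train, method = "lm", trControl = ctrl_cv5)
```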

### KNN Regression

```{r}
tryCatch({
  # Train KNN with 5-fold CV, tuning k over odd values
  knn_reg_model <- train(expenditure ~ .,
                         data = train_data,
                         method = "knn",
                         trControl = ctrl_cv5,
                         tuneGrid = expand.grid(k = seq(3, min(21, nrow(train_data)-1), by = 2)),
                         preProcess = c("center", "scale"))
  
  # Show tuning results
  cat("\nKNN Tuning Results:\n")
  print(knn_reg_model$results)
  cat("\nBest K:", knn_reg_model$bestTune$k, "\n")
  
  # Plot tuning results
  plot(knn_reg_model, main = "KNN Regression: RMSE vs K")
  
  # Predictions
  knn_reg_train_pred <- predict(knn_reg_model, train_data)
  knn_reg_test_pred <- predict(knn_reg_model, test_data)
  
  # Evaluation
  results$knn_reg <- evaluate_regression(knn_reg_test_pred, test_data$expenditure, 
                                        "KNN Regression", "Test Data")
  cv_results$knn_reg <- knn_reg_model
}, error = function(e) {
  cat("Error dalam KNN Regression:", e$message, "\n")
})
```
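
KNN regression has no parametric form: after centering and scaling, the prediction at a point $x_0$ is the average response of its $k$ nearest training neighbors,

$$
\hat{y}(x_0) = \frac{1}{k} \sum_{i \in N_k(x_0)} y_i,
$$

where $N_k(x_0)$ indexes the $k$ closest training points in Euclidean distance. Small $k$ gives a flexible but high-variance fit; large $k$ smooths toward the global mean, and the CV over `k` above resolves exactly this trade-off.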

### Regression Tree

```{r}
tryCatch({
  # Train the regression tree with 5-fold CV
  tree_model <- train(expenditure ~ .,
                      data = train_data,
                      method = "rpart",
                      trControl = ctrl_cv5,
                      tuneLength = 5)
  
  # Show tuning results
  cat("\nRegression Tree Tuning Results:\n")
  print(tree_model$results)
  
  # Plot tree
  rpart.plot(tree_model$finalModel,
             main = "Regression Tree (CV Optimized)",
             type = 4,
             extra = 101,
             box.palette = "RdYlGn",
             fallen.leaves = TRUE)
  
  # Predictions
  tree_train_pred <- predict(tree_model, train_data)
  tree_test_pred <- predict(tree_model, test_data)
  
  # Evaluation
  results$tree <- evaluate_regression(tree_test_pred, test_data$expenditure, 
                                     "Regression Tree", "Test Data")
  cv_results$tree <- tree_model
}, error = function(e) {
  cat("Error dalam Regression Tree:", e$message, "\n")
})
```
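
The `cp` values caret tunes here govern rpart's cost-complexity pruning: for a subtree $T$ with training error $R(T)$ and $|T|$ terminal nodes, rpart minimizes

$$
R_\alpha(T) = R(T) + \alpha\,|T|,
$$

where `cp` is $\alpha$ rescaled by the root-node error, so a split is retained only if it reduces the relative error by at least `cp`.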

### Random Forest

```{r}
tryCatch({
  # Train the random forest with 5-fold CV
  rf_reg_model <- train(expenditure ~ .,
                        data = train_data,
                        method = "rf",
                        trControl = ctrl_cv5,
                        tuneLength = 2,
                        ntree = 100,
                        importance = TRUE)
  
  # Show tuning results
  cat("\nRandom Forest Tuning Results:\n")
  print(rf_reg_model$results)
  
  # Plot variable importance (use a name other than varImpPlot to avoid
  # confusion with randomForest::varImpPlot)
  rf_imp <- varImp(rf_reg_model, scale = FALSE)
  plot(rf_imp, 
       main = "Random Forest - Variable Importance",
       top = min(10, ncol(train_data) - 1))
  
  # Plot OOB error
  if("finalModel" %in% names(rf_reg_model)) {
    oob_error <- rf_reg_model$finalModel$mse
    plot(1:length(oob_error), oob_error,
         type = "l", col = "red", lwd = 2,
         main = "Random Forest OOB Error",
         xlab = "Number of Trees",
         ylab = "OOB MSE")
    grid()
  }
  
  # Predictions
  rf_reg_train_pred <- predict(rf_reg_model, train_data)
  rf_reg_test_pred <- predict(rf_reg_model, test_data)
  
  # Evaluation
  results$rf_reg <- evaluate_regression(rf_reg_test_pred, test_data$expenditure, 
                                       "Random Forest", "Test Data")
  cv_results$rf_reg <- rf_reg_model
}, error = function(e) {
  cat("Error dalam Random Forest Regression:", e$message, "\n")
})
```
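
With `tuneLength = 2`, caret tries only two `mtry` values. An explicit grid gives finer control; the sketch below uses the common heuristics `sqrt(p)` and `p/3` (the usual classification and regression defaults) plus `p/2`, purely as illustrative candidates.

```{r}
# Optional sketch: an explicit mtry grid instead of tuneLength = 2
p <- ncol(train_data) - 1                        # number of predictors
rf_grid <- expand.grid(mtry = sort(unique(pmax(1, floor(c(sqrt(p), p / 3, p / 2))))))
# rf_reg_model <- train(expenditure ~ ., data = train_data, method = "rf",
#                       trControl = ctrl_cv5, tuneGrid = rf_grid, ntree = 100)
```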

### Neural Network

```{r}
tryCatch({
  # Min-max scale the numeric predictors. The test set must be scaled with the
  # *training* minima/maxima; scaling it by its own range would leak test-set
  # information and put the two sets on different scales
  numeric_vars <- names(train_data)[sapply(train_data, is.numeric) & names(train_data) != "expenditure"]
  
  train_norm <- train_data
  test_norm <- test_data
  
  for(var in numeric_vars) {
    rng_min <- min(train_data[[var]], na.rm = TRUE)
    rng_max <- max(train_data[[var]], na.rm = TRUE)
    if(rng_max > rng_min) {  # skip constant columns
      train_norm[[var]] <- (train_data[[var]] - rng_min) / (rng_max - rng_min)
      test_norm[[var]] <- (test_data[[var]] - rng_min) / (rng_max - rng_min)
    }
  }
  
  # Train the neural network with 5-fold CV
  nn_reg_model <- train(expenditure ~ .,
                        data = train_norm,
                        method = "nnet",
                        trControl = ctrl_cv5,
                        tuneGrid = expand.grid(
                          size = c(3, 5, 7),
                          decay = c(0.001, 0.01)
                        ),
                        linout = TRUE,  # linear output unit, required for regression
                        maxit = 100,
                        trace = FALSE)
  
  # Show tuning results
  cat("\nNeural Network Tuning Results:\n")
  print(nn_reg_model$results)
  
  # Plot the network architecture (plotnet's transparency argument is alpha_val,
  # not alpha)
  if(require(NeuralNetTools) && max(nn_reg_model$results$size) <= 10) {
    plotnet(nn_reg_model$finalModel,
            alpha_val = 0.6,
            circle_col = "lightblue",
            bord_col = "black")
    title("Neural Network Architecture")
  }
  
  # Predictions
  nn_reg_train_pred <- predict(nn_reg_model, train_norm)
  nn_reg_test_pred <- predict(nn_reg_model, test_norm)
  
  # Evaluation
  results$nn_reg <- evaluate_regression(nn_reg_test_pred, test_data$expenditure, 
                                       "Neural Network", "Test Data")
  cv_results$nn_reg <- nn_reg_model
}, error = function(e) {
  cat("Error dalam Neural Network Regression:", e$message, "\n")
})
```
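
The model nnet fits is a single-hidden-layer network; `linout = TRUE` makes the output unit linear (no sigmoid at the output), which regression requires:

$$
\hat{y}(x) = \beta_0 + \sum_{h=1}^{H} \beta_h\, \sigma\!\left(\alpha_{0h} + \alpha_h^{\top} x\right),
\qquad \sigma(z) = \frac{1}{1 + e^{-z}},
$$

where $H$ is the tuned `size` and `decay` is an $L_2$ penalty on the weights.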

### Gradient Boosting

```{r}
tryCatch({
  # Train gradient boosting with 5-fold CV over a small grid
  gbm_reg_model <- train(expenditure ~ .,
                         data = train_data,
                         method = "gbm",
                         trControl = ctrl_cv5,
                         verbose = FALSE,
                         tuneGrid = expand.grid(
                           n.trees = c(50, 100),
                           interaction.depth = c(2, 3),
                           shrinkage = c(0.01, 0.1),
                           n.minobsinnode = 5
                         ))
  
  # Show tuning results
  cat("\nGradient Boosting Tuning Results:\n")
  print(gbm_reg_model$results)
  
  # Plot variable importance
  varImpPlot_gbm <- varImp(gbm_reg_model, scale = FALSE)
  plot(varImpPlot_gbm, 
       main = "Gradient Boosting - Variable Importance",
       top = min(10, ncol(train_data) - 1))
  
  # Predictions
  gbm_reg_train_pred <- predict(gbm_reg_model, train_data)
  gbm_reg_test_pred <- predict(gbm_reg_model, test_data)
  
  # Evaluation
  results$gbm_reg <- evaluate_regression(gbm_reg_test_pred, test_data$expenditure, 
                                        "Gradient Boosting", "Test Data")
  cv_results$gbm_reg <- gbm_reg_model
}, error = function(e) {
  cat("Error dalam Gradient Boosting Regression:", e$message, "\n")
})
```
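
The tuning grid maps directly onto gbm's stagewise additive model,

$$
F_m(x) = F_{m-1}(x) + \nu\, h_m(x), \qquad m = 1, \dots, M,
$$

where $M$ is `n.trees`, the learning rate $\nu$ is `shrinkage`, and each $h_m$ is a small regression tree of depth `interaction.depth` fit to the current residuals (the negative gradient of the squared-error loss).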

### SVM

```{r}
tryCatch({
  # Convert factor variables to numeric for SVM
  train_svm <- train_data
  test_svm <- test_data
  
  for(var in names(train_svm)) {
    if(is.factor(train_svm[[var]])) {
      train_svm[[var]] <- as.numeric(train_svm[[var]])
      test_svm[[var]] <- as.numeric(test_svm[[var]])
    }
  }
  
  # Train SVM regression (RBF kernel) with 5-fold CV
  svm_reg_model <- train(expenditure ~ .,
                         data = train_svm,
                         method = "svmRadial",
                         trControl = ctrl_cv5,
                         tuneLength = 3,
                         preProcess = c("center", "scale"))
  
  # Show tuning results
  cat("\nSVM Regression Tuning Results:\n")
  print(svm_reg_model$results)
  
  # Predictions
  svm_reg_train_pred <- predict(svm_reg_model, train_svm)
  svm_reg_test_pred <- predict(svm_reg_model, test_svm)
  
  # Evaluation
  results$svm_reg <- evaluate_regression(svm_reg_test_pred, test_data$expenditure, 
                                        "SVM Regression", "Test Data")
  cv_results$svm_reg <- svm_reg_model
}, error = function(e) {
  cat("Error dalam SVM Regression:", e$message, "\n")
})
```
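
`svmRadial` is kernlab's radial-basis-function SVM; for a numeric response it fits $\varepsilon$-insensitive support vector regression. In kernlab's parameterization the kernel is

$$
K(x, x') = \exp\!\left(-\sigma\,\lVert x - x' \rVert^2\right),
$$

and with `tuneLength = 3` caret varies the cost `C` over three values while $\sigma$ is estimated from the data (via `kernlab::sigest`).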

### Perbandingan

```{r}
model_names <- c("Polynomial", "Piecewise Constant", "Cubic Spline", 
                 "KNN", "Regression Tree", "Random Forest", "Neural Network",
                 "Gradient Boosting", "SVM")

# Initialize the comparison data frame
comparison_df <- data.frame(
  Model = model_names,
  Test_RMSE = rep(NA, length(model_names)),
  Test_R2 = rep(NA, length(model_names)),
  MAE = rep(NA, length(model_names))
)

# Fill in metrics for every model that trained successfully
result_keys <- c("Polynomial" = "poly",
                 "Piecewise Constant" = "piecewise",
                 "Cubic Spline" = "cubic_spline",
                 "KNN" = "knn_reg",
                 "Regression Tree" = "tree",
                 "Random Forest" = "rf_reg",
                 "Neural Network" = "nn_reg",
                 "Gradient Boosting" = "gbm_reg",
                 "SVM" = "svm_reg")

for(model in model_names) {
  res <- results[[result_keys[[model]]]]
  if(!is.null(res)) {
    comparison_df[comparison_df$Model == model, c("Test_RMSE", "Test_R2", "MAE")] <- 
      c(res$rmse, res$r2, res$mae)
  }
}

# Drop models that failed to train (rows with NA)
comparison_df <- comparison_df[complete.cases(comparison_df), ]

# Sort by test RMSE (smallest first)
if(nrow(comparison_df) > 0) {
  comparison_df <- comparison_df[order(comparison_df$Test_RMSE), ]
  comparison_df$Rank <- 1:nrow(comparison_df)
  
  cat("\nModel Performance Comparison (sorted by Test RMSE):\n")
  print(comparison_df)
  
  barplot(comparison_df$Test_RMSE,
          names.arg = comparison_df$Model,
          las = 2,
          col = ifelse(comparison_df$Rank == 1, "gold", "steelblue"),
          ylim = c(0, max(comparison_df$Test_RMSE) * 1.1),
          main = "Test RMSE Comparison",
          ylab = "RMSE",
          cex.names = 0.8)
  abline(h = min(comparison_df$Test_RMSE), col = "red", lty = 2, lwd = 2)
  
  barplot(comparison_df$Test_R2,
          names.arg = comparison_df$Model,
          las = 2,
          col = ifelse(comparison_df$Rank == 1, "gold", "darkgreen"),
          ylim = c(min(0, min(comparison_df$Test_R2)), 1),  # R² can be negative
          main = "Test R² Comparison",
          ylab = "R²",
          cex.names = 0.8)
  abline(h = max(comparison_df$Test_R2), col = "red", lty = 2, lwd = 2)
  
  barplot(comparison_df$MAE,
          names.arg = comparison_df$Model,
          las = 2,
          col = ifelse(comparison_df$Rank == 1, "gold", "darkorange"),
          ylim = c(0, max(comparison_df$MAE) * 1.1),
          main = "Test MAE Comparison",
          ylab = "MAE",
          cex.names = 0.8)
  abline(h = min(comparison_df$MAE), col = "red", lty = 2, lwd = 2)
  
  plot(comparison_df$Rank, comparison_df$Test_RMSE,
       type = "b", pch = 19, col = "blue", lwd = 2,
       xlab = "Rank (1 = Best)", ylab = "Test RMSE",
       main = "Model Ranking by RMSE",
       xlim = c(0.5, nrow(comparison_df) + 0.5))
  text(comparison_df$Rank, comparison_df$Test_RMSE,
       labels = comparison_df$Model,
       pos = 3, cex = 0.7, col = "darkred")
  grid()
  
  cat("\n", strrep("=", 60))
  cat("\nOVERFITTING ANALYSIS")
  cat("\n", strrep("=", 60), "\n")
  
  # Build the data frame for the overfitting analysis
  overfitting_df <- data.frame(
    Model = comparison_df$Model,
    Train_RMSE = NA,
    Test_RMSE = comparison_df$Test_RMSE,
    Overfitting_Index = NA
  )
  
  # Fill in train RMSE for each model; exists() guards against models that
  # failed to train (their prediction objects would be missing)
  train_pred_names <- c("Polynomial" = "poly_train_pred",
                        "Piecewise Constant" = "pc_train_pred",
                        "Cubic Spline" = "cs_train_pred",
                        "KNN" = "knn_reg_train_pred",
                        "Regression Tree" = "tree_train_pred",
                        "Random Forest" = "rf_reg_train_pred",
                        "Neural Network" = "nn_reg_train_pred",
                        "Gradient Boosting" = "gbm_reg_train_pred",
                        "SVM" = "svm_reg_train_pred")
  
  for(i in seq_len(nrow(overfitting_df))) {
    pred_name <- train_pred_names[overfitting_df$Model[i]]
    if(!is.na(pred_name) && exists(pred_name)) {
      overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, get(pred_name))
    }
  }
  
  # Overfitting index: relative excess of test RMSE over train RMSE
  # (positive = model fits the training data better than it generalizes)
  overfitting_df$Overfitting_Index <- (overfitting_df$Test_RMSE - overfitting_df$Train_RMSE) / overfitting_df$Train_RMSE
  
  cat("\nOverfitting Analysis (positive values indicate overfitting):\n")
  print(overfitting_df)
  
  # Plot train vs test RMSE side by side (dodged bars; overlaying two bar
  # layers would hide one behind the other)
  if(require(ggplot2) && require(tidyr)) {
    plot_long <- pivot_longer(overfitting_df,
                              cols = c("Train_RMSE", "Test_RMSE"),
                              names_to = "Dataset", values_to = "RMSE")
    plot_long$Dataset <- sub("_RMSE$", "", plot_long$Dataset)
    p <- ggplot(plot_long, aes(x = Model, y = RMSE, fill = Dataset)) +
      geom_col(position = "dodge", alpha = 0.8) +
      labs(title = "Train vs Test RMSE (Overfitting Analysis)",
           x = "Model",
           y = "RMSE",
           fill = "Dataset") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      scale_fill_manual(values = c("Train" = "blue", "Test" = "red"))
    print(p)
  }
  
} else {
  cat("\nTidak ada model yang berhasil di-train untuk perbandingan.\n")
}
```
:::
