1 Permodelan Klasifikasi
1.0.1 Studi Kasus
Kasus ini menggunakan data yang diberikan pada UAS tahun 2022
## # A tibble: 300 × 6
## AP XRay Stage Grade Age Y
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 128 0 1 1 54 1
## 2 46 0 1 1 55 0
## 3 165 0 0 1 53 1
## 4 102 0 1 1 50 1
## 5 42 0 1 0 61 0
## 6 52 0 1 0 55 0
## 7 112 0 0 0 52 0
## 8 146 0 1 0 55 1
## 9 157 0 1 1 59 1
## 10 174 0 1 0 60 1
## # ℹ 290 more rows
set.seed(2024)
bootstrap_indices <- sample(1:nrow(data), size = 300, replace = TRUE, set.seed(2024))
data <- data[bootstrap_indices, ]
cat("\nData setelah resampling (bootstrap n=300):\n")##
## Data setelah resampling (bootstrap n=300):
## # A tibble: 6 × 6
## AP XRay Stage Grade Age Y
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 114 0 1 1 53 1
## 2 82 1 1 1 62 1
## 3 141 0 0 1 51 1
## 4 122 0 1 0 67 0
## 5 73 1 0 1 58 0
## 6 188 0 1 1 53 1
##
## Dimensi data resampled: 300 6
## Proporsi kelas Y setelah resampling:
##
## 0 1
## 0.34 0.66
1.0.2 Permodelan dengan boostraping
1.0.3 Definisi Function, Library dan Dataframe
library(tidyverse)
library(caret)
library(pROC)
library(e1071)
library(randomForest)
library(gbm)
library(rpart)
library(rpart.plot)
library(nnet)
library(kknn)
data$Y <- factor(data$Y, levels = c("0", "1"))
set.seed(2024)
train_index <- createDataPartition(data$Y, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
train_data$Y <- factor(train_data$Y, levels = levels(data$Y))
test_data$Y <- factor(test_data$Y, levels = levels(data$Y))
ctrl_cv5 <- trainControl(method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE)
train_data_cv <- train_data
test_data_cv <- test_data
levels(train_data_cv$Y) <- c("No", "Yes")
levels(test_data_cv$Y) <- c("No", "Yes")
evaluate_model <- function(predictions, actual, model_name, data_type, plot_cm = TRUE) {
# Pastikan predictions dan actual memiliki level yang sama
if(!is.factor(predictions)) {
predictions <- factor(predictions, levels = levels(actual))
}
cm <- confusionMatrix(predictions, actual)
# Hitung metrik tambahan
precision <- cm$byClass["Pos Pred Value"]
recall <- cm$byClass["Sensitivity"]
f1_score <- 2 * (precision * recall) / (precision + recall)
# Print evaluasi
cat("\n", strrep("=", 50), "\n")
cat(model_name, "-", data_type, "\n")
cat(strrep("=", 50), "\n")
cat("Confusion Matrix:\n")
print(cm$table)
cat("\nEvaluation Metrics:\n")
cat("Accuracy :", round(cm$overall["Accuracy"], 4), "\n")
cat("Sensitivity :", round(cm$byClass["Sensitivity"], 4), "\n")
cat("Specificity :", round(cm$byClass["Specificity"], 4), "\n")
cat("Precision :", round(precision, 4), "\n")
cat("F1-Score :", round(f1_score, 4), "\n")
# Plot Confusion Matrix jika diminta
if(plot_cm) {
plot_confusion_matrix(cm, model_name, data_type)
}
return(list(
confusion = cm$table,
accuracy = cm$overall["Accuracy"],
sensitivity = cm$byClass["Sensitivity"],
specificity = cm$byClass["Specificity"],
precision = precision,
f1_score = f1_score
))
}
plot_confusion_matrix <- function(cm, model_name, data_type) {
cm_matrix <- as.matrix(cm$table)
# Hitung persentase berdasarkan total
total <- sum(cm_matrix)
cm_percent <- round((cm_matrix / total) * 100, 1)
# Buat plot yang lebih jelas
par(mfrow = c(1, 1), mar = c(5, 5, 5, 5))
# Buat warna yang kontras
colors <- colorRampPalette(c("#F6F6F6", "#FF6B6B", "#4ECDC4"))(100)
# Plot heatmap
image(1:ncol(cm_matrix), 1:nrow(cm_matrix),
t(cm_matrix),
col = colors,
xlab = "Predicted", ylab = "Actual",
xaxt = "n", yaxt = "n",
main = paste("Confusion Matrix\n", model_name, "-", data_type),
cex.main = 1.2)
# Tambahkan axis labels
axis(1, at = 1:ncol(cm_matrix), labels = c("0", "1"))
axis(2, at = 1:nrow(cm_matrix), labels = c("0", "1"), las = 2)
# Tambahkan grid
abline(h = seq(0.5, nrow(cm_matrix)+0.5, 1), col = "gray", lty = 3)
abline(v = seq(0.5, ncol(cm_matrix)+0.5, 1), col = "gray", lty = 3)
# Tambahkan teks di setiap sel
for(i in 1:nrow(cm_matrix)) {
for(j in 1:ncol(cm_matrix)) {
text(j, i,
paste(cm_matrix[i, j], "\n(", cm_percent[i, j], "%)", sep = ""),
cex = 1,
col = ifelse(cm_matrix[i, j] > max(cm_matrix)/2, "white", "black"),
font = 2)
}
}
# Tambahkan metrik di sudut
mtext(paste("Accuracy:", round(cm$overall["Accuracy"], 4)),
side = 3, line = 0.5, adj = 0.05, cex = 0.9, col = "darkblue")
}
plot_roc_curve <- function(probs, actual, model_name, color) {
roc_obj <- roc(actual, probs)
auc_val <- auc(roc_obj)
# Plot ROC dengan pengaturan yang lebih baik
plot(roc_obj,
main = paste("ROC Curve -", model_name),
col = color,
lwd = 3,
print.auc = TRUE,
auc.polygon = TRUE,
auc.polygon.col = adjustcolor(color, 0.2),
print.auc.col = "black", # Warna teks AUC hitam
print.auc.cex = 1.1, # Ukuran teks AUC
legacy.axes = TRUE, # False Positive Rate di sumbu x
xlab = "False Positive Rate",
ylab = "True Positive Rate",
grid = TRUE, # Tambahkan grid
grid.col = "gray90",
xlim = c(0, 1),
ylim = c(0, 1))
# Tambahkan diagonal line
abline(a = 0, b = 1, col = "gray", lty = 2, lwd = 2)
# Tambahkan judul AUC dengan background
legend("bottomright",
legend = paste("AUC =", round(auc_val, 4)),
bty = "n",
text.col = "darkblue",
cex = 1.1,
bg = "white",
box.col = "black")
return(auc_val)
}
results <- list()
cv_results <- list()1.0.4 Regresi Logistik
# Training dengan CV
logit_cv_model <- train(Y ~ .,
data = train_data_cv,
method = "glm",
family = binomial,
trControl = ctrl_cv5,
metric = "ROC", set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.9058675 0.6705882 0.8364919 0.04109693 0.1798955 0.02594077
# Prediksi untuk train data (dari model CV)
logit_train_pred <- predict(logit_cv_model, train_data_cv)
logit_test_pred <- predict(logit_cv_model, test_data_cv)
# Konversi kembali ke 0/1 untuk evaluasi
logit_train_pred_01 <- factor(ifelse(logit_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
logit_test_pred_01 <- factor(ifelse(logit_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi dengan plot
results$logit$train <- evaluate_model(logit_train_pred_01, train_data$Y,
"Logistic Regression", "Train Data (CV)")##
## ==================================================
## Logistic Regression - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 53 25
## 1 29 134
##
## Evaluation Metrics:
## Accuracy : 0.7759
## Sensitivity : 0.6463
## Specificity : 0.8428
## Precision : 0.6795
## F1-Score : 0.6625
results$logit$test <- evaluate_model(logit_test_pred_01, test_data$Y,
"Logistic Regression", "Test Data")##
## ==================================================
## Logistic Regression - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 16 9
## 1 4 30
##
## Evaluation Metrics:
## Accuracy : 0.7797
## Sensitivity : 0.8
## Specificity : 0.7692
## Precision : 0.64
## F1-Score : 0.7111
1.0.5 SVM
# Training SVM dengan CV
svm_cv_model <- train(Y ~ .,
data = train_data_cv,
method = "svmRadial",
trControl = ctrl_cv5,
tuneLength = 5,
metric = "ROC",set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## sigma C ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.1484624 0.25 0.8776743 0.6102941 0.8872984 0.04972337 0.07915092 0.06058208
## 2 0.1484624 0.50 0.8858649 0.5852941 0.8872984 0.04122446 0.02565639 0.06058208
## 3 0.1484624 1.00 0.8806407 0.5360294 0.8808468 0.04520850 0.03894283 0.05080185
## 4 0.1484624 2.00 0.8767048 0.5860294 0.8995968 0.04411341 0.09753393 0.05980211
## 5 0.1484624 4.00 0.8798254 0.5492647 0.8997984 0.03714262 0.06333781 0.07101884
# Prediksi
svm_train_pred <- predict(svm_cv_model, train_data_cv)
svm_test_pred <- predict(svm_cv_model, test_data_cv)
# Konversi kembali ke 0/1
svm_train_pred_01 <- factor(ifelse(svm_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
svm_test_pred_01 <- factor(ifelse(svm_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi
results$svm$train <- evaluate_model(svm_train_pred_01, train_data$Y, "SVM", "Train Data (CV)")##
## ==================================================
## SVM - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 58 16
## 1 24 143
##
## Evaluation Metrics:
## Accuracy : 0.834
## Sensitivity : 0.7073
## Specificity : 0.8994
## Precision : 0.7838
## F1-Score : 0.7436
##
## ==================================================
## SVM - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 14 6
## 1 6 33
##
## Evaluation Metrics:
## Accuracy : 0.7966
## Sensitivity : 0.7
## Specificity : 0.8462
## Precision : 0.7
## F1-Score : 0.7
1.0.6 Naive Bayes
# Training Naive Bayes dengan CV
nb_cv_model <- train(Y ~ .,
data = train_data_cv,
method = "naive_bayes",
trControl = ctrl_cv5,
tuneLength = 5,
metric = "ROC",set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## usekernel laplace adjust ROC Sens Spec ROCSD SensSD
## 1 FALSE 0 1 0.8800381 0.6345588 0.8556452 0.05872925 0.0932114
## 2 TRUE 0 1 0.8796000 0.5250000 0.9622984 0.07430989 0.1312608
## SpecSD
## 1 0.07142832
## 2 0.02608921
# Prediksi
nb_train_pred <- predict(nb_cv_model, train_data_cv)
nb_test_pred <- predict(nb_cv_model, test_data_cv)
# Konversi kembali ke 0/1
nb_train_pred_01 <- factor(ifelse(nb_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
nb_test_pred_01 <- factor(ifelse(nb_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi
results$nb$train <- evaluate_model(nb_train_pred_01, train_data$Y, "Naive Bayes", "Train Data (CV)")##
## ==================================================
## Naive Bayes - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 52 21
## 1 30 138
##
## Evaluation Metrics:
## Accuracy : 0.7884
## Sensitivity : 0.6341
## Specificity : 0.8679
## Precision : 0.7123
## F1-Score : 0.671
##
## ==================================================
## Naive Bayes - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 16 8
## 1 4 31
##
## Evaluation Metrics:
## Accuracy : 0.7966
## Sensitivity : 0.8
## Specificity : 0.7949
## Precision : 0.6667
## F1-Score : 0.7273
1.0.7 KNN
# Training KNN dengan CV
knn_cv_model <- train(Y ~ .,
data = train_data_cv,
method = "knn",
trControl = ctrl_cv5,
tuneGrid = expand.grid(k = c(3, 5, 7, 9, 11)),
metric = "ROC",set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## k ROC Sens Spec ROCSD SensSD SpecSD
## 1 3 0.8095814 0.5992647 0.8118952 0.03394265 0.1555456 0.08182914
## 2 5 0.8211812 0.6338235 0.8429435 0.03811643 0.1468887 0.03748455
## 3 7 0.8228393 0.5250000 0.8743952 0.04595390 0.1620861 0.07624927
## 4 9 0.8049340 0.5375000 0.8431452 0.05703219 0.1335829 0.08786252
## 5 11 0.8070149 0.5125000 0.8429435 0.05013775 0.1482397 0.11021470
##
## Best K value: 7
# Prediksi
knn_train_pred <- predict(knn_cv_model, train_data_cv)
knn_test_pred <- predict(knn_cv_model, test_data_cv)
# Konversi kembali ke 0/1
knn_train_pred_01 <- factor(ifelse(knn_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
knn_test_pred_01 <- factor(ifelse(knn_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi
results$knn$train <- evaluate_model(knn_train_pred_01, train_data$Y, "KNN", "Train Data (CV)")##
## ==================================================
## KNN - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 57 20
## 1 25 139
##
## Evaluation Metrics:
## Accuracy : 0.8133
## Sensitivity : 0.6951
## Specificity : 0.8742
## Precision : 0.7403
## F1-Score : 0.717
##
## ==================================================
## KNN - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 15 9
## 1 5 30
##
## Evaluation Metrics:
## Accuracy : 0.7627
## Sensitivity : 0.75
## Specificity : 0.7692
## Precision : 0.625
## F1-Score : 0.6818
1.0.8 Decision Tree
# Training Decision Tree dengan CV
dt_cv_model <- train(Y ~ .,
data = train_data_cv,
method = "rpart",
trControl = ctrl_cv5,
tuneLength = 10,
metric = "ROC",set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## cp ROC Sens Spec ROCSD SensSD SpecSD
## 1 0.00000000 0.7937404 0.6654412 0.7739919 0.11028534 0.2605111 0.09645971
## 2 0.03387534 0.7685699 0.7227941 0.8046371 0.10308187 0.1447335 0.04358874
## 3 0.06775068 0.7586089 0.6852941 0.7796371 0.09374303 0.1467045 0.06733162
## 4 0.10162602 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 5 0.13550136 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 6 0.16937669 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 7 0.20325203 0.7667398 0.7852941 0.7481855 0.11155099 0.1921391 0.05980126
## 8 0.23712737 0.6823648 0.5852941 0.7794355 0.12375579 0.3599507 0.12619422
## 9 0.27100271 0.6823648 0.5852941 0.7794355 0.12375579 0.3599507 0.12619422
## 10 0.30487805 0.5636148 0.2352941 0.8919355 0.08739931 0.3221897 0.14865859
# Plot decision tree
par(mfrow = c(1, 1))
rpart.plot(dt_cv_model$finalModel,
main = "Decision Tree (CV Optimized)",
type = 4,
extra = 104,
box.palette = "GnBu")rpart.plot(dt_cv_model$finalModel,
main = "Decision Tree Structure",
type = 0,
extra = 101,
box.palette = "RdYlGn", # Red-Yellow-Green palette
shadow.col = "gray",
branch.lty = 3,
fallen.leaves = TRUE,
cex = 0.8)# Plot 2: Decision tree dengan informasi probabilitas
rpart.plot(dt_cv_model$finalModel,
main = "Decision Tree with Class Probabilities",
type = 2, # Display split variable names
extra = 104, # Display probability per class and percentage
box.palette = "GnBu",
shadow.col = "gray",
branch = 0.3,
under = TRUE,
cex = 0.7)# Prediksi
dt_train_pred <- predict(dt_cv_model, train_data_cv)
dt_test_pred <- predict(dt_cv_model, test_data_cv)
# Konversi kembali ke 0/1
dt_train_pred_01 <- factor(ifelse(dt_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
dt_test_pred_01 <- factor(ifelse(dt_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi
results$dt$train <- evaluate_model(dt_train_pred_01, train_data$Y, "Decision Tree", "Train Data (CV)")##
## ==================================================
## Decision Tree - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 58 13
## 1 24 146
##
## Evaluation Metrics:
## Accuracy : 0.8465
## Sensitivity : 0.7073
## Specificity : 0.9182
## Precision : 0.8169
## F1-Score : 0.7582
##
## ==================================================
## Decision Tree - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 12 5
## 1 8 34
##
## Evaluation Metrics:
## Accuracy : 0.7797
## Sensitivity : 0.6
## Specificity : 0.8718
## Precision : 0.7059
## F1-Score : 0.6486
1.0.9 Random Forest
# Training Random Forest dengan CV
rf_cv_model <- train(Y ~ .,
data = train_data_cv,
method = "rf",
trControl = ctrl_cv5,
tuneLength = 3,
ntree = 500,
metric = "ROC",set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## mtry ROC Sens Spec ROCSD SensSD SpecSD
## 1 2 0.9443122 0.7801471 0.9179435 0.02359530 0.05656271 0.06166356
## 2 3 0.9380793 0.8051471 0.9054435 0.02137653 0.02310626 0.05902732
## 3 5 0.9219940 0.7566176 0.9118952 0.02140848 0.09630669 0.05600250
# Plot variable importance
par(mfrow = c(1, 1))
varImpPlot <- varImp(rf_cv_model, scale = FALSE)
plot(varImpPlot,
main = "Random Forest - Variable Importance (CV)",
top = 5)# Prediksi
rf_train_pred <- predict(rf_cv_model, train_data_cv)
rf_test_pred <- predict(rf_cv_model, test_data_cv)
# Konversi kembali ke 0/1
rf_train_pred_01 <- factor(ifelse(rf_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
rf_test_pred_01 <- factor(ifelse(rf_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi
results$rf$train <- evaluate_model(rf_train_pred_01, train_data$Y, "Random Forest", "Train Data (CV)")##
## ==================================================
## Random Forest - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 80 0
## 1 2 159
##
## Evaluation Metrics:
## Accuracy : 0.9917
## Sensitivity : 0.9756
## Specificity : 1
## Precision : 1
## F1-Score : 0.9877
##
## ==================================================
## Random Forest - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 17 3
## 1 3 36
##
## Evaluation Metrics:
## Accuracy : 0.8983
## Sensitivity : 0.85
## Specificity : 0.9231
## Precision : 0.85
## F1-Score : 0.85
1.0.10 Gradient Boosting
# Training Gradient Boosting dengan CV
gbm_cv_model <- train(Y ~ .,
data = train_data_cv,
method = "gbm",
trControl = ctrl_cv5,
verbose = FALSE,
tuneGrid = expand.grid(
n.trees = c(100, 200, 300),
interaction.depth = c(3, 5, 7),
shrinkage = c(0.01, 0.1, 0.3),
n.minobsinnode = 10
),
metric = "ROC",set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## shrinkage interaction.depth n.minobsinnode n.trees ROC Sens
## 1 0.01 3 10 100 0.8618470 0.5117647
## 10 0.10 3 10 100 0.9173335 0.6845588
## 19 0.30 3 10 100 0.9171705 0.7683824
## 4 0.01 5 10 100 0.8648615 0.4867647
## 13 0.10 5 10 100 0.9261571 0.7816176
## 22 0.30 5 10 100 0.9117158 0.7566176
## 7 0.01 7 10 100 0.8733486 0.5227941
## 16 0.10 7 10 100 0.9153596 0.7102941
## 25 0.30 7 10 100 0.9268627 0.7713235
## 2 0.01 3 10 200 0.8743885 0.5367647
## 11 0.10 3 10 200 0.9305866 0.7455882
## 20 0.30 3 10 200 0.9236302 0.7566176
## 5 0.01 5 10 200 0.8867981 0.5735294
## 14 0.10 5 10 200 0.9350073 0.7933824
## 23 0.30 5 10 200 0.9223049 0.7558824
## 8 0.01 7 10 200 0.8872131 0.5852941
## 17 0.10 7 10 200 0.9324093 0.7816176
## 26 0.30 7 10 200 0.9312485 0.7227941
## 3 0.01 3 10 300 0.8853623 0.5727941
## 12 0.10 3 10 300 0.9276647 0.7595588
## 21 0.30 3 10 300 0.9183082 0.7448529
## 6 0.01 5 10 300 0.8938197 0.6095588
## 15 0.10 5 10 300 0.9388572 0.7926471
## 24 0.30 5 10 300 0.9142715 0.7823529
## 9 0.01 7 10 300 0.8961760 0.6227941
## 18 0.10 7 10 300 0.9351696 0.7698529
## 27 0.30 7 10 300 0.9248703 0.7580882
## Spec ROCSD SensSD SpecSD
## 1 0.8806452 0.03405207 0.13335507 0.09459599
## 10 0.9058468 0.03148248 0.13095153 0.02139140
## 19 0.9181452 0.04631778 0.09765165 0.01771155
## 4 0.8993952 0.03908242 0.14711855 0.03412116
## 13 0.9247984 0.03494813 0.15790858 0.05208192
## 22 0.9245968 0.02960686 0.11285028 0.04726845
## 7 0.9245968 0.03678181 0.18399624 0.04726845
## 16 0.9308468 0.03273220 0.11430809 0.03415242
## 25 0.9308468 0.03299204 0.13115780 0.03415242
## 2 0.8679435 0.03798668 0.14203267 0.02596622
## 11 0.9372984 0.02841765 0.07116606 0.03100024
## 20 0.9241935 0.03401371 0.05844316 0.04847837
## 5 0.8806452 0.04172968 0.14840833 0.02544839
## 14 0.9245968 0.03170012 0.08721066 0.02773918
## 23 0.9243952 0.04607032 0.07696917 0.02848563
## 8 0.8741935 0.03754756 0.16834462 0.02217055
## 17 0.9435484 0.03563974 0.11225584 0.02590745
## 26 0.9306452 0.04161968 0.12283484 0.02689285
## 3 0.8679435 0.04221019 0.14706801 0.02596622
## 12 0.9372984 0.03541226 0.14078050 0.03100024
## 21 0.9308468 0.03956110 0.10387575 0.04629201
## 6 0.8679435 0.04026280 0.10732524 0.02596622
## 15 0.9183468 0.02993190 0.10707938 0.04720175
## 24 0.9118952 0.04953136 0.08517201 0.04088320
## 9 0.8741935 0.03797336 0.17125409 0.02217055
## 18 0.9435484 0.03275198 0.10163962 0.02590745
## 27 0.9243952 0.04648541 0.09717990 0.03605152
# Prediksi
gbm_train_pred <- predict(gbm_cv_model, train_data_cv)
gbm_test_pred <- predict(gbm_cv_model, test_data_cv)
# Konversi kembali ke 0/1
gbm_train_pred_01 <- factor(ifelse(gbm_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
gbm_test_pred_01 <- factor(ifelse(gbm_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi
results$gbm$train <- evaluate_model(gbm_train_pred_01, train_data$Y, "Gradient Boosting", "Train Data (CV)")##
## ==================================================
## Gradient Boosting - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 81 0
## 1 1 159
##
## Evaluation Metrics:
## Accuracy : 0.9959
## Sensitivity : 0.9878
## Specificity : 1
## Precision : 1
## F1-Score : 0.9939
##
## ==================================================
## Gradient Boosting - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 17 3
## 1 3 36
##
## Evaluation Metrics:
## Accuracy : 0.8983
## Sensitivity : 0.85
## Specificity : 0.9231
## Precision : 0.85
## F1-Score : 0.85
1.0.11 Artificial Neural Network
# Normalisasi data untuk ANN
normalize <- function(x) {
return((x - min(x)) / (max(x) - min(x)))
}
train_norm_cv <- as.data.frame(lapply(train_data_cv[, -ncol(train_data_cv)], normalize))
test_norm_cv <- as.data.frame(lapply(test_data_cv[, -ncol(test_data_cv)], normalize))
train_norm_cv$Y <- train_data_cv$Y
test_norm_cv$Y <- test_data_cv$Y
# Training Neural Network dengan CV
ann_cv_model <- train(Y ~ .,
data = train_norm_cv,
method = "nnet",
trControl = ctrl_cv5,
tuneGrid = expand.grid(
size = c(5, 10, 15),
decay = c(0.001, 0.01, 0.1)
),
maxit = 200,
trace = FALSE,
metric = "ROC",set.seed(2024))
# Tampilkan hasil CV
cat("\nCross-Validation Results:\n")##
## Cross-Validation Results:
## size decay ROC Sens Spec ROCSD SensSD SpecSD
## 1 5 0.001 0.8951861 0.7808824 0.8683468 0.02674271 0.08065644 0.07077876
## 2 5 0.010 0.8819264 0.7220588 0.8497984 0.06482386 0.17411106 0.08051905
## 3 5 0.100 0.9021395 0.6602941 0.8491935 0.04501207 0.20117720 0.09191813
## 4 10 0.001 0.8667954 0.7933824 0.8685484 0.02826682 0.05136544 0.09452076
## 5 10 0.010 0.8978501 0.7580882 0.8685484 0.03534105 0.12370654 0.06737688
## 6 10 0.100 0.8986580 0.6602941 0.8491935 0.04605225 0.20117720 0.09191813
## 7 15 0.001 0.8939542 0.7948529 0.9062500 0.05036092 0.10455019 0.06250000
## 8 15 0.010 0.8837757 0.7338235 0.8560484 0.04038894 0.15365712 0.06436395
## 9 15 0.100 0.8997839 0.6602941 0.8491935 0.04601113 0.20117720 0.09191813
# Prediksi
ann_train_pred <- predict(ann_cv_model, train_norm_cv)
ann_test_pred <- predict(ann_cv_model, test_norm_cv)
# Konversi kembali ke 0/1
ann_train_pred_01 <- factor(ifelse(ann_train_pred == "Yes", "1", "0"),
levels = c("0", "1"))
ann_test_pred_01 <- factor(ifelse(ann_test_pred == "Yes", "1", "0"),
levels = c("0", "1"))
# Evaluasi
results$ann$train <- evaluate_model(ann_train_pred_01, train_data$Y, "Neural Network", "Train Data (CV)")##
## ==================================================
## Neural Network - Train Data (CV)
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 53 21
## 1 29 138
##
## Evaluation Metrics:
## Accuracy : 0.7925
## Sensitivity : 0.6463
## Specificity : 0.8679
## Precision : 0.7162
## F1-Score : 0.6795
##
## ==================================================
## Neural Network - Test Data
## ==================================================
## Confusion Matrix:
## Reference
## Prediction 0 1
## 0 16 9
## 1 4 30
##
## Evaluation Metrics:
## Accuracy : 0.7797
## Sensitivity : 0.8
## Specificity : 0.7692
## Precision : 0.64
## F1-Score : 0.7111
# Probabilitas untuk ROC
ann_test_prob <- predict(ann_cv_model, test_norm_cv, type = "prob")[, "Yes"]
cv_results$ann <- ann_cv_modellibrary(NeuralNetTools)
plotnet(ann_cv_model,
alpha = 0.6,
circle_col = list('lightblue', 'lightgreen', 'lightpink'),
bord_col = 'black',
max_sp = TRUE,
rel_rsc = c(2, 8),
x_names = names(train_norm_cv)[-ncol(train_norm_cv)],
y_names = c("No", "Yes"),
main = "Neural Network Architecture")1.0.12 Perbandingan Model
##
## ============================================================
##
## ROC CURVES COMPARISON (ALL MODELS WITH CV)
##
## ============================================================
# Siapkan plot ROC
par(mfrow = c(1, 1), mar = c(5, 5, 4, 2) + 0.1)
# Warna yang lebih kontras dan berbeda
colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3",
"#FF7F00", "#FFFF33", "#A65628", "#F781BF")
# Buat plot kosong dulu
plot(NULL, xlim = c(0, 1), ylim = c(0, 1),
xlab = "False Positive Rate",
ylab = "True Positive Rate",
main = "ROC Curves - All Models (5-Fold CV)",
cex.main = 1.5,
cex.lab = 1.2)
grid(col = "gray90", lty = 2)
abline(a = 0, b = 1, col = "gray50", lty = 2, lwd = 2)
# List untuk menyimpan AUC
auc_values <- list()
# Plot ROC untuk setiap model
model_names <- c("Logistic Regression", "SVM", "Naive Bayes", "KNN",
"Decision Tree", "Random Forest", "Gradient Boosting", "Neural Network")
probs_list <- list(logit_test_prob, svm_test_prob, nb_test_prob, knn_test_prob,
dt_test_prob, rf_test_prob, gbm_test_prob, ann_test_prob)
# Pastikan actual values sama untuk semua
actual_numeric <- as.numeric(test_data$Y) - 1
for(i in 1:length(model_names)) {
# Hitung ROC
roc_obj <- roc(actual_numeric, probs_list[[i]])
auc_val <- auc(roc_obj)
auc_values[[model_names[i]]] <- auc_val
# Plot ROC curve
lines(roc_obj$specificities, roc_obj$sensitivities,
col = colors[i], lwd = 3)
# Tambahkan titik di optimal cut-off (Youden Index)
youden_index <- roc_obj$sensitivities + roc_obj$specificities - 1
optimal_idx <- which.max(youden_index)
points(1 - roc_obj$specificities[optimal_idx],
roc_obj$sensitivities[optimal_idx],
col = colors[i], pch = 19, cex = 1.5)
}
# Tambahkan legend dengan warna kontras
legend("bottomright",
legend = paste(model_names, " (AUC=",
round(unlist(auc_values), 4), ")", sep = ""),
col = colors,
lwd = 3,
cex = 0.8,
bg = "white",
box.col = "black",
text.col = "black")##
## ============================================================
##
## ACCURACY COMPARISON TABLE (5-FOLD CV)
##
## ============================================================
# Buat dataframe untuk perbandingan
comparison_df <- data.frame(
Model = model_names,
Train_Accuracy = c(
results$logit$train$accuracy,
results$svm$train$accuracy,
results$nb$train$accuracy,
results$knn$train$accuracy,
results$dt$train$accuracy,
results$rf$train$accuracy,
results$gbm$train$accuracy,
results$ann$train$accuracy
),
Test_Accuracy = c(
results$logit$test$accuracy,
results$svm$test$accuracy,
results$nb$test$accuracy,
results$knn$test$accuracy,
results$dt$test$accuracy,
results$rf$test$accuracy,
results$gbm$test$accuracy,
results$ann$test$accuracy
),
AUC = unlist(auc_values)
)
# Tambahkan kolom overfitting (selisih akurasi train dan test)
comparison_df$Overfitting <- comparison_df$Train_Accuracy - comparison_df$Test_Accuracy
# Tambahkan kolom F1-Score
comparison_df$F1_Score <- c(
results$logit$test$f1_score,
results$svm$test$f1_score,
results$nb$test$f1_score,
results$knn$test$f1_score,
results$dt$test$f1_score,
results$rf$test$f1_score,
results$gbm$test$f1_score,
results$ann$test$f1_score
)
# Urutkan berdasarkan AUC tertinggi
comparison_df <- comparison_df[order(-comparison_df$AUC), ]
# Format output untuk lebih rapi
print_df <- comparison_df
print_df[, 2:6] <- round(print_df[, 2:6], 4)
print(print_df)## Model Train_Accuracy Test_Accuracy AUC
## Random Forest Random Forest 0.9917 0.8983 0.9679
## Gradient Boosting Gradient Boosting 0.9959 0.8983 0.9410
## Naive Bayes Naive Bayes 0.7884 0.7966 0.8897
## Neural Network Neural Network 0.7925 0.7797 0.8885
## SVM SVM 0.8340 0.7966 0.8833
## Logistic Regression Logistic Regression 0.7759 0.7797 0.8705
## Decision Tree Decision Tree 0.8465 0.7797 0.8596
## KNN KNN 0.8133 0.7627 0.8532
## Overfitting F1_Score
## Random Forest 0.0934 0.8500
## Gradient Boosting 0.0975 0.8500
## Naive Bayes -0.0082 0.7273
## Neural Network 0.0129 0.7111
## SVM 0.0374 0.7000
## Logistic Regression -0.0037 0.7111
## Decision Tree 0.0668 0.6486
## KNN 0.0506 0.6818
barplot(height = t(as.matrix(comparison_df[, c("Train_Accuracy", "Test_Accuracy")])),
beside = TRUE,
names.arg = comparison_df$Model,
col = c("#66C2A5", "#FC8D62"),
las = 2,
ylim = c(0, 1),
main = "Train vs Test Accuracy (5-Fold CV)",
ylab = "Accuracy",
cex.names = 0.8,
border = NA)
legend("topright",
legend = c("Train (CV)", "Test"),
fill = c("#66C2A5", "#FC8D62"),
bg = "white",
box.col = "black")
grid(nx = NA, ny = NULL, col = "gray90")barplot(comparison_df$AUC,
names.arg = comparison_df$Model,
col = "#8DA0CB",
las = 2,
ylim = c(0, 1),
main = "AUC Values Comparison (5-Fold CV)",
ylab = "AUC",
cex.names = 0.8,
border = NA)
abline(h = 0.5, col = "red", lty = 2, lwd = 2)
text(x = 1:length(comparison_df$Model),
y = comparison_df$AUC,
labels = round(comparison_df$AUC, 3),
pos = 3, cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")barplot(comparison_df$F1_Score,
names.arg = comparison_df$Model,
col = "#E78AC3",
las = 2,
ylim = c(0, 1),
main = "F1-Score Comparison (5-Fold CV)",
ylab = "F1-Score",
cex.names = 0.8,
border = NA)
text(x = 1:length(comparison_df$Model),
y = comparison_df$F1_Score,
labels = round(comparison_df$F1_Score, 3),
pos = 3, cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")barplot(comparison_df$Overfitting,
names.arg = comparison_df$Model,
col = ifelse(comparison_df$Overfitting > 0.1, "#FB8072", "#B3DE69"),
las = 2,
ylim = c(min(comparison_df$Overfitting) - 0.05,
max(comparison_df$Overfitting) + 0.05),
main = "Overfitting (5-Fold CV)",
ylab = "Train Acc - Test Acc",
cex.names = 0.8,
border = NA)
abline(h = 0, col = "black", lwd = 1)
abline(h = 0.05, col = "orange", lty = 2, lwd = 1)
abline(h = 0.1, col = "red", lty = 2, lwd = 1)
text(x = 1:length(comparison_df$Model),
y = comparison_df$Overfitting,
labels = round(comparison_df$Overfitting, 3),
pos = ifelse(comparison_df$Overfitting >= 0, 3, 1),
cex = 0.8, col = "darkblue")
grid(nx = NA, ny = NULL, col = "gray90")cv_summary <- data.frame(
Model = model_names,
CV_Accuracy = sapply(cv_results, function(x) max(x$results$Accuracy, na.rm = TRUE)),
CV_ROC = sapply(cv_results, function(x) max(x$results$ROC, na.rm = TRUE)),
Best_Params = sapply(cv_results, function(x) {
params <- x$bestTune
paste(names(params), "=", params, collapse = ", ")
})
)2 Permodelan Regresi
## Rows: 1,319
## Columns: 12
## $ card <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, no,…
## $ reports <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 3…
## $ age <dbl> 37.66667, 33.25000, 33.66667, 30.50000, 32.16667, 23.25000…
## $ income <dbl> 4.5200, 2.4200, 4.5000, 2.5400, 9.7867, 2.5000, 3.9600, 2.…
## $ share <dbl> 0.0332699100, 0.0052169420, 0.0041555560, 0.0652137800, 0.…
## $ expenditure <dbl> 124.983300, 9.854167, 15.000000, 137.869200, 546.503300, 9…
## $ owner <fct> yes, no, yes, no, yes, no, no, yes, yes, no, yes, yes, yes…
## $ selfemp <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no, no…
## $ dependents <dbl> 3, 3, 4, 0, 2, 0, 2, 0, 0, 0, 1, 2, 1, 0, 2, 1, 2, 2, 0, 0…
## $ months <dbl> 54, 34, 58, 25, 64, 54, 7, 77, 97, 65, 24, 36, 42, 26, 120…
## $ majorcards <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1…
## $ active <dbl> 12, 13, 5, 7, 5, 1, 5, 3, 6, 18, 20, 0, 12, 3, 5, 22, 0, 8…
# Load library yang dibutuhkan
library(tidyverse)
library(caret)
library(Metrics)
library(ggplot2)
library(splines)
library(pROC)
library(e1071)
library(randomForest)
library(gbm)
library(rpart)
library(rpart.plot)
library(nnet)
library(kknn)
library(glmnet)
library(patchwork)
library(NeuralNetTools)
cat("DATA PREPARATION\n")## DATA PREPARATION
## ============================================================
## Data structure:
## 'data.frame': 1319 obs. of 12 variables:
## $ card : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ reports : num 0 0 0 0 0 0 0 0 0 0 ...
## $ age : num 37.7 33.2 33.7 30.5 32.2 ...
## $ income : num 4.52 2.42 4.5 2.54 9.79 ...
## $ share : num 0.03327 0.00522 0.00416 0.06521 0.06705 ...
## $ expenditure: num 124.98 9.85 15 137.87 546.5 ...
## $ owner : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 1 2 2 1 ...
## $ selfemp : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ dependents : num 3 3 4 0 2 0 2 0 0 0 ...
## $ months : num 54 34 58 25 64 54 7 77 97 65 ...
## $ majorcards : num 1 1 1 1 1 1 1 1 1 1 ...
## $ active : num 12 13 5 7 5 1 5 3 6 18 ...
## NULL
# Cek apakah 'expenditure' ada dalam data
if(!"expenditure" %in% names(data)) {
stop("Kolom 'expenditure' tidak ditemukan dalam data.")
}
# Pastikan expenditure adalah numeric
if(is.factor(data$expenditure)) {
data$expenditure <- as.numeric(as.character(data$expenditure))
cat("\nExpenditure di-convert dari factor ke numeric\n")
}
cat("\n", strrep("=", 60))##
## ============================================================
##
## SPLIT DATA (80% TRAIN - 20% TEST)
##
## ============================================================
set.seed(2024)
train_index <- createDataPartition(data$expenditure, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
cat("\nData split results:\n")##
## Data split results:
## Total data : 1319 observations
cat("Train data :", nrow(train_data), "observations (", round(nrow(train_data)/nrow(data)*100, 1), "%)\n")## Train data : 1057 observations ( 80.1 %)
cat("Test data :", nrow(test_data), "observations (", round(nrow(test_data)/nrow(data)*100, 1), "%)\n")## Test data : 262 observations ( 19.9 %)
##
## Expenditure statistics:
summary_stats <- data.frame(
Dataset = c("Train", "Test", "Full"),
N = c(nrow(train_data), nrow(test_data), nrow(data)),
Mean = c(mean(train_data$expenditure), mean(test_data$expenditure), mean(data$expenditure)),
SD = c(sd(train_data$expenditure), sd(test_data$expenditure), sd(data$expenditure)),
Min = c(min(train_data$expenditure), min(test_data$expenditure), min(data$expenditure)),
Max = c(max(train_data$expenditure), max(test_data$expenditure), max(data$expenditure))
)
print(summary_stats)## Dataset N Mean SD Min Max
## 1 Train 1057 183.6665 269.3513 0 3099.505
## 2 Test 262 190.6672 283.9563 0 2001.547
## 3 Full 1319 185.0571 272.2189 0 3099.505
ctrl_cv5 <- trainControl(method = "cv",
number = 5,
savePredictions = TRUE,
verboseIter = FALSE)
cat("\nCross-Validation setup: 5-fold CV\n")##
## Cross-Validation setup: 5-fold CV
evaluate_regression <- function(predictions, actual, model_name, data_type, plot_residuals = TRUE) {
# Hitung metrik
rmse_val <- rmse(actual, predictions)
mae_val <- mae(actual, predictions)
mse_val <- mse(actual, predictions)
r2_val <- cor(actual, predictions)^2
# Print hasil
cat("\n", strrep("=", 50), "\n")
cat(model_name, "-", data_type, "\n")
cat(strrep("=", 50), "\n")
cat("RMSE:", round(rmse_val, 4), "\n")
cat("MAE :", round(mae_val, 4), "\n")
cat("MSE :", round(mse_val, 4), "\n")
cat("R² :", round(r2_val, 4), "\n")
# Plot jika diminta
if(plot_residuals) {
plot_regression_results(predictions, actual, model_name, data_type)
}
return(list(
rmse = rmse_val,
mae = mae_val,
mse = mse_val,
r2 = r2_val,
predictions = predictions
))
}
plot_regression_results <- function(predicted, actual, model_name, data_type) {
# Setup layout
par(mfrow = c(2, 2), mar = c(4, 4, 3, 2))
plot(actual, predicted,
main = paste("Predicted vs Actual -", model_name),
xlab = "Actual Expenditure",
ylab = "Predicted Expenditure",
pch = 19, col = "blue", cex = 0.7)
abline(0, 1, col = "red", lwd = 2)
grid()
residuals <- actual - predicted
fitted <- predicted
plot(fitted, residuals,
main = "Residuals vs Fitted",
xlab = "Fitted Values",
ylab = "Residuals",
pch = 19, col = "darkgreen", cex = 0.7)
abline(h = 0, col = "red", lwd = 2)
grid()
hist(residuals,
main = "Distribution of Residuals",
xlab = "Residuals",
col = "lightblue",
border = "black",
breaks = 30)
abline(v = 0, col = "red", lwd = 2)
qqnorm(residuals, main = "Q-Q Plot of Residuals")
qqline(residuals, col = "red", lwd = 2)
# Reset layout
par(mfrow = c(1, 1))
}
results <- list()
cv_results <- list()2.0.1 Regresi Polynomial
predictors <- names(train_data)[names(train_data) != "expenditure"]
if(length(predictors) == 0) {
stop("Tidak ada predictor dalam data.")
}
# Pilih hanya numeric predictors untuk polynomial regression
numeric_predictors <- predictors[sapply(train_data[predictors], is.numeric)]
if(length(numeric_predictors) == 0) {
cat("Tidak ada predictor numeric untuk polynomial regression.\n")
best_predictor <- predictors[1]
} else {
# Hitung korelasi dengan expenditure
correlations <- sapply(numeric_predictors, function(x) {
cor(train_data[[x]], train_data$expenditure, use = "complete.obs")
})
best_predictor <- names(which.max(abs(correlations)))
cat("Menggunakan predictor numeric dengan korelasi tertinggi:", best_predictor,
"(korelasi =", round(correlations[best_predictor], 3), ")\n")
}## Menggunakan predictor numeric dengan korelasi tertinggi: share (korelasi = 0.835 )
# Training polynomial dengan berbagai degree
polynomial_results <- data.frame()
best_poly_degree <- 2
best_poly_rmse <- Inf
for(degree in 2:5) {
cat("\nPolynomial Degree:", degree, "\n")
# Buat formula
formula_poly <- as.formula(paste("expenditure ~ poly(", best_predictor, ", ", degree, ", raw=TRUE)"))
tryCatch({
# Training dengan CV
poly_model <- train(formula_poly,
data = train_data,
method = "lm",
trControl = ctrl_cv5)
# Prediksi
poly_train_pred <- predict(poly_model, train_data)
poly_test_pred <- predict(poly_model, test_data)
# Evaluasi
train_eval <- evaluate_regression(poly_train_pred, train_data$expenditure,
paste("Poly Degree", degree), "Train Data", plot_residuals = FALSE)
test_eval <- evaluate_regression(poly_test_pred, test_data$expenditure,
paste("Poly Degree", degree), "Test Data", plot_residuals = FALSE)
# Simpan hasil
poly_result <- data.frame(
Degree = degree,
Train_RMSE = train_eval$rmse,
Test_RMSE = test_eval$rmse,
Overfitting = train_eval$rmse - test_eval$rmse,
R2 = test_eval$r2
)
polynomial_results <- rbind(polynomial_results, poly_result)
# Update best model
if(test_eval$rmse < best_poly_rmse) {
best_poly_rmse <- test_eval$rmse
best_poly_degree <- degree
best_poly_model <- poly_model
}
}, error = function(e) {
cat("Error untuk degree", degree, ":", e$message, "\n")
})
}##
## Polynomial Degree: 2
##
## ==================================================
## Poly Degree 2 - Train Data
## ==================================================
## RMSE: 145.9464
## MAE : 64.5504
## MSE : 21300.35
## R² : 0.7061
##
## ==================================================
## Poly Degree 2 - Test Data
## ==================================================
## RMSE: 146.3213
## MAE : 67.0184
## MSE : 21409.92
## R² : 0.7447
##
## Polynomial Degree: 3
##
## ==================================================
## Poly Degree 3 - Train Data
## ==================================================
## RMSE: 145.9035
## MAE : 65.0803
## MSE : 21287.83
## R² : 0.7063
##
## ==================================================
## Poly Degree 3 - Test Data
## ==================================================
## RMSE: 146.7144
## MAE : 67.5132
## MSE : 21525.11
## R² : 0.7442
##
## Polynomial Degree: 4
##
## ==================================================
## Poly Degree 4 - Train Data
## ==================================================
## RMSE: 145.8176
## MAE : 64.1362
## MSE : 21262.78
## R² : 0.7066
##
## ==================================================
## Poly Degree 4 - Test Data
## ==================================================
## RMSE: 146.4865
## MAE : 67.0277
## MSE : 21458.28
## R² : 0.7446
##
## Polynomial Degree: 5
##
## ==================================================
## Poly Degree 5 - Train Data
## ==================================================
## RMSE: 145.525
## MAE : 64.0101
## MSE : 21177.53
## R² : 0.7078
##
## ==================================================
## Poly Degree 5 - Test Data
## ==================================================
## RMSE: 148.2246
## MAE : 67.3913
## MSE : 21970.53
## R² : 0.7393
# Tampilkan hasil semua degree
if(nrow(polynomial_results) > 0) {
cat("\nPolynomial Regression Results:\n")
print(polynomial_results)
# Pilih model terbaik
cat("\nBest Polynomial Model: Degree", best_poly_degree, "with Test RMSE =",
round(best_poly_rmse, 4), "\n")
# Plot perbandingan
if(require(ggplot2)) {
p <- ggplot(polynomial_results, aes(x = Degree)) +
geom_line(aes(y = Train_RMSE, color = "Train"), size = 1.5) +
geom_line(aes(y = Test_RMSE, color = "Test"), size = 1.5) +
geom_point(aes(y = Train_RMSE), size = 3) +
geom_point(aes(y = Test_RMSE), size = 3) +
labs(title = "Polynomial Regression: RMSE vs Degree",
x = "Polynomial Degree",
y = "RMSE",
color = "Dataset") +
theme_minimal() +
scale_x_continuous(breaks = 2:5)
print(p)
}
# Plot model terbaik
best_poly_test_pred <- predict(best_poly_model, test_data)
results$poly <- evaluate_regression(best_poly_test_pred, test_data$expenditure,
paste("Best Poly (Degree", best_poly_degree, ")"),
"Test Data")
} else {
cat("\nTidak ada model polynomial yang berhasil di-train.\n")
}##
## Polynomial Regression Results:
## Degree Train_RMSE Test_RMSE Overfitting R2
## 1 2 145.9464 146.3213 -0.3748947 0.7447015
## 2 3 145.9035 146.7144 -0.8109065 0.7441506
## 3 4 145.8176 146.4865 -0.6688267 0.7445540
## 4 5 145.5250 148.2246 -2.6995783 0.7393477
##
## Best Polynomial Model: Degree 2 with Test RMSE = 146.3213
##
## ==================================================
## Best Poly (Degree 2 ) - Test Data
## ==================================================
## RMSE: 146.3213
## MAE : 67.0184
## MSE : 21409.92
## R² : 0.7447
2.0.2 Piecewise Constant Via Decision Tree
tryCatch({
# Gunakan decision tree dengan depth 1 untuk piecewise constant
pc_model <- train(expenditure ~ .,
data = train_data,
method = "rpart",
trControl = ctrl_cv5,
tuneGrid = data.frame(cp = 0.01),
control = rpart.control(maxdepth = 1))
# Plot tree
rpart.plot(pc_model$finalModel,
main = "Piecewise Constant (Regression Stump)",
type = 2,
extra = 101,
box.palette = "RdYlGn")
# Prediksi
pc_train_pred <- predict(pc_model, train_data)
pc_test_pred <- predict(pc_model, test_data)
# Evaluasi
results$piecewise <- evaluate_regression(pc_test_pred, test_data$expenditure,
"Piecewise Constant", "Test Data")
cv_results$piecewise <- pc_model
}, error = function(e) {
cat("Error dalam Piecewise Constant Regression:", e$message, "\n")
})##
## ==================================================
## Piecewise Constant - Test Data
## ==================================================
## RMSE: 217.2666
## MAE : 126.2558
## MSE : 47204.78
## R² : 0.4126
2.0.3 Cubic Spline
tryCatch({
# Tentukan knots untuk cubic spline
n_knots <- min(3, floor(nrow(train_data) / 20))
if(n_knots < 1) n_knots <- 1
knots <- quantile(train_data[[best_predictor]],
probs = seq(0, 1, length.out = n_knots + 2)[-c(1, n_knots + 2)])
# Buat basis spline
cubic_spline_train <- bs(train_data[[best_predictor]],
knots = knots,
degree = 3,
intercept = TRUE)
cubic_spline_test <- bs(test_data[[best_predictor]],
knots = knots,
degree = 3,
intercept = TRUE)
# Gabungkan dengan predictors lain
other_predictors <- setdiff(predictors, best_predictor)
if(length(other_predictors) > 0) {
train_spline <- cbind(as.data.frame(cubic_spline_train),
train_data[, other_predictors, drop = FALSE])
test_spline <- cbind(as.data.frame(cubic_spline_test),
test_data[, other_predictors, drop = FALSE])
} else {
train_spline <- as.data.frame(cubic_spline_train)
test_spline <- as.data.frame(cubic_spline_test)
}
train_spline$expenditure <- train_data$expenditure
test_spline$expenditure <- test_data$expenditure
# Training model
cs_model <- train(expenditure ~ .,
data = train_spline,
method = "lm",
trControl = ctrl_cv5)
# Prediksi
cs_train_pred <- predict(cs_model, train_spline)
cs_test_pred <- predict(cs_model, test_spline)
# Evaluasi
results$cubic_spline <- evaluate_regression(cs_test_pred, test_data$expenditure,
"Cubic Spline", "Test Data")
# Plot spline
if(require(ggplot2)) {
plot_data <- data.frame(
x = seq(min(train_data[[best_predictor]]), max(train_data[[best_predictor]]), length.out = 100),
y = predict(cs_model,
newdata = data.frame(V1 = bs(seq(min(train_data[[best_predictor]]),
max(train_data[[best_predictor]]),
length.out = 100),
knots = knots, degree = 3, intercept = TRUE)))
)
p <- ggplot() +
geom_point(data = train_data, aes(x = .data[[best_predictor]], y = expenditure),
color = "blue", alpha = 0.5) +
geom_line(data = plot_data, aes(x = x, y = y),
color = "red", size = 1.5) +
geom_vline(xintercept = knots, linetype = "dashed", color = "gray") +
labs(title = "Cubic Spline Regression",
x = best_predictor,
y = "Expenditure") +
theme_minimal()
print(p)
}
}, error = function(e) {
cat("Error dalam Cubic Spline Regression:", e$message, "\n")
})##
## ==================================================
## Cubic Spline - Test Data
## ==================================================
## RMSE: 113.0013
## MAE : 61.8996
## MSE : 12769.3
## R² : 0.8422
## Error dalam Cubic Spline Regression: object '1' not found
2.0.4 KNN Regression
tryCatch({
# Training KNN dengan CV
knn_reg_model <- train(expenditure ~ .,
data = train_data,
method = "knn",
trControl = ctrl_cv5,
tuneGrid = expand.grid(k = seq(3, min(21, nrow(train_data)-1), by = 2)),
preProcess = c("center", "scale"))
# Tampilkan tuning results
cat("\nKNN Tuning Results:\n")
print(knn_reg_model$results)
cat("\nBest K:", knn_reg_model$bestTune$k, "\n")
# Plot tuning results
plot(knn_reg_model, main = "KNN Regression: RMSE vs K")
# Prediksi
knn_reg_train_pred <- predict(knn_reg_model, train_data)
knn_reg_test_pred <- predict(knn_reg_model, test_data)
# Evaluasi
results$knn_reg <- evaluate_regression(knn_reg_test_pred, test_data$expenditure,
"KNN Regression", "Test Data")
cv_results$knn_reg <- knn_reg_model
}, error = function(e) {
cat("Error dalam KNN Regression:", e$message, "\n")
})##
## KNN Tuning Results:
## k RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 3 150.7975 0.7166484 74.83276 39.19433 0.04195406 6.243915
## 2 5 144.2537 0.7608913 72.08577 47.65963 0.06896944 8.682050
## 3 7 141.2630 0.7859317 71.31523 46.64292 0.05344641 9.383293
## 4 9 143.3205 0.7857083 71.85967 49.14845 0.06454640 9.974117
## 5 11 146.8377 0.7802253 73.50562 47.92362 0.05980393 9.622740
## 6 13 148.4843 0.7840720 74.80977 48.13834 0.06233632 9.782057
## 7 15 152.5194 0.7726519 76.83856 47.97431 0.05828648 9.341220
## 8 17 155.8848 0.7626555 78.44253 48.03739 0.05914034 9.952061
## 9 19 158.4333 0.7561882 79.63346 47.67147 0.05703458 9.879802
## 10 21 159.8502 0.7577948 80.70819 47.52791 0.05391118 9.890727
##
## Best K: 7
##
## ==================================================
## KNN Regression - Test Data
## ==================================================
## RMSE: 168.7605
## MAE : 77.3134
## MSE : 28480.11
## R² : 0.7396
2.0.5 Regression Tree
tryCatch({
# Training regression tree dengan CV
tree_model <- train(expenditure ~ .,
data = train_data,
method = "rpart",
trControl = ctrl_cv5,
tuneLength = 5)
# Tampilkan tuning results
cat("\nRegression Tree Tuning Results:\n")
print(tree_model$results)
# Plot tree
rpart.plot(tree_model$finalModel,
main = "Regression Tree (CV Optimized)",
type = 4,
extra = 101,
box.palette = "RdYlGn",
fallen.leaves = TRUE)
# Prediksi
tree_train_pred <- predict(tree_model, train_data)
tree_test_pred <- predict(tree_model, test_data)
# Evaluasi
results$tree <- evaluate_regression(tree_test_pred, test_data$expenditure,
"Regression Tree", "Test Data")
cv_results$tree <- tree_model
}, error = function(e) {
cat("Error dalam Regression Tree:", e$message, "\n")
})##
## Regression Tree Tuning Results:
## cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.04636867 150.3884 0.6750795 78.21845 41.46525 0.15450680 10.58495
## 2 0.06183555 176.3621 0.5677563 100.19437 44.24339 0.11221869 11.12011
## 3 0.09300498 177.6830 0.5621279 104.65656 46.95235 0.11943023 15.15254
## 4 0.17183649 185.8391 0.5143367 116.44504 56.01800 0.20629713 17.18961
## 5 0.44851012 249.6979 0.2926773 158.68752 36.34515 0.05997516 22.99496
##
## ==================================================
## Regression Tree - Test Data
## ==================================================
## RMSE: 177.0062
## MAE : 90.3364
## MSE : 31331.18
## R² : 0.6141
2.0.6 Random Forest
tryCatch({
# Training Random Forest dengan CV
rf_reg_model <- train(expenditure ~ .,
data = train_data,
method = "rf",
trControl = ctrl_cv5,
tuneLength = 2,
ntree = 100,
importance = TRUE)
# Tampilkan tuning results
cat("\nRandom Forest Tuning Results:\n")
print(rf_reg_model$results)
# Plot variable importance
varImpPlot <- varImp(rf_reg_model, scale = FALSE)
plot(varImpPlot,
main = "Random Forest - Variable Importance",
top = min(10, ncol(train_data) - 1))
# Plot OOB error
if("finalModel" %in% names(rf_reg_model)) {
oob_error <- rf_reg_model$finalModel$mse
plot(1:length(oob_error), oob_error,
type = "l", col = "red", lwd = 2,
main = "Random Forest OOB Error",
xlab = "Number of Trees",
ylab = "OOB MSE")
grid()
}
# Prediksi
rf_reg_train_pred <- predict(rf_reg_model, train_data)
rf_reg_test_pred <- predict(rf_reg_model, test_data)
# Evaluasi
results$rf_reg <- evaluate_regression(rf_reg_test_pred, test_data$expenditure,
"Random Forest", "Test Data")
cv_results$rf_reg <- rf_reg_model
}, error = function(e) {
cat("Error dalam Random Forest Regression:", e$message, "\n")
})##
## Random Forest Tuning Results:
## mtry RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 2 138.55246 0.8059870 60.76234 38.21001 0.07113424 5.056425
## 2 11 65.55562 0.9438179 15.36185 39.38930 0.04863151 4.665513
##
## ==================================================
## Random Forest - Test Data
## ==================================================
## RMSE: 79.0744
## MAE : 16.9195
## MSE : 6252.764
## R² : 0.9445
2.0.7 Neural Network
tryCatch({
# Normalisasi data
normalize <- function(x) {
if(is.numeric(x)) {
if(sd(x, na.rm = TRUE) == 0) return(x)
return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)))
}
return(x)
}
# Pisahkan factor dan numeric variables
factor_vars <- names(train_data)[sapply(train_data, is.factor)]
numeric_vars <- names(train_data)[sapply(train_data, is.numeric) & names(train_data) != "expenditure"]
# Normalisasi hanya numeric variables
train_norm <- train_data
test_norm <- test_data
for(var in numeric_vars) {
train_norm[[var]] <- normalize(train_data[[var]])
test_norm[[var]] <- normalize(test_data[[var]])
}
# Training Neural Network dengan CV
nn_reg_model <- train(expenditure ~ .,
data = train_norm,
method = "nnet",
trControl = ctrl_cv5,
tuneGrid = expand.grid(
size = c(3, 5, 7),
decay = c(0.001, 0.01)
),
linout = TRUE, # Untuk regression
maxit = 100,
trace = FALSE)
# Tampilkan tuning results
cat("\nNeural Network Tuning Results:\n")
print(nn_reg_model$results)
# Plot neural network architecture
if(require(NeuralNetTools) && max(nn_reg_model$results$size) <= 10) {
plotnet(nn_reg_model$finalModel,
alpha = 0.6,
circle_col = "lightblue",
bord_col = "black",
main = "Neural Network Architecture")
}
# Prediksi
nn_reg_train_pred <- predict(nn_reg_model, train_norm)
nn_reg_test_pred <- predict(nn_reg_model, test_norm)
# Evaluasi
results$nn_reg <- evaluate_regression(nn_reg_test_pred, test_data$expenditure,
"Neural Network", "Test Data")
cv_results$nn_reg <- nn_reg_model
}, error = function(e) {
cat("Error dalam Neural Network Regression:", e$message, "\n")
})##
## Neural Network Tuning Results:
## size decay RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 3 0.001 210.4160 0.3469014 126.61956 28.18749 0.32848915 36.38809
## 2 3 0.010 137.4261 0.7611434 75.54134 35.11794 0.07487461 21.85270
## 3 5 0.001 172.3432 0.6067911 103.68488 65.23254 0.39971504 59.36173
## 4 5 0.010 206.1157 0.6129911 119.55365 84.88204 0.27371472 49.73585
## 5 7 0.001 221.3524 0.4677079 138.00686 29.32972 0.27789740 29.56603
## 6 7 0.010 162.6287 0.6432729 91.01524 81.47856 0.18997924 34.05221
##
## ==================================================
## Neural Network - Test Data
## ==================================================
## RMSE: 248.7369
## MAE : 129.2936
## MSE : 61870.06
## R² : 0.2337
2.0.8 Gradient Boosting
tryCatch({
# Training Gradient Boosting dengan CV
gbm_reg_model <- train(expenditure ~ .,
data = train_data,
method = "gbm",
trControl = ctrl_cv5,
verbose = FALSE,
tuneGrid = expand.grid(
n.trees = c(50, 100),
interaction.depth = c(2, 3),
shrinkage = c(0.01, 0.1),
n.minobsinnode = 5
))
# Tampilkan tuning results
cat("\nGradient Boosting Tuning Results:\n")
print(gbm_reg_model$results)
# Plot variable importance
varImpPlot_gbm <- varImp(gbm_reg_model, scale = FALSE)
plot(varImpPlot_gbm,
main = "Gradient Boosting - Variable Importance",
top = min(10, ncol(train_data) - 1))
# Prediksi
gbm_reg_train_pred <- predict(gbm_reg_model, train_data)
gbm_reg_test_pred <- predict(gbm_reg_model, test_data)
# Evaluasi
results$gbm_reg <- evaluate_regression(gbm_reg_test_pred, test_data$expenditure,
"Gradient Boosting", "Test Data")
cv_results$gbm_reg <- gbm_reg_model
}, error = function(e) {
cat("Error dalam Gradient Boosting Regression:", e$message, "\n")
})##
## Gradient Boosting Tuning Results:
## shrinkage interaction.depth n.minobsinnode n.trees RMSE Rsquared
## 1 0.01 2 5 50 205.67317 0.7734641
## 5 0.10 2 5 50 75.67444 0.9311453
## 3 0.01 3 5 50 196.54572 0.8336064
## 7 0.10 3 5 50 71.88018 0.9372471
## 2 0.01 2 5 100 167.47345 0.8236467
## 6 0.10 2 5 100 64.93840 0.9461651
## 4 0.01 3 5 100 152.93270 0.8626378
## 8 0.10 3 5 100 65.68809 0.9456612
## MAE RMSESD RsquaredSD MAESD
## 1 127.58591 43.37049 0.06469733 12.130833
## 5 33.87627 31.84996 0.03556314 5.612493
## 3 121.68726 43.91787 0.05757927 11.694968
## 7 28.29045 27.72399 0.02496331 3.808863
## 2 101.27320 40.75475 0.04535695 10.640828
## 6 26.75595 27.26843 0.02863940 3.861846
## 4 91.34489 41.25138 0.04314469 9.894588
## 8 24.08445 25.18381 0.02286334 3.773035
##
## ==================================================
## Gradient Boosting - Test Data
## ==================================================
## RMSE: 62.5461
## MAE : 27.2084
## MSE : 3912.009
## R² : 0.9611
2.0.9 SVM
tryCatch({
# Convert factor variables to numeric for SVM
train_svm <- train_data
test_svm <- test_data
for(var in names(train_svm)) {
if(is.factor(train_svm[[var]])) {
train_svm[[var]] <- as.numeric(train_svm[[var]])
test_svm[[var]] <- as.numeric(test_svm[[var]])
}
}
# Training SVM Regression dengan CV
svm_reg_model <- train(expenditure ~ .,
data = train_svm,
method = "svmRadial",
trControl = ctrl_cv5,
tuneLength = 3,
preProcess = c("center", "scale"))
# Tampilkan tuning results
cat("\nSVM Regression Tuning Results:\n")
print(svm_reg_model$results)
# Prediksi
svm_reg_train_pred <- predict(svm_reg_model, train_svm)
svm_reg_test_pred <- predict(svm_reg_model, test_svm)
# Evaluasi
results$svm_reg <- evaluate_regression(svm_reg_test_pred, test_data$expenditure,
"SVM Regression", "Test Data")
cv_results$svm_reg <- svm_reg_model
}, error = function(e) {
cat("Error dalam SVM Regression:", e$message, "\n")
})##
## SVM Regression Tuning Results:
## sigma C RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.08338795 0.25 147.7661 0.7450056 50.51453 56.68765 0.1437049 8.655255
## 2 0.08338795 0.50 127.2987 0.8052488 41.65658 57.39252 0.1377684 8.161104
## 3 0.08338795 1.00 108.9882 0.8521085 36.02953 54.88329 0.1225144 6.315507
##
## ==================================================
## SVM Regression - Test Data
## ==================================================
## RMSE: 135.3141
## MAE : 40.5069
## MSE : 18309.9
## R² : 0.8101
2.0.10 Perbandingan
model_names <- c("Polynomial", "Piecewise Constant", "Cubic Spline",
"KNN", "Regression Tree", "Random Forest", "Neural Network",
"Gradient Boosting", "SVM")
# Inisialisasi dataframe
comparison_df <- data.frame(
Model = model_names,
Test_RMSE = rep(NA, length(model_names)),
Test_R2 = rep(NA, length(model_names)),
MAE = rep(NA, length(model_names))
)
# Isi hasil
if(!is.null(results$poly)) {
comparison_df[comparison_df$Model == "Polynomial", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$poly$rmse, results$poly$r2, results$poly$mae)
}
if(!is.null(results$piecewise)) {
comparison_df[comparison_df$Model == "Piecewise Constant", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$piecewise$rmse, results$piecewise$r2, results$piecewise$mae)
}
if(!is.null(results$cubic_spline)) {
comparison_df[comparison_df$Model == "Cubic Spline", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$cubic_spline$rmse, results$cubic_spline$r2, results$cubic_spline$mae)
}
if(!is.null(results$knn_reg)) {
comparison_df[comparison_df$Model == "KNN", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$knn_reg$rmse, results$knn_reg$r2, results$knn_reg$mae)
}
if(!is.null(results$tree)) {
comparison_df[comparison_df$Model == "Regression Tree", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$tree$rmse, results$tree$r2, results$tree$mae)
}
if(!is.null(results$rf_reg)) {
comparison_df[comparison_df$Model == "Random Forest", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$rf_reg$rmse, results$rf_reg$r2, results$rf_reg$mae)
}
if(!is.null(results$nn_reg)) {
comparison_df[comparison_df$Model == "Neural Network", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$nn_reg$rmse, results$nn_reg$r2, results$nn_reg$mae)
}
if(!is.null(results$gbm_reg)) {
comparison_df[comparison_df$Model == "Gradient Boosting", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$gbm_reg$rmse, results$gbm_reg$r2, results$gbm_reg$mae)
}
if(!is.null(results$svm_reg)) {
comparison_df[comparison_df$Model == "SVM", c("Test_RMSE", "Test_R2", "MAE")] <-
c(results$svm_reg$rmse, results$svm_reg$r2, results$svm_reg$mae)
}
# Hapus baris dengan NA
comparison_df <- comparison_df[complete.cases(comparison_df), ]
# Urutkan berdasarkan RMSE terbaik (terkecil)
if(nrow(comparison_df) > 0) {
comparison_df <- comparison_df[order(comparison_df$Test_RMSE), ]
comparison_df$Rank <- 1:nrow(comparison_df)
cat("\nModel Performance Comparison (sorted by Test RMSE):\n")
print(comparison_df)
barplot(comparison_df$Test_RMSE,
names.arg = comparison_df$Model,
las = 2,
col = ifelse(comparison_df$Rank == 1, "gold", "steelblue"),
ylim = c(0, max(comparison_df$Test_RMSE) * 1.1),
main = "Test RMSE Comparison",
ylab = "RMSE",
cex.names = 0.8)
abline(h = min(comparison_df$Test_RMSE), col = "red", lty = 2, lwd = 2)
barplot(comparison_df$Test_R2,
names.arg = comparison_df$Model,
las = 2,
col = ifelse(comparison_df$Rank == 1, "gold", "darkgreen"),
ylim = c(0, 1),
main = "Test R² Comparison",
ylab = "R²",
cex.names = 0.8)
abline(h = max(comparison_df$Test_R2), col = "red", lty = 2, lwd = 2)
barplot(comparison_df$MAE,
names.arg = comparison_df$Model,
las = 2,
col = ifelse(comparison_df$Rank == 1, "gold", "darkorange"),
ylim = c(0, max(comparison_df$MAE) * 1.1),
main = "Test MAE Comparison",
ylab = "MAE",
cex.names = 0.8)
abline(h = min(comparison_df$MAE), col = "red", lty = 2, lwd = 2)
plot(comparison_df$Rank, comparison_df$Test_RMSE,
type = "b", pch = 19, col = "blue", lwd = 2,
xlab = "Rank (1 = Best)", ylab = "Test RMSE",
main = "Model Ranking by RMSE",
xlim = c(0.5, nrow(comparison_df) + 0.5))
text(comparison_df$Rank, comparison_df$Test_RMSE,
labels = comparison_df$Model,
pos = 3, cex = 0.7, col = "darkred")
grid()
cat("\n", strrep("=", 60))
cat("\nOVERFITTING ANALYSIS")
cat("\n", strrep("=", 60), "\n")
# Buat dataframe untuk overfitting analysis
overfitting_df <- data.frame(
Model = comparison_df$Model,
Train_RMSE = NA,
Test_RMSE = comparison_df$Test_RMSE,
Overfitting_Index = NA
)
# Isi Train RMSE
train_preds <- list(
poly_train_pred,
pc_train_pred,
cs_train_pred,
knn_reg_train_pred,
tree_train_pred,
rf_reg_train_pred,
nn_reg_train_pred,
gbm_reg_train_pred,
svm_reg_train_pred
)
for(i in 1:nrow(overfitting_df)) {
model_name <- overfitting_df$Model[i]
if(model_name == "Polynomial" && exists("poly_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, poly_train_pred)
} else if(model_name == "Piecewise Constant" && exists("pc_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, pc_train_pred)
} else if(model_name == "Cubic Spline" && exists("cs_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, cs_train_pred)
} else if(model_name == "Natural Spline" && exists("ns_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, ns_train_pred)
} else if(model_name == "KNN" && exists("knn_reg_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, knn_reg_train_pred)
} else if(model_name == "Regression Tree" && exists("tree_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, tree_train_pred)
} else if(model_name == "Random Forest" && exists("rf_reg_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, rf_reg_train_pred)
} else if(model_name == "Neural Network" && exists("nn_reg_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, nn_reg_train_pred)
} else if(model_name == "Gradient Boosting" && exists("gbm_reg_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, gbm_reg_train_pred)
} else if(model_name == "SVM" && exists("svm_reg_train_pred")) {
overfitting_df$Train_RMSE[i] <- rmse(train_data$expenditure, svm_reg_train_pred)
}
}
# Hitung overfitting index
overfitting_df$Overfitting_Index <- (overfitting_df$Train_RMSE - overfitting_df$Test_RMSE) / overfitting_df$Test_RMSE
cat("\nOverfitting Analysis (positive values indicate overfitting):\n")
print(overfitting_df)
# Plot overfitting
if(require(ggplot2)) {
p <- ggplot(overfitting_df, aes(x = Model)) +
geom_bar(aes(y = Train_RMSE, fill = "Train"), stat = "identity", alpha = 0.7) +
geom_bar(aes(y = Test_RMSE, fill = "Test"), stat = "identity", alpha = 0.7) +
labs(title = "Train vs Test RMSE (Overfitting Analysis)",
x = "Model",
y = "RMSE",
fill = "Dataset") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_manual(values = c("Train" = "blue", "Test" = "red"))
print(p)
}
} else {
cat("\nTidak ada model yang berhasil di-train untuk perbandingan.\n")
}##
## Model Performance Comparison (sorted by Test RMSE):
## Model Test_RMSE Test_R2 MAE Rank
## 8 Gradient Boosting 62.54606 0.9611186 27.20840 1
## 6 Random Forest 79.07442 0.9445186 16.91945 2
## 3 Cubic Spline 113.00134 0.8421841 61.89962 3
## 9 SVM 135.31408 0.8100652 40.50692 4
## 1 Polynomial 146.32130 0.7447015 67.01839 5
## 4 KNN 168.76051 0.7395614 77.31341 6
## 5 Regression Tree 177.00617 0.6141311 90.33640 7
## 2 Piecewise Constant 217.26661 0.4126403 126.25576 8
## 7 Neural Network 248.73694 0.2337293 129.29362 9
##
## ============================================================
## OVERFITTING ANALYSIS
## ============================================================
##
## Overfitting Analysis (positive values indicate overfitting):
## Model Train_RMSE Test_RMSE Overfitting_Index
## 1 Gradient Boosting 46.01191 62.54606 -0.264351606
## 2 Random Forest 30.93598 79.07442 -0.608773807
## 3 Cubic Spline 111.34691 113.00134 -0.014640856
## 4 SVM 92.57636 135.31408 -0.315840887
## 5 Polynomial 145.52501 146.32130 -0.005442083
## 6 KNN 125.20119 168.76051 -0.258113248
## 7 Regression Tree 127.65096 177.00617 -0.278833287
## 8 Piecewise Constant 199.93198 217.26661 -0.079785059
## 9 Neural Network 238.28377 248.73694 -0.042024977