library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
library(skimr)
library(visdat)
library(corrplot)
## corrplot 0.95 loaded
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
data("BostonHousing", package = "mlbench")
df <- BostonHousing
skim(df)
Data summary

|  |  |
|---|---|
| Name | df |
| Number of rows | 506 |
| Number of columns | 14 |
| Column type frequency: |  |
| factor | 1 |
| numeric | 13 |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| chas | 0 | 1 | FALSE | 2 | 0: 471, 1: 35 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| crim | 0 | 1 | 3.61 | 8.60 | 0.01 | 0.08 | 0.26 | 3.68 | 88.98 | ▇▁▁▁▁ |
| zn | 0 | 1 | 11.36 | 23.32 | 0.00 | 0.00 | 0.00 | 12.50 | 100.00 | ▇▁▁▁▁ |
| indus | 0 | 1 | 11.14 | 6.86 | 0.46 | 5.19 | 9.69 | 18.10 | 27.74 | ▇▆▁▇▁ |
| nox | 0 | 1 | 0.55 | 0.12 | 0.38 | 0.45 | 0.54 | 0.62 | 0.87 | ▇▇▆▅▁ |
| rm | 0 | 1 | 6.28 | 0.70 | 3.56 | 5.89 | 6.21 | 6.62 | 8.78 | ▁▂▇▂▁ |
| age | 0 | 1 | 68.57 | 28.15 | 2.90 | 45.02 | 77.50 | 94.07 | 100.00 | ▂▂▂▃▇ |
| dis | 0 | 1 | 3.80 | 2.11 | 1.13 | 2.10 | 3.21 | 5.19 | 12.13 | ▇▅▂▁▁ |
| rad | 0 | 1 | 9.55 | 8.71 | 1.00 | 4.00 | 5.00 | 24.00 | 24.00 | ▇▂▁▁▃ |
| tax | 0 | 1 | 408.24 | 168.54 | 187.00 | 279.00 | 330.00 | 666.00 | 711.00 | ▇▇▃▁▇ |
| ptratio | 0 | 1 | 18.46 | 2.16 | 12.60 | 17.40 | 19.05 | 20.20 | 22.00 | ▁▃▅▅▇ |
| b | 0 | 1 | 356.67 | 91.29 | 0.32 | 375.38 | 391.44 | 396.22 | 396.90 | ▁▁▁▁▇ |
| lstat | 0 | 1 | 12.65 | 7.14 | 1.73 | 6.95 | 11.36 | 16.96 | 37.97 | ▇▇▅▂▁ |
| medv | 0 | 1 | 22.53 | 9.20 | 5.00 | 17.02 | 21.20 | 25.00 | 50.00 | ▂▇▅▁▁ |
vis_dat(df)
dim(df)
## [1] 506 14
summary(df)
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio b lstat
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
hist(df$medv, main = "Distribution of medv", xlab = "medv")
boxplot(df$medv, horizontal = TRUE, main = "Boxplot of medv")
boxplot(medv ~ chas, data = df, main = "medv by chas category", xlab = "chas", ylab = "medv")
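Since ggplot2 is already attached via the tidyverse, the same distribution checks can also be written in ggplot2 style (a sketch equivalent to the base plots above):

ggplot(df, aes(x = medv)) +
  geom_histogram(bins = 30) +
  labs(title = "Distribution of medv", x = "medv")

ggplot(df, aes(x = chas, y = medv)) +
  geom_boxplot() +
  labs(title = "medv by chas category", x = "chas", y = "medv")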
num_df <- df %>%
select(where(is.numeric))
corr_mat <- cor(num_df, use = "complete.obs")
corrplot(
corr_mat,
method = "color",
type = "upper",
order = "hclust",
addCoef.col = "black",
tl.cex = 0.8,
number.cex = 0.6,
col = colorRampPalette(c("green", "white", "red"))(200),
diag = FALSE
)
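To flag highly correlated predictor pairs programmatically, caret's findCorrelation() can be applied to the same matrix (a sketch; the 0.75 cutoff is an arbitrary choice, not from the original analysis):

# Names of numeric predictors whose pairwise correlation exceeds 0.75
high_corr <- findCorrelation(corr_mat, cutoff = 0.75, names = TRUE)
high_corr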
set.seed(100)
idx <- createDataPartition(df$medv, p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]
dim(train); dim(test)
## [1] 407 14
## [1] 99 14
iterasi_1 <- rpart(
medv ~ rm + lstat + dis,
data = train,
method = "anova",
control = rpart.control(cp = 0)
)
rpart.plot(
iterasi_1,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.3
)
cp_table_1 <- iterasi_1$cptable
min_xerr_1 <- min(cp_table_1[, "xerror"])
min_row_1 <- which.min(cp_table_1[, "xerror"])
xstd_min_1 <- cp_table_1[min_row_1, "xstd"]
best_row_1 <- which(cp_table_1[, "xerror"] <= (min_xerr_1 + xstd_min_1))[1]
best_cp_1 <- cp_table_1[best_row_1, "CP"]
best_cp_1
## [1] 0.006147177
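The selection above implements the one-standard-error (1-SE) rule: take the simplest tree whose cross-validated error is within one standard error of the minimum. Since the same steps are repeated for iterations 2 and 3 below, they could be wrapped in a small helper (a sketch; select_cp_1se is our own name, not an rpart function):

select_cp_1se <- function(fit) {
  cp_tab <- fit$cptable
  min_row <- which.min(cp_tab[, "xerror"])
  # threshold: minimum cross-validated error plus one standard error
  thresh <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
  # the first qualifying row is the simplest tree within the threshold
  cp_tab[which(cp_tab[, "xerror"] <= thresh)[1], "CP"]
}
# e.g. best_cp_1 <- select_cp_1se(iterasi_1)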
iterasi_1_pruned <- prune(iterasi_1, cp = best_cp_1)
rpart.plot(
iterasi_1_pruned,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
iterasi_2 <- rpart(
medv ~ rm + lstat + tax,
data = train,
method = "anova",
control = rpart.control(cp = 0)
)
rpart.plot(
iterasi_2,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.3
)
cp_table_2 <- iterasi_2$cptable
min_xerr_2 <- min(cp_table_2[, "xerror"])
min_row_2 <- which.min(cp_table_2[, "xerror"])
xstd_min_2 <- cp_table_2[min_row_2, "xstd"]
best_row_2 <- which(cp_table_2[, "xerror"] <= (min_xerr_2 + xstd_min_2))[1]
best_cp_2 <- cp_table_2[best_row_2, "CP"]
best_cp_2
## [1] 0.01143078
iterasi_2_pruned <- prune(iterasi_2, cp = best_cp_2)
rpart.plot(
iterasi_2_pruned,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
iterasi_3 <- rpart(
medv ~ rm + lstat + tax + dis,
data = train,
method = "anova",
control = rpart.control(cp = 0)
)
rpart.plot(
iterasi_3,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.3
)
cp_table_3 <- iterasi_3$cptable
min_xerr_3 <- min(cp_table_3[, "xerror"])
min_row_3 <- which.min(cp_table_3[, "xerror"])
xstd_min_3 <- cp_table_3[min_row_3, "xstd"]
best_row_3 <- which(cp_table_3[, "xerror"] <= (min_xerr_3 + xstd_min_3))[1]
best_cp_3 <- cp_table_3[best_row_3, "CP"]
best_cp_3
## [1] 0.01797635
iterasi_3_pruned <- prune(iterasi_3, cp = best_cp_3)
rpart.plot(
iterasi_3_pruned,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
# Evaluation helper: RMSE, MAE, and R2 on the train and test sets
eval_model <- function(model, name){
  # compute predictions once instead of once per metric
  pred_train <- predict(model, train)
  pred_test <- predict(model, test)
  data.frame(
    Model = name,
    Train_RMSE = RMSE(pred_train, train$medv),
    Test_RMSE = RMSE(pred_test, test$medv),
    Train_MAE = MAE(pred_train, train$medv),
    Test_MAE = MAE(pred_test, test$medv),
    Train_R2 = R2(pred_train, train$medv),
    Test_R2 = R2(pred_test, test$medv)
  )
}
# Combine all models into one comparison table
final_results <- bind_rows(
  eval_model(iterasi_1, "Model 1 - Before"),
  eval_model(iterasi_1_pruned, "Model 1 - After"),
  eval_model(iterasi_2, "Model 2 - Before"),
  eval_model(iterasi_2_pruned, "Model 2 - After"),
  eval_model(iterasi_3, "Model 3 - Before"),
  eval_model(iterasi_3_pruned, "Model 3 - After")
) %>%
  mutate(
    Status = ifelse(grepl("After", Model), "After Pruning", "Before Pruning"),
    Model = sub(" - .*", "", Model)
  ) %>%
  select(Model, Status, everything()) %>%
  mutate(across(where(is.numeric), \(x) round(x, 4)))
# Display the evaluation table
kable(final_results,
      caption = "Comparison of CART Model Performance",
      align = "c",
      booktabs = TRUE) %>%
  kable_styling(full_width = FALSE, position = "center")
| Model | Status | Train_RMSE | Test_RMSE | Train_MAE | Test_MAE | Train_R2 | Test_R2 |
|---|---|---|---|---|---|---|---|
| Model 1 | Before Pruning | 3.4019 | 4.2668 | 2.2904 | 3.1442 | 0.8632 | 0.7834 |
| Model 1 | After Pruning | 3.9463 | 4.3348 | 2.8175 | 3.2454 | 0.8159 | 0.7771 |
| Model 2 | Before Pruning | 3.4216 | 4.7689 | 2.2774 | 3.5457 | 0.8616 | 0.7360 |
| Model 2 | After Pruning | 4.0674 | 4.6006 | 2.8746 | 3.2281 | 0.8044 | 0.7476 |
| Model 3 | Before Pruning | 3.3655 | 4.1820 | 2.2564 | 3.0979 | 0.8661 | 0.7916 |
| Model 3 | After Pruning | 4.1345 | 4.4708 | 2.9084 | 3.3613 | 0.7979 | 0.7638 |
Comparing the three CART models before and after pruning, using RMSE, MAE, and R² on the training and testing data, shows that pruning does not always improve a model's ability to generalize.
For Model 1 and Model 3, pruning actually raised RMSE and MAE on the test data and lowered R². This suggests that both models already had a reasonably optimal complexity before pruning, so cutting branches reduced their ability to capture patterns in the data (a sign of mild underfitting after pruning).
Conversely, for Model 2, pruning improved test-set performance, as shown by the lower Test_RMSE and higher Test_R². This indicates that Model 2 overfit more strongly than the other two models before pruning, so simplifying the tree structure helped it generalize.
Overall, Model 3 before pruning is the best model, with the lowest Test_RMSE and the highest Test_R² (0.7916). It explains about 79.16% of the variance in the test data and strikes the best bias–variance balance among all the models compared.
In short, pruning does not automatically improve model performance; its effectiveness depends on the model's initial complexity and the characteristics of the data. The pruning parameter should therefore be chosen systematically, for example via cross-validation, to obtain a genuinely optimal tree structure; a minimal sketch follows.
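As an illustration, here is a minimal sketch of such a cross-validated search using caret, which tunes rpart's cp directly (the predictor set from Model 3 and the existing train split are assumed):

set.seed(100)
cv_fit <- train(
  medv ~ rm + lstat + tax + dis,
  data = train,
  method = "rpart",                                      # caret tunes cp for rpart trees
  trControl = trainControl(method = "cv", number = 10),  # 10-fold cross-validation
  tuneLength = 20                                        # 20 candidate cp values
)
cv_fit$bestTune  # cp with the lowest cross-validated RMSE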