Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(mlbench)
library(skimr)
library(visdat)
library(corrplot)
## corrplot 0.95 loaded
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Import Dataset

data("BostonHousing", package = "mlbench")
df <- BostonHousing
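
A quick structural check confirms the expected column types before exploring; a minimal sketch using dplyr's glimpse() (attached via tidyverse, not part of the original output):

# Structural overview: 506 rows x 14 columns; chas is a factor,
# all other columns are numeric, and medv is the regression target
glimpse(df)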

Data Exploration

skim(df)
Data summary
Name df
Number of rows 506
Number of columns 14
_______________________
Column type frequency:
factor 1
numeric 13
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
chas 0 1 FALSE 2 0: 471, 1: 35

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
crim 0 1 3.61 8.60 0.01 0.08 0.26 3.68 88.98 ▇▁▁▁▁
zn 0 1 11.36 23.32 0.00 0.00 0.00 12.50 100.00 ▇▁▁▁▁
indus 0 1 11.14 6.86 0.46 5.19 9.69 18.10 27.74 ▇▆▁▇▁
nox 0 1 0.55 0.12 0.38 0.45 0.54 0.62 0.87 ▇▇▆▅▁
rm 0 1 6.28 0.70 3.56 5.89 6.21 6.62 8.78 ▁▂▇▂▁
age 0 1 68.57 28.15 2.90 45.02 77.50 94.07 100.00 ▂▂▂▃▇
dis 0 1 3.80 2.11 1.13 2.10 3.21 5.19 12.13 ▇▅▂▁▁
rad 0 1 9.55 8.71 1.00 4.00 5.00 24.00 24.00 ▇▂▁▁▃
tax 0 1 408.24 168.54 187.00 279.00 330.00 666.00 711.00 ▇▇▃▁▇
ptratio 0 1 18.46 2.16 12.60 17.40 19.05 20.20 22.00 ▁▃▅▅▇
b 0 1 356.67 91.29 0.32 375.38 391.44 396.22 396.90 ▁▁▁▁▇
lstat 0 1 12.65 7.14 1.73 6.95 11.36 16.96 37.97 ▇▇▅▂▁
medv 0 1 22.53 9.20 5.00 17.02 21.20 25.00 50.00 ▂▇▅▁▁
vis_dat(df)

dim(df)
## [1] 506  14
summary(df)
##       crim                zn             indus       chas         nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1: 35   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69           Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14           Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10           3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74           Max.   :0.8710  
##        rm             age              dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##       tax           ptratio            b              lstat      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

Histogram Plot

hist(df$medv, main = "Distribution of medv", xlab = "medv")

Boxplot

boxplot(df$medv, horizontal = TRUE, main = "Boxplot of medv")

boxplot(medv ~ chas, data = df, main = "medv by chas category", xlab = "chas", ylab = "medv")
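
Since ggplot2 is already attached, the same distribution checks can be written in that grammar as well; a minimal sketch of equivalent plots (not part of the original analysis):

# Histogram of the target with ggplot2
ggplot(df, aes(x = medv)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Distribution of medv", x = "medv", y = "count")

# Boxplot of medv by the chas factor
ggplot(df, aes(x = chas, y = medv)) +
  geom_boxplot() +
  labs(title = "medv by chas category", x = "chas", y = "medv")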

Plot Korelasi

num_df <- df %>%
  select(where(is.numeric))

corr_mat <- cor(num_df, use = "complete.obs")

corrplot(
  corr_mat,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.cex = 0.8,
  number.cex = 0.6,
  col = colorRampPalette(c("green", "white", "red"))(200),
  diag = FALSE
)
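
To read the matrix with the target in mind, the correlations of each predictor with medv can be extracted and sorted; a small sketch reusing corr_mat from above:

# Correlation of every numeric variable with the target, strongest first
# (medv itself appears with correlation 1)
sort(corr_mat[, "medv"], decreasing = TRUE)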

Data Splitting

set.seed(100)
idx <- createDataPartition(df$medv, p = 0.8, list = FALSE)
train <- df[idx, ]
test  <- df[-idx, ]

dim(train); dim(test)
## [1] 407  14
## [1] 99 14
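
createDataPartition() samples within quantile groups of the outcome, so the train and test sets should have similar medv distributions; a quick sanity check (a sketch, not in the original output):

# The five-number summaries of medv should be close across the partitions
summary(train$medv)
summary(test$medv)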

CART Models

Iteration 1

iterasi_1 <- rpart(
  medv ~ rm + lstat + dis,
  data = train,
  method = "anova",
  control = rpart.control(cp = 0)  # grow the full tree; prune it afterwards
)

Visualization

rpart.plot(
  iterasi_1,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.3
)

Pruning

cp_table_1 <- iterasi_1$cptable

# 1-SE rule: find the minimum cross-validated error, then take the
# first (simplest) tree whose xerror is within one standard error of it
min_xerr_1 <- min(cp_table_1[, "xerror"])
min_row_1  <- which.min(cp_table_1[, "xerror"])
xstd_min_1 <- cp_table_1[min_row_1, "xstd"]
best_row_1 <- which(cp_table_1[, "xerror"] <= (min_xerr_1 + xstd_min_1))[1]
best_cp_1  <- cp_table_1[best_row_1, "CP"]

best_cp_1
## [1] 0.006147177
iterasi_1_pruned <- prune(iterasi_1, cp = best_cp_1)

rpart.plot(
  iterasi_1_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)
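
The CP selection above implements the 1-SE rule: keep the simplest tree whose cross-validated error is within one standard error of the minimum. rpart's built-in diagnostics show the same cptable graphically; a brief sketch:

# Cross-validated relative error vs. cp; plotcp() draws a dotted line
# one standard error above the minimum of the curve
plotcp(iterasi_1)
printcp(iterasi_1)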

Iteration 2

iterasi_2 <- rpart(
  medv ~ rm + lstat + tax,
  data = train,
  method = "anova",
  control = rpart.control(cp = 0)  # grow the full tree; prune it afterwards
)

Visualization

rpart.plot(
  iterasi_2,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.3
)

Pruning

cp_table_2 <- iterasi_2$cptable  # same 1-SE selection as in iteration 1
min_xerr_2 <- min(cp_table_2[, "xerror"])
min_row_2  <- which.min(cp_table_2[, "xerror"])
xstd_min_2 <- cp_table_2[min_row_2, "xstd"]
best_row_2 <- which(cp_table_2[, "xerror"] <= (min_xerr_2 + xstd_min_2))[1]
best_cp_2  <- cp_table_2[best_row_2, "CP"]
best_cp_2
## [1] 0.01143078
iterasi_2_pruned <- prune(iterasi_2, cp = best_cp_2)
rpart.plot(
  iterasi_2_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Iteration 3

iterasi_3 <- rpart(
  medv ~ rm + lstat + tax + dis,
  data = train,
  method = "anova",
  control = rpart.control(cp = 0)  # grow the full tree; prune it afterwards
)

Visualization

rpart.plot(
  iterasi_3,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.3
)

Pruning

cp_table_3 <- iterasi_3$cptable  # same 1-SE selection as before
min_xerr_3 <- min(cp_table_3[, "xerror"])
min_row_3  <- which.min(cp_table_3[, "xerror"])
xstd_min_3 <- cp_table_3[min_row_3, "xstd"]
best_row_3 <- which(cp_table_3[, "xerror"] <= (min_xerr_3 + xstd_min_3))[1]
best_cp_3  <- cp_table_3[best_row_3, "CP"]
best_cp_3
## [1] 0.01797635
iterasi_3_pruned <- prune(iterasi_3, cp = best_cp_3)
rpart.plot(
  iterasi_3_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)
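
The same 1-SE selection is repeated verbatim for all three trees, so a small helper would remove the duplication. A minimal sketch (the name best_cp_1se is illustrative, not from the original code):

# Return the largest CP whose cross-validated error is within
# one standard error of the minimum (1-SE rule)
best_cp_1se <- function(fit) {
  cp_tab   <- fit$cptable
  min_row  <- which.min(cp_tab[, "xerror"])
  thresh   <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
  cp_tab[which(cp_tab[, "xerror"] <= thresh)[1], "CP"]
}

# Usage, e.g.: prune(iterasi_3, cp = best_cp_1se(iterasi_3))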

Model Evaluation

# Evaluation function: train/test RMSE, MAE, and R2 for one fitted model
eval_model <- function(model, name){
  pred_train <- predict(model, train)
  pred_test  <- predict(model, test)
  data.frame(
    Model = name,
    Train_RMSE = RMSE(pred_train, train$medv),
    Test_RMSE  = RMSE(pred_test,  test$medv),
    Train_MAE  = MAE(pred_train, train$medv),
    Test_MAE   = MAE(pred_test,  test$medv),
    Train_R2   = R2(pred_train, train$medv),
    Test_R2    = R2(pred_test,  test$medv)
  )
}
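
RMSE and MAE are easy to verify by hand, and caret's R2 defaults to the squared correlation between predictions and observations. A quick sketch (pred is an illustrative name, not from the original code):

# Manual check of the caret metrics on the test set
pred <- predict(iterasi_1, test)
sqrt(mean((pred - test$medv)^2))  # RMSE
mean(abs(pred - test$medv))       # MAE
cor(pred, test$medv)^2            # R2 (squared Pearson correlation)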

# Combine all models into one comparison table
final_results <- bind_rows(
  eval_model(iterasi_1, "Model 1 - Before"),
  eval_model(iterasi_1_pruned, "Model 1 - After"),
  eval_model(iterasi_2, "Model 2 - Before"),
  eval_model(iterasi_2_pruned, "Model 2 - After"),
  eval_model(iterasi_3, "Model 3 - Before"),
  eval_model(iterasi_3_pruned, "Model 3 - After")
) %>%
  mutate(
    Status = ifelse(grepl("After", Model), "After Pruning", "Before Pruning"),
    Model  = sub(" - .*", "", Model)
  ) %>%
  select(Model, Status, everything()) %>%
  mutate(across(where(is.numeric), \(x) round(x, 4)))
# Display the evaluation table
kable(final_results,
      caption = "CART Model Performance Comparison",
      align = "c",
      booktabs = TRUE) %>%
  kable_styling(full_width = FALSE, position = "center")
CART Model Performance Comparison
Model Status Train_RMSE Test_RMSE Train_MAE Test_MAE Train_R2 Test_R2
Model 1 Before Pruning 3.4019 4.2668 2.2904 3.1442 0.8632 0.7834
Model 1 After Pruning 3.9463 4.3348 2.8175 3.2454 0.8159 0.7771
Model 2 Before Pruning 3.4216 4.7689 2.2774 3.5457 0.8616 0.7360
Model 2 After Pruning 4.0674 4.6006 2.8746 3.2281 0.8044 0.7476
Model 3 Before Pruning 3.3655 4.1820 2.2564 3.0979 0.8661 0.7916
Model 3 After Pruning 4.1345 4.4708 2.9084 3.3613 0.7979 0.7638

Conclusion

Comparing the three CART models before and after pruning, using RMSE, MAE, and R² on both the training and test sets, shows that pruning does not always improve a model's ability to generalize.

For Model 1 and Model 3, pruning actually increased RMSE and MAE on the test data and lowered R². This suggests that both models already had close to optimal complexity before pruning, so removing branches cost them some of their ability to capture the patterns in the data (mild underfitting after pruning).

Model 2, by contrast, improved on the test set after pruning, as shown by the drop in Test_RMSE and the rise in Test_R². This indicates that Model 2 was overfitting more strongly than the other two models before pruning, so simplifying the tree helped it generalize.

In other words, Model 3 before pruning is the best model, with the lowest Test_RMSE and the highest Test_R² (0.7916). It explains about 79.16% of the variance in the test data and strikes the best bias-variance balance among all the models compared.

The takeaway is that pruning does not automatically improve performance; its effectiveness depends on the initial complexity of the model and the characteristics of the data. The pruning parameter should therefore be chosen systematically, for example via cross-validation, to obtain a genuinely optimal tree structure.
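
As a sketch of that systematic approach, caret can tune cp directly through cross-validation; the grid below is illustrative, not taken from the analysis above:

# 10-fold CV over a grid of cp values for the iteration-3 formula
set.seed(100)
cv_fit <- train(
  medv ~ rm + lstat + tax + dis,
  data = train,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = expand.grid(cp = seq(0, 0.05, by = 0.002))
)
cv_fit$bestTune  # cp with the lowest cross-validated RMSE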