Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(mlbench)
library(skimr)
library(visdat)
library(corrplot)
## corrplot 0.95 loaded
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

Import Dataset

data("BostonHousing", package = "mlbench")
df <- BostonHousing
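
A quick structural check confirms the expected column types before exploring; a minimal sketch using dplyr's glimpse() (attached via tidyverse, not part of the original output):

# Structural overview: 506 rows x 14 columns; chas is a factor,
# all other columns are numeric, and medv is the regression target
glimpse(df)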

Data Exploration

skim(df)
Data summary
Name df
Number of rows 506
Number of columns 14
_______________________
Column type frequency:
factor 1
numeric 13
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
chas 0 1 FALSE 2 0: 471, 1: 35

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
crim 0 1 3.61 8.60 0.01 0.08 0.26 3.68 88.98 ▇▁▁▁▁
zn 0 1 11.36 23.32 0.00 0.00 0.00 12.50 100.00 ▇▁▁▁▁
indus 0 1 11.14 6.86 0.46 5.19 9.69 18.10 27.74 ▇▆▁▇▁
nox 0 1 0.55 0.12 0.38 0.45 0.54 0.62 0.87 ▇▇▆▅▁
rm 0 1 6.28 0.70 3.56 5.89 6.21 6.62 8.78 ▁▂▇▂▁
age 0 1 68.57 28.15 2.90 45.02 77.50 94.07 100.00 ▂▂▂▃▇
dis 0 1 3.80 2.11 1.13 2.10 3.21 5.19 12.13 ▇▅▂▁▁
rad 0 1 9.55 8.71 1.00 4.00 5.00 24.00 24.00 ▇▂▁▁▃
tax 0 1 408.24 168.54 187.00 279.00 330.00 666.00 711.00 ▇▇▃▁▇
ptratio 0 1 18.46 2.16 12.60 17.40 19.05 20.20 22.00 ▁▃▅▅▇
b 0 1 356.67 91.29 0.32 375.38 391.44 396.22 396.90 ▁▁▁▁▇
lstat 0 1 12.65 7.14 1.73 6.95 11.36 16.96 37.97 ▇▇▅▂▁
medv 0 1 22.53 9.20 5.00 17.02 21.20 25.00 50.00 ▂▇▅▁▁
vis_dat(df)

dim(df)
## [1] 506  14
summary(df)
##       crim                zn             indus       chas         nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1: 35   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69           Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14           Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10           3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74           Max.   :0.8710  
##        rm             age              dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##       tax           ptratio            b              lstat      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

Histogram Plot

hist(df$medv, main = "Distribution of medv", xlab = "medv")

Boxplot

boxplot(df$medv, horizontal = TRUE, main = "Boxplot of medv")

boxplot(medv ~ chas, data = df, main = "medv by chas category", xlab = "chas", ylab = "medv")
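
Since ggplot2 is already attached, the same distribution checks can be written in that grammar as well; a minimal sketch of equivalent plots (not part of the original analysis):

# Histogram of the target with ggplot2
ggplot(df, aes(x = medv)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  labs(title = "Distribution of medv", x = "medv", y = "count")

# Boxplot of medv by the chas factor
ggplot(df, aes(x = chas, y = medv)) +
  geom_boxplot() +
  labs(title = "medv by chas category", x = "chas", y = "medv")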

Plot Korelasi

num_df <- df %>%
  select(where(is.numeric))

corr_mat <- cor(num_df, use = "complete.obs")

corrplot(
  corr_mat,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.cex = 0.8,
  number.cex = 0.6,
  col = colorRampPalette(c("green", "white", "red"))(200),
  diag = FALSE
)
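
To read the matrix with the target in mind, the correlations of each predictor with medv can be extracted and sorted; a small sketch reusing corr_mat from above:

# Correlation of every numeric variable with the target, strongest first
# (medv itself appears with correlation 1)
sort(corr_mat[, "medv"], decreasing = TRUE)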

Data Splitting

set.seed(100)
idx <- createDataPartition(df$medv, p = 0.8, list = FALSE)
train <- df[idx, ]
test  <- df[-idx, ]

dim(train); dim(test)
## [1] 407  14
## [1] 99 14
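
createDataPartition() samples within quantile groups of the outcome, so the train and test sets should have similar medv distributions; a quick sanity check (a sketch, not in the original output):

# The five-number summaries of medv should be close across the partitions
summary(train$medv)
summary(test$medv)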

CART Models

Iteration 1

iterasi_1 <- rpart(
  medv ~ rm + lstat + dis,
  data = train,
  method = "anova",
  control = rpart.control(cp = 0)  # grow the full tree; prune it afterwards
)

Visualization

rpart.plot(
  iterasi_1,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.3
)

Pruning

cp_table_1 <- iterasi_1$cptable

# 1-SE rule: find the minimum cross-validated error, then take the
# first (simplest) tree whose xerror is within one standard error of it
min_xerr_1 <- min(cp_table_1[, "xerror"])
min_row_1  <- which.min(cp_table_1[, "xerror"])
xstd_min_1 <- cp_table_1[min_row_1, "xstd"]
best_row_1 <- which(cp_table_1[, "xerror"] <= (min_xerr_1 + xstd_min_1))[1]
best_cp_1  <- cp_table_1[best_row_1, "CP"]

best_cp_1
## [1] 0.006147177
iterasi_1_pruned <- prune(iterasi_1, cp = best_cp_1)

rpart.plot(
  iterasi_1_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)
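
The CP selection above implements the 1-SE rule: keep the simplest tree whose cross-validated error is within one standard error of the minimum. rpart's built-in diagnostics show the same cptable graphically; a brief sketch:

# Cross-validated relative error vs. cp; plotcp() draws a dotted line
# one standard error above the minimum of the curve
plotcp(iterasi_1)
printcp(iterasi_1)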

Iteration 2

iterasi_2 <- rpart(
  medv ~ rm + lstat + tax,
  data = train,
  method = "anova",
  control = rpart.control(cp = 0)  # grow the full tree; prune it afterwards
)

Visualization

rpart.plot(
  iterasi_2,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.3
)

Pruning

cp_table_2 <- iterasi_2$cptable  # same 1-SE selection as in iteration 1
min_xerr_2 <- min(cp_table_2[, "xerror"])
min_row_2  <- which.min(cp_table_2[, "xerror"])
xstd_min_2 <- cp_table_2[min_row_2, "xstd"]
best_row_2 <- which(cp_table_2[, "xerror"] <= (min_xerr_2 + xstd_min_2))[1]
best_cp_2  <- cp_table_2[best_row_2, "CP"]
best_cp_2
## [1] 0.01143078
iterasi_2_pruned <- prune(iterasi_2, cp = best_cp_2)
rpart.plot(
  iterasi_2_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Iteration 3

iterasi_3 <- rpart(
  medv ~ rm + lstat + tax + dis,
  data = train,
  method = "anova",
  control = rpart.control(cp = 0)  # grow the full tree; prune it afterwards
)

Visualization

rpart.plot(
  iterasi_3,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.3
)

Pruning

cp_table_3 <- iterasi_3$cptable  # same 1-SE selection as before
min_xerr_3 <- min(cp_table_3[, "xerror"])
min_row_3  <- which.min(cp_table_3[, "xerror"])
xstd_min_3 <- cp_table_3[min_row_3, "xstd"]
best_row_3 <- which(cp_table_3[, "xerror"] <= (min_xerr_3 + xstd_min_3))[1]
best_cp_3  <- cp_table_3[best_row_3, "CP"]
best_cp_3
## [1] 0.01797635
iterasi_3_pruned <- prune(iterasi_3, cp = best_cp_3)
rpart.plot(
  iterasi_3_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)
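
The same 1-SE selection is repeated verbatim for all three trees, so a small helper would remove the duplication. A minimal sketch (the name best_cp_1se is illustrative, not from the original code):

# Return the largest CP whose cross-validated error is within
# one standard error of the minimum (1-SE rule)
best_cp_1se <- function(fit) {
  cp_tab   <- fit$cptable
  min_row  <- which.min(cp_tab[, "xerror"])
  thresh   <- cp_tab[min_row, "xerror"] + cp_tab[min_row, "xstd"]
  cp_tab[which(cp_tab[, "xerror"] <= thresh)[1], "CP"]
}

# Usage, e.g.: prune(iterasi_3, cp = best_cp_1se(iterasi_3))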

Model Evaluation

# Evaluation function: train/test RMSE, MAE, and R2 for one fitted model
eval_model <- function(model, name){
  pred_train <- predict(model, train)
  pred_test  <- predict(model, test)
  data.frame(
    Model = name,
    Train_RMSE = RMSE(pred_train, train$medv),
    Test_RMSE  = RMSE(pred_test,  test$medv),
    Train_MAE  = MAE(pred_train, train$medv),
    Test_MAE   = MAE(pred_test,  test$medv),
    Train_R2   = R2(pred_train, train$medv),
    Test_R2    = R2(pred_test,  test$medv)
  )
}
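
RMSE and MAE are easy to verify by hand, and caret's R2 defaults to the squared correlation between predictions and observations. A quick sketch (pred is an illustrative name, not from the original code):

# Manual check of the caret metrics on the test set
pred <- predict(iterasi_1, test)
sqrt(mean((pred - test$medv)^2))  # RMSE
mean(abs(pred - test$medv))       # MAE
cor(pred, test$medv)^2            # R2 (squared Pearson correlation)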

# Combine all models into one comparison table
final_results <- bind_rows(
  eval_model(iterasi_1, "Model 1 - Before"),
  eval_model(iterasi_1_pruned, "Model 1 - After"),
  eval_model(iterasi_2, "Model 2 - Before"),
  eval_model(iterasi_2_pruned, "Model 2 - After"),
  eval_model(iterasi_3, "Model 3 - Before"),
  eval_model(iterasi_3_pruned, "Model 3 - After")
) %>%
  mutate(
    Status = ifelse(grepl("After", Model), "After Pruning", "Before Pruning"),
    Model  = sub(" - .*", "", Model)
  ) %>%
  select(Model, Status, everything()) %>%
  mutate(across(where(is.numeric), \(x) round(x, 4)))
# Display the evaluation table
kable(final_results,
      caption = "CART Model Performance Comparison",
      align = "c",
      booktabs = TRUE) %>%
  kable_styling(full_width = FALSE, position = "center")
CART Model Performance Comparison
Model Status Train_RMSE Test_RMSE Train_MAE Test_MAE Train_R2 Test_R2
Model 1 Before Pruning 3.4019 4.2668 2.2904 3.1442 0.8632 0.7834
Model 1 After Pruning 3.9463 4.3348 2.8175 3.2454 0.8159 0.7771
Model 2 Before Pruning 3.4216 4.7689 2.2774 3.5457 0.8616 0.7360
Model 2 After Pruning 4.0674 4.6006 2.8746 3.2281 0.8044 0.7476
Model 3 Before Pruning 3.3655 4.1820 2.2564 3.0979 0.8661 0.7916
Model 3 After Pruning 4.1345 4.4708 2.9084 3.3613 0.7979 0.7638

Conclusion

Comparing the three CART models before and after pruning, using RMSE, MAE, and R² on both the training and test sets, shows that pruning does not always improve a model's ability to generalize.

For Model 1 and Model 3, pruning actually increased RMSE and MAE on the test data and lowered R². This suggests that both models already had close to optimal complexity before pruning, so removing branches cost them some of their ability to capture the patterns in the data (mild underfitting after pruning).

Model 2, by contrast, improved on the test set after pruning, as shown by the drop in Test_RMSE and the rise in Test_R². This indicates that Model 2 was overfitting more strongly than the other two models before pruning, so simplifying the tree helped it generalize.

In other words, Model 3 before pruning is the best model, with the lowest Test_RMSE and the highest Test_R² (0.7916). It explains about 79.16% of the variance in the test data and strikes the best bias-variance balance among all the models compared.

The takeaway is that pruning does not automatically improve performance; its effectiveness depends on the initial complexity of the model and the characteristics of the data. The pruning parameter should therefore be chosen systematically, for example via cross-validation, to obtain a genuinely optimal tree structure.
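
As a sketch of that systematic approach, caret can tune cp directly through cross-validation; the grid below is illustrative, not taken from the analysis above:

# 10-fold CV over a grid of cp values for the iteration-3 formula
set.seed(100)
cv_fit <- train(
  medv ~ rm + lstat + tax + dis,
  data = train,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = expand.grid(cp = seq(0, 0.05, by = 0.002))
)
cv_fit$bestTune  # cp with the lowest cross-validated RMSE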