SOAL

Lakukan tuning (cukup coba 3 kombinasi peubah) dan pruning, lalu jelaskan! 1. Pilih minimal 3 peubah penjelas 2. Membuat decision tree 3. Melakukan pruning 4. Mengulang dari langkah 1 sampai 3 iterasi

Library

library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.5.1
## Warning: package 'readr' was built under R version 4.5.1
## Warning: package 'stringr' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.5.2
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(corrplot)
## corrplot 0.95 loaded

Import Dataset

data("BostonHousing", package = "mlbench")
df <- BostonHousing

glimpse(df)
## Rows: 506
## Columns: 14
## $ crim    <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn      <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus   <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas    <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox     <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm      <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age     <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis     <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad     <dbl> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax     <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ b       <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ lstat   <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv    <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
summary(df$medv)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    5.00   17.02   21.20   22.53   25.00   50.00

Eksplorasi Data

dim(df)
## [1] 506  14
summary(df)
##       crim                zn             indus       chas         nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   0:471   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1: 35   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69           Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14           Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10           3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74           Max.   :0.8710  
##        rm             age              dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##       tax           ptratio            b              lstat      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00
colSums(is.na(df))
##    crim      zn   indus    chas     nox      rm     age     dis     rad     tax 
##       0       0       0       0       0       0       0       0       0       0 
## ptratio       b   lstat    medv 
##       0       0       0       0
# Cek Distribusi Variabel Target
hist(df$medv, main = "Distribusi medv", xlab = "medv")

boxplot(df$medv, horizontal = TRUE, main = "Boxplot medv")

num_df <- df %>% dplyr::select(where(is.numeric))
corr_mat <- cor(num_df)

corrplot(
  corr_mat,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black",
  tl.cex = 0.8,
  number.cex = 0.6
)

boxplot(medv ~ chas, data = df, main = "medv per kategori chas", xlab = "chas", ylab = "medv")

Splitting Data

set.seed(123)
idx <- createDataPartition(df$medv, p = 0.8, list = FALSE)
train <- df[idx, ]
test  <- df[-idx, ]

dim(train); dim(test)
## [1] 407  14
## [1] 99 14

Pembuatan Model CART

Kombinasi Peubah 1: rm, lstat, tax

fit1 <- rpart(
  medv ~ lstat + rm + tax,
  data = train,
  method = "anova",     # kunci untuk regresi
  control = rpart.control(
    minsplit = 20,
    minbucket = 7,
    maxdepth = 30,
    cp = 0              # grow dulu
  )
)

fit1
## n= 407 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##    1) root 407 34125.580000 22.51057  
##      2) rm< 6.8375 334 12681.220000 19.62695  
##        4) lstat>=14.4 146  2607.915000 15.05822  
##          8) tax>=567.5 82  1147.139000 12.94146  
##           16) lstat>=19.73 43   463.719100 10.99535  
##             32) lstat>=26.425 16   165.764400  9.41875 *
##             33) lstat< 26.425 27   234.616300 11.92963  
##               66) lstat< 21.15 7    56.848570 10.21429 *
##               67) lstat>=21.15 20   149.962000 12.53000  
##                134) lstat>=23.315 10    50.461000 12.07000 *
##                135) lstat< 23.315 10    95.269000 12.99000 *
##           17) lstat< 19.73 39   341.003600 15.08718  
##             34) rm>=6.176 23   105.204300 14.42609  
##               68) rm< 6.4775 16    92.159380 14.15625 *
##               69) rm>=6.4775 7     9.217143 15.04286 *
##             35) rm< 6.176 16   211.297500 16.03750 *
##          9) tax< 567.5 64   622.613600 17.77031  
##           18) tax>=305.5 45   384.056400 16.89111  
##             36) rm< 5.705 14    61.729290 15.09286 *
##             37) rm>=5.705 31   256.609700 17.70323  
##               74) tax< 309 9    53.462220 15.84444 *
##               75) tax>=309 22   159.330900 18.46364  
##                150) tax>=434.5 9    32.980000 16.83333 *
##                151) tax< 434.5 13    85.869230 19.59231 *
##           19) tax< 305.5 19   121.387400 19.85263 *
##        5) lstat< 14.4 188  4659.132000 23.17500  
##         10) lstat>=5.51 167  2966.036000 22.35329  
##           20) lstat>=9.705 85   540.668900 20.85647  
##             40) tax>=281.5 64   360.709800 20.50156  
##               80) lstat>=11.67 39   224.383100 19.96923  
##                160) tax< 417.5 23    85.856520 19.04348  
##                  320) rm< 5.9315 11    32.976360 18.01818 *
##                  321) rm>=5.9315 12    30.716670 19.98333 *
##                161) tax>=417.5 16    90.480000 21.30000 *
##               81) lstat< 11.67 25   108.034400 21.33200  
##                162) lstat< 10.755 16    62.737500 20.73750 *
##                163) lstat>=10.755 9    29.588890 22.38889 *
##             41) tax< 281.5 21   147.329500 21.93810  
##               82) lstat< 13.04 14    53.437140 21.31429 *
##               83) lstat>=13.04 7    77.548570 23.18571 *
##           21) lstat< 9.705 82  2037.518000 23.90488  
##             42) tax< 416.5 75   562.559500 23.30267  
##               84) rm< 6.133 21   120.172400 20.55238  
##                168) tax< 290 9    73.875560 19.57778 *
##                169) tax>=290 12    31.336670 21.28333 *
##               85) rm>=6.133 54   221.768300 24.37222  
##                170) rm< 6.611 43   118.864200 23.88837  
##                  340) rm< 6.4815 31    64.576770 23.65484  
##                    680) lstat>=6.725 21    48.571430 23.34286  
##                     1360) lstat< 8.52 14    15.403570 22.92143 *
##                     1361) lstat>=8.52 7    25.708570 24.18571 *
##                    681) lstat< 6.725 10     9.669000 24.31000 *
##                  341) rm>=6.4815 12    48.229170 24.49167 *
##                171) rm>=6.611 11    53.485450 26.26364 *
##             43) tax>=416.5 7  1156.337000 30.35714 *
##         11) lstat< 5.51 21   683.638100 29.70952  
##           22) lstat>=4.44 14   178.268600 27.52857 *
##           23) lstat< 4.44 7   305.594300 34.07143 *
##      3) rm>=6.8375 73  5959.989000 35.70411  
##        6) rm< 7.443 49  2037.207000 31.28367  
##         12) lstat>=9.76 8   440.575000 23.02500 *
##         13) lstat< 9.76 41   944.519000 32.89512  
##           26) lstat>=5.495 18   296.116100 30.27222 *
##           27) lstat< 5.495 23   427.657400 34.94783  
##             54) tax< 384 16   117.830000 33.52500 *
##             55) tax>=384 7   203.400000 38.20000 *
##        7) rm>=7.443 24  1010.470000 44.72917  
##         14) tax>=270 12   687.242500 43.02500 *
##         15) tax< 270 12   253.526700 46.43333 *

Visualisasi CART

rpart.plot(fit1,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Pruning

cp_table1 <- fit1$cptable

min_xerr1 <- min(cp_table1[, "xerror"])
min_row1  <- which.min(cp_table1[, "xerror"])
xstd_min1 <- cp_table1[min_row1, "xstd"]

# ambil pohon paling sederhana yang masih dalam 1 std dari minimum
best_row1 <- which(cp_table1[, "xerror"] <= (min_xerr1 + xstd_min1))[1]
best_cp1 <- cp_table1[best_row1, "CP"]

fit1_pruned <- prune(fit1, cp = best_cp1)
best_cp1
## [1] 0.01910921
fit1_pruned <- prune(fit1, cp = best_cp1)

rpart.plot(
  fit1_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Evaluasi

Evaluasi Sebelum Pruning

## Evaluasi Model Sebelum Pruning (Tree besar: cp=0)

### 1) Prediksi pada data train & test
pred_train1 <- predict(fit1, newdata = train)
pred_test1 <- predict(fit1, newdata = test)

### 2) Fungsi metrik (tanpa paket tambahan)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2   <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

### 3) Hitung metrik
train1_rmse <- rmse(train$medv, pred_train1)
train1_mae  <- mae(train$medv, pred_train1)
train1_r2   <- r2(train$medv, pred_train1)

test1_rmse  <- rmse(test$medv, pred_test1)
test1_mae   <- mae(test$medv, pred_test1)
test1_r2    <- r2(test$medv, pred_test1)

data.frame(
  dataset = c("train", "test"),
  RMSE = c(train1_rmse, test1_rmse),
  MAE  = c(train1_mae,  test1_mae),
  R2   = c(train1_r2,   test1_r2)
)
##   dataset     RMSE      MAE        R2
## 1   train 3.612199 2.430827 0.8443828
## 2    test 3.989146 2.896804 0.8165922

Plot Diagnostik Sebelum Pruning

## 4) Prediksi vs Aktual (Test)
plot(test$medv, pred_test1,
     xlab = "Aktual (medv)", ylab = "Prediksi",
     main = "Prediksi vs Aktual (Test) - Sebelum Pruning")
abline(0, 1)

## 5) Residual (Test)
res_test1 <- test$medv - pred_test1

hist(res_test1, breaks = 30,
     main = "Distribusi Residual (Test) - Sebelum Pruning",
     xlab = "Residual (Aktual - Prediksi)")

plot(pred_test1, res_test1,
     xlab = "Prediksi", ylab = "Residual",
     main = "Residual vs Prediksi (Test) - Sebelum Pruning")
abline(h = 0, lty = 2)

Evaluasi Sesudah Pruning

## Prediksi
pred_train1_pr <- predict(fit1_pruned, newdata = train)
pred_test1_pr  <- predict(fit1_pruned, newdata = test)

## Metrik
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2   <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

## Hasil (pruned)
res1_pruned <- data.frame(
  dataset = c("train", "test"),
  RMSE = c(rmse(train$medv, pred_train1_pr), rmse(test$medv, pred_test1_pr)),
  MAE  = c(mae(train$medv,  pred_train1_pr), mae(test$medv,  pred_test1_pr)),
  R2   = c(r2(train$medv,   pred_train1_pr), r2(test$medv,   pred_test1_pr))
)

res1_pruned
##   dataset     RMSE      MAE        R2
## 1   train 4.561107 3.096522 0.7518840
## 2    test 4.491834 3.326420 0.7674558

Plot Diagnostik Sesudah Pruning

## Prediksi vs Aktual (Test) - Pruned
plot(test$medv, pred_test1_pr,
     xlab = "Aktual (medv)", ylab = "Prediksi",
     main = "Prediksi vs Aktual (Test) - Setelah Pruning")
abline(0, 1)

## Residual - Pruned (Test)
res_test1_pr <- test$medv - pred_test1_pr

hist(res_test1_pr, breaks = 30,
     main = "Distribusi Residual (Test) - Setelah Pruning",
     xlab = "Residual (Aktual - Prediksi)")

plot(pred_test1_pr, res_test1_pr,
     xlab = "Prediksi", ylab = "Residual",
     main = "Residual vs Prediksi (Test) - Setelah Pruning")
abline(h = 0, lty = 2)

Kombinasi Peubah 2: rm, lstat, crim

fit2 <- rpart(
  medv ~ lstat + rm + crim,
  data = train,
  method = "anova",     # kunci untuk regresi
  control = rpart.control(
    minsplit = 20,
    minbucket = 7,
    maxdepth = 30,
    cp = 0              # grow dulu
  )
)

fit2
## n= 407 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##   1) root 407 34125.580000 22.510570  
##     2) rm< 6.8375 334 12681.220000 19.626950  
##       4) lstat>=14.4 146  2607.915000 15.058220  
##         8) crim>=7.006285 60   659.084000 11.860000  
##          16) lstat>=19.73 40   382.927700 10.807500  
##            32) lstat>=26.425 15   163.909300  9.506667 *
##            33) lstat< 26.425 25   178.406400 11.588000  
##              66) lstat< 21.15 7    56.848570 10.214290 *
##              67) lstat>=21.15 18   103.211100 12.122220 *
##          17) lstat< 19.73 20   143.225500 13.965000  
##            34) lstat< 17.4 11    81.485450 13.463640 *
##            35) lstat>=17.4 9    55.595560 14.577780 *
##         9) crim< 7.006285 86   906.940600 17.289530  
##          18) crim>=0.614845 42   246.529800 15.702380  
##            36) lstat>=18.885 14    54.474290 14.357140 *
##            37) lstat< 18.885 28   154.052500 16.375000  
##              74) lstat>=15.965 20   121.857500 15.875000  
##               148) crim< 3.11083 8     8.388750 14.612500 *
##               149) crim>=3.11083 12    92.216670 16.716670 *
##              75) lstat< 15.965 8    14.695000 17.625000 *
##          19) crim< 0.614845 44   453.619100 18.804550  
##            38) lstat>=21.23 8   137.328800 16.012500 *
##            39) lstat< 21.23 36   240.067500 19.425000  
##              78) crim>=0.147705 23    84.324350 18.726090  
##               156) crim>=0.25557 9     8.628889 18.088890 *
##               157) crim< 0.25557 14    69.692140 19.135710 *
##              79) crim< 0.147705 13   124.630800 20.661540 *
##       5) lstat< 14.4 188  4659.132000 23.175000  
##        10) lstat>=5.51 167  2966.036000 22.353290  
##          20) crim< 7.24712 160  1380.684000 21.990000  
##            40) rm< 6.062 65   404.937500 20.286150  
##              80) rm>=6.0205 7    59.557140 18.342860 *
##              81) rm< 6.0205 58   315.755200 20.520690  
##               162) rm< 5.8455 22   181.367700 19.868180  
##                 324) lstat>=12.065 11    95.781820 18.972730 *
##                 325) lstat< 12.065 11    67.945450 20.763640 *
##               163) rm>=5.8455 36   119.296400 20.919440  
##                 326) crim< 0.06758 11    41.487270 19.854550 *
##                 327) crim>=0.06758 25    59.846400 21.388000  
##                   654) lstat>=12.675 7     6.194286 20.471430 *
##                   655) lstat< 12.675 18    45.484440 21.744440 *
##            41) rm>=6.062 95   657.934300 23.155790  
##              82) lstat>=9.98 31   181.749700 21.396770  
##               164) rm>=6.158 24    93.020000 20.950000  
##                 328) lstat>=11.655 11    33.725450 20.063640 *
##                 329) lstat< 11.655 13    43.340000 21.700000 *
##               165) rm< 6.158 7    67.514290 22.928570 *
##              83) lstat< 9.98 64   333.806100 24.007810  
##               166) crim< 0.052275 22    88.538640 22.522730  
##                 332) lstat>=8.195 10    48.021000 21.430000 *
##                 333) lstat< 8.195 12    18.626670 23.433330 *
##               167) crim>=0.052275 42   171.331400 24.785710  
##                 334) rm< 6.611 32    85.160000 24.275000  
##                   668) lstat< 7.03 14    12.263570 23.821430 *
##                   669) lstat>=7.03 18    67.776110 24.627780 *
##                 335) rm>=6.611 10    51.116000 26.420000 *
##          21) crim>=7.24712 7  1081.557000 30.657140 *
##        11) lstat< 5.51 21   683.638100 29.709520  
##          22) lstat>=4.44 14   178.268600 27.528570 *
##          23) lstat< 4.44 7   305.594300 34.071430 *
##     3) rm>=6.8375 73  5959.989000 35.704110  
##       6) rm< 7.443 49  2037.207000 31.283670  
##        12) lstat>=9.76 8   440.575000 23.025000 *
##        13) lstat< 9.76 41   944.519000 32.895120  
##          26) lstat>=5.495 18   296.116100 30.272220 *
##          27) lstat< 5.495 23   427.657400 34.947830  
##            54) crim< 0.048955 9    38.915560 33.077780 *
##            55) crim>=0.048955 14   337.035000 36.150000 *
##       7) rm>=7.443 24  1010.470000 44.729170  
##        14) lstat>=4.54 7   692.917100 42.242860 *
##        15) lstat< 4.54 17   256.462400 45.752940 *

Visualisasi CART

rpart.plot(fit2,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Pruning

cp_table2 <- fit2$cptable

min_xerr2 <- min(cp_table2[, "xerror"])
min_row2 <- which.min(cp_table2[, "xerror"])
xstd_min2 <- cp_table2[min_row2, "xstd"]

# ambil pohon paling sederhana yang masih dalam 1 std dari minimum
best_row2 <- which(cp_table2[, "xerror"] <= (min_xerr2 + xstd_min2))[1]
best_cp2 <- cp_table2[best_row2, "CP"]

fit2_pruned <- prune(fit2, cp = best_cp2)
best_cp2
## [1] 0.006468622
fit2_pruned <- prune(fit2, cp = best_cp2)

rpart.plot(
  fit2_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Evaluasi

Evaluasi Sebelum Pruning

## Evaluasi Model Sebelum Pruning (Tree besar: cp=0)

### 1) Prediksi pada data train & test
pred_train2 <- predict(fit2, newdata = train)
pred_test2 <- predict(fit2, newdata = test)

### 2) Fungsi metrik (tanpa paket tambahan)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2   <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

### 3) Hitung metrik
train2_rmse <- rmse(train$medv, pred_train2)
train2_mae  <- mae(train$medv, pred_train2)
train2_r2   <- r2(train$medv, pred_train2)

test2_rmse  <- rmse(test$medv, pred_test2)
test2_mae   <- mae(test$medv, pred_test2)
test2_r2    <- r2(test$medv, pred_test2)

data.frame(
  dataset = c("train", "test"),
  RMSE = c(train2_rmse, test2_rmse),
  MAE  = c(train2_mae,  test2_mae),
  R2   = c(train2_r2,   test2_r2)
)
##   dataset     RMSE      MAE        R2
## 1   train 3.594078 2.390291 0.8459402
## 2    test 4.028534 2.668065 0.8129524

Plot Diagnostik Sebelum Pruning

## 4) Prediksi vs Aktual (Test)
plot(test$medv, pred_test2,
     xlab = "Aktual (medv)", ylab = "Prediksi",
     main = "Prediksi vs Aktual (Test) - Sebelum Pruning")
abline(0, 1)

## 5) Residual (Test)
res_test2 <- test$medv - pred_test2

hist(res_test2, breaks = 30,
     main = "Distribusi Residual (Test) - Sebelum Pruning",
     xlab = "Residual (Aktual - Prediksi)")

plot(pred_test2, res_test2,
     xlab = "Prediksi", ylab = "Residual",
     main = "Residual vs Prediksi (Test) - Sebelum Pruning")
abline(h = 0, lty = 2)

Evaluasi Sesudah Pruning

## Prediksi
pred_train2_pr <- predict(fit2_pruned, newdata = train)
pred_test2_pr  <- predict(fit2_pruned, newdata = test)

## Metrik
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2   <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

## Hasil (pruned)
res2_pruned <- data.frame(
  dataset = c("train", "test"),
  RMSE = c(rmse(train$medv, pred_train2_pr), rmse(test$medv, pred_test2_pr)),
  MAE  = c(mae(train$medv,  pred_train2_pr), mae(test$medv,  pred_test2_pr)),
  R2   = c(r2(train$medv,   pred_train2_pr), r2(test$medv,   pred_test2_pr))
)

res2_pruned
##   dataset     RMSE     MAE        R2
## 1   train 4.084385 2.85765 0.8010392
## 2    test 4.604186 3.25753 0.7556773

Plot Diagnostik Sesudah Pruning

## Prediksi vs Aktual (Test) - Pruned
plot(test$medv, pred_test2_pr,
     xlab = "Aktual (medv)", ylab = "Prediksi",
     main = "Prediksi vs Aktual (Test) - Setelah Pruning")
abline(0, 1)

## Residual - Pruned (Test)
res_test2_pr <- test$medv - pred_test2_pr

hist(res_test2_pr, breaks = 30,
     main = "Distribusi Residual (Test) - Setelah Pruning",
     xlab = "Residual (Aktual - Prediksi)")

plot(pred_test2_pr, res_test2_pr,
     xlab = "Prediksi", ylab = "Residual",
     main = "Residual vs Prediksi (Test) - Setelah Pruning")
abline(h = 0, lty = 2)

Kombinasi Peubah 3: rm, lstat, dis

fit3 <- rpart(
  medv ~ lstat + rm + dis,
  data = train,
  method = "anova",     # kunci untuk regresi
  control = rpart.control(
    minsplit = 20,
    minbucket = 7,
    maxdepth = 30,
    cp = 0              # grow dulu
  )
)

fit3
## n= 407 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##    1) root 407 34125.58000 22.510570  
##      2) rm< 6.8375 334 12681.22000 19.626950  
##        4) lstat>=14.4 146  2607.91500 15.058220  
##          8) dis< 2.0037 68   911.63220 12.516180  
##           16) lstat>=19.11 48   534.88980 11.402080  
##             32) lstat>=29.605 12   158.12920  9.491667 *
##             33) lstat< 29.605 36   318.36560 12.038890  
##               66) dis>=1.62265 17   102.42470 10.517650 *
##               67) dis< 1.62265 19   141.40000 13.400000 *
##           17) lstat< 19.11 20   174.17800 15.190000  
##             34) lstat>=15.965 12    56.98667 14.566670 *
##             35) lstat< 15.965 8   105.53500 16.125000 *
##          9) dis>=2.0037 78   873.78870 17.274360  
##           18) dis< 2.44945 30   247.98970 15.703330  
##             36) rm>=6.077 18    59.38000 14.600000 *
##             37) rm< 6.077 12   133.82920 17.358330 *
##           19) dis>=2.44945 48   505.47810 18.256250  
##             38) lstat>=16.215 30   363.70970 17.396670  
##               76) rm< 5.9505 14   130.57430 16.442860 *
##               77) rm>=5.9505 16   209.25440 18.231250 *
##             39) lstat< 16.215 18    82.65778 19.688890 *
##        5) lstat< 14.4 188  4659.13200 23.175000  
##         10) lstat>=5.51 167  2966.03600 22.353290  
##           20) dis>=1.7548 160  1307.66400 21.983120  
##             40) rm< 6.062 63   325.52000 20.233330  
##               80) rm>=6.0205 7    59.55714 18.342860 *
##               81) rm< 6.0205 56   237.81840 20.469640  
##                162) dis>=4.5006 26    61.32462 19.453850  
##                  324) lstat>=10.465 11    28.42182 18.572730 *
##                  325) lstat< 10.465 15    18.10000 20.100000 *
##                163) dis< 4.5006 30   126.41500 21.350000  
##                  326) rm< 5.852 13    72.76308 20.476920 *
##                  327) rm>=5.852 17    36.16471 22.017650 *
##             41) rm>=6.062 97   663.97280 23.119590  
##               82) lstat>=9.98 33   181.74970 21.396970  
##                164) rm>=6.158 26    93.39385 20.984620  
##                  328) lstat>=11.655 13    36.74769 20.269230 *
##                  329) lstat< 11.655 13    43.34000 21.700000 *
##                165) rm< 6.158 7    67.51429 22.928570 *
##               83) lstat< 9.98 64   333.80610 24.007810  
##                166) rm< 6.611 52   212.70060 23.536540  
##                  332) dis>=6.2284 15    65.25733 22.553330 *
##                  333) dis< 6.2284 37   127.06430 23.935140  
##                    666) rm< 6.168 7    20.72857 22.514290 *
##                    667) rm>=6.168 30    88.90667 24.266670  
##                     1334) dis< 3.7917 9    10.44222 23.355560 *
##                     1335) dis>=3.7917 21    67.79143 24.657140  
##                       2670) dis>=4.75025 12    29.24667 24.166670 *
##                       2671) dis< 4.75025 9    31.80889 25.311110 *
##                167) rm>=6.611 12    59.51000 26.050000 *
##           21) dis< 1.7548 7  1135.32900 30.814290 *
##         11) lstat< 5.51 21   683.63810 29.709520  
##           22) lstat>=4.44 14   178.26860 27.528570 *
##           23) lstat< 4.44 7   305.59430 34.071430 *
##      3) rm>=6.8375 73  5959.98900 35.704110  
##        6) rm< 7.443 49  2037.20700 31.283670  
##         12) lstat>=9.76 8   440.57500 23.025000 *
##         13) lstat< 9.76 41   944.51900 32.895120  
##           26) lstat>=5.495 18   296.11610 30.272220 *
##           27) lstat< 5.495 23   427.65740 34.947830  
##             54) dis>=4.417 16    97.83750 33.887500 *
##             55) dis< 4.417 7   270.71430 37.371430 *
##        7) rm>=7.443 24  1010.47000 44.729170  
##         14) lstat>=4.54 7   692.91710 42.242860 *
##         15) lstat< 4.54 17   256.46240 45.752940 *

Visualisasi CART

rpart.plot(fit3,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Pruning

cp_table3 <- fit3$cptable

min_xerr3 <- min(cp_table3[, "xerror"])
min_row3 <- which.min(cp_table3[, "xerror"])
xstd_min3 <- cp_table3[min_row3, "xstd"]

# ambil pohon paling sederhana yang masih dalam 1 std dari minimum
best_row3 <- which(cp_table3[, "xerror"] <= (min_xerr3 + xstd_min3))[1]
best_cp3 <- cp_table3[best_row3, "CP"]

fit3_pruned <- prune(fit3, cp = best_cp3)
best_cp3
## [1] 0.009323552
fit3_pruned <- prune(fit3, cp = best_cp3)

rpart.plot(
  fit2_pruned,
  type = 2,
  extra = 1,
  fallen.leaves = TRUE,
  tweak = 1.0
)

Evaluasi

Evaluasi Sebelum Pruning

## Evaluasi Model Sebelum Pruning (Tree besar: cp=0)

### 1) Prediksi pada data train & test
pred_train3 <- predict(fit3, newdata = train)
pred_test3 <- predict(fit3, newdata = test)

### 2) Fungsi metrik (tanpa paket tambahan)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2   <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

### 3) Hitung metrik
train3_rmse <- rmse(train$medv, pred_train3)
train3_mae  <- mae(train$medv, pred_train3)
train3_r2   <- r2(train$medv, pred_train3)

test3_rmse  <- rmse(test$medv, pred_test3)
test3_mae   <- mae(test$medv, pred_test3)
test3_r2    <- r2(test$medv, pred_test3)

data.frame(
  dataset = c("train", "test"),
  RMSE = c(train3_rmse, test3_rmse),
  MAE  = c(train3_mae,  test3_mae),
  R2   = c(train3_r2,   test3_r2)
)
##   dataset     RMSE      MAE        R2
## 1   train 3.653811 2.490784 0.8407767
## 2    test 4.444349 3.070000 0.7723465

Plot Diagnostik Sebelum Pruning

## 4) Prediksi vs Aktual (Test)
plot(test$medv, pred_test3,
     xlab = "Aktual (medv)", ylab = "Prediksi",
     main = "Prediksi vs Aktual (Test) - Sebelum Pruning")
abline(0, 1)

## 5) Residual (Test)
res_test3 <- test$medv - pred_test3

hist(res_test3, breaks = 30,
     main = "Distribusi Residual (Test) - Sebelum Pruning",
     xlab = "Residual (Aktual - Prediksi)")

plot(pred_test3, res_test3,
     xlab = "Prediksi", ylab = "Residual",
     main = "Residual vs Prediksi (Test) - Sebelum Pruning")
abline(h = 0, lty = 2)

Evaluasi Sesudah Pruning

## Prediksi
pred_train3_pr <- predict(fit3_pruned, newdata = train)
pred_test3_pr  <- predict(fit3_pruned, newdata = test)

## Metrik
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2   <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

## Hasil (pruned)
res3_pruned <- data.frame(
  dataset = c("train", "test"),
  RMSE = c(rmse(train$medv, pred_train3_pr), rmse(test$medv, pred_test3_pr)),
  MAE  = c(mae(train$medv,  pred_train3_pr), mae(test$medv,  pred_test3_pr)),
  R2   = c(r2(train$medv,   pred_train3_pr), r2(test$medv,   pred_test3_pr))
)

res3_pruned
##   dataset     RMSE      MAE        R2
## 1   train 4.237314 3.090222 0.7858611
## 2    test 4.870643 3.466103 0.7265797

Plot Diagnostik Sesudah Pruning

## Prediksi vs Aktual (Test) - Pruned
plot(test$medv, pred_test3_pr,
     xlab = "Aktual (medv)", ylab = "Prediksi",
     main = "Prediksi vs Aktual (Test) - Setelah Pruning")
abline(0, 1)

## Residual - Pruned (Test)
res_test3_pr <- test$medv - pred_test3_pr

hist(res_test3_pr, breaks = 30,
     main = "Distribusi Residual (Test) - Setelah Pruning",
     xlab = "Residual (Aktual - Prediksi)")

plot(pred_test3_pr, res_test3_pr,
     xlab = "Prediksi", ylab = "Residual",
     main = "Residual vs Prediksi (Test) - Setelah Pruning")
abline(h = 0, lty = 2)

Perbandingan

# KOMBINASI 1

res1_before <- data.frame(
  Model   = "Kombinasi 1",
  Tahap   = "Before Pruning",
  dataset = c("train", "test"),
  RMSE = c(train1_rmse, test1_rmse),
  MAE  = c(train1_mae,  test1_mae),
  R2   = c(train1_r2,   test1_r2)
)

res1_after <- data.frame(
  Model   = "Kombinasi 1",
  Tahap   = "After Pruning",
  dataset = c("train", "test"),
  RMSE = c(rmse(train$medv, pred_train1_pr),
           rmse(test$medv,  pred_test1_pr)),
  MAE  = c(mae(train$medv,  pred_train1_pr),
           mae(test$medv,   pred_test1_pr)),
  R2   = c(r2(train$medv,   pred_train1_pr),
           r2(test$medv,    pred_test1_pr))
)

# KOMBINASI 2

res2_before <- data.frame(
  Model   = "Kombinasi 2",
  Tahap   = "Before Pruning",
  dataset = c("train", "test"),
  RMSE = c(train2_rmse, test2_rmse),
  MAE  = c(train2_mae,  test2_mae),
  R2   = c(train2_r2,   test2_r2)
)

res2_after <- data.frame(
  Model   = "Kombinasi 2",
  Tahap   = "After Pruning",
  dataset = c("train", "test"),
  RMSE = c(rmse(train$medv, pred_train2_pr),
           rmse(test$medv,  pred_test2_pr)),
  MAE  = c(mae(train$medv,  pred_train2_pr),
           mae(test$medv,   pred_test2_pr)),
  R2   = c(r2(train$medv,   pred_train2_pr),
           r2(test$medv,    pred_test2_pr))
)

# KOMBINASI 3

res3_before <- data.frame(
  Model   = "Kombinasi 3",
  Tahap   = "Before Pruning",
  dataset = c("train", "test"),
  RMSE = c(train3_rmse, test3_rmse),
  MAE  = c(train3_mae,  test3_mae),
  R2   = c(train3_r2,   test3_r2)
)

res3_after <- data.frame(
  Model   = "Kombinasi 3",
  Tahap   = "After Pruning",
  dataset = c("train", "test"),
  RMSE = c(rmse(train$medv, pred_train3_pr),
           rmse(test$medv,  pred_test3_pr)),
  MAE  = c(mae(train$medv,  pred_train3_pr),
           mae(test$medv,   pred_test3_pr)),
  R2   = c(r2(train$medv,   pred_train3_pr),
           r2(test$medv,    pred_test3_pr))
)

# GABUNGKAN SEMUA

res_all_models <- rbind(
  res1_before, res1_after,
  res2_before, res2_after,
  res3_before, res3_after
)

res_all_models
##          Model          Tahap dataset     RMSE      MAE        R2
## 1  Kombinasi 1 Before Pruning   train 3.612199 2.430827 0.8443828
## 2  Kombinasi 1 Before Pruning    test 3.989146 2.896804 0.8165922
## 3  Kombinasi 1  After Pruning   train 4.561107 3.096522 0.7518840
## 4  Kombinasi 1  After Pruning    test 4.491834 3.326420 0.7674558
## 5  Kombinasi 2 Before Pruning   train 3.594078 2.390291 0.8459402
## 6  Kombinasi 2 Before Pruning    test 4.028534 2.668065 0.8129524
## 7  Kombinasi 2  After Pruning   train 4.084385 2.857650 0.8010392
## 8  Kombinasi 2  After Pruning    test 4.604186 3.257530 0.7556773
## 9  Kombinasi 3 Before Pruning   train 3.653811 2.490784 0.8407767
## 10 Kombinasi 3 Before Pruning    test 4.444349 3.070000 0.7723465
## 11 Kombinasi 3  After Pruning   train 4.237314 3.090222 0.7858611
## 12 Kombinasi 3  After Pruning    test 4.870643 3.466103 0.7265797

Kesimpulan

Model kombinasi 1 sebelum pruning memiliki performa yang baik pada data train (RMSE 3.61; MAE 2.43; R² 0.84), namun menurun pada data test (RMSE 3.99; MAE 2.90; R² 0.82). Perbedaan performa train-test tersebut menjadi indikasi adanya overfitting karena lebih menyesuaikan data train dibandingkan kemampuan generalisasi.

Model kombinasi 1 sesudah pruning memiliki performa yang menurun tetapi lebih stabil dengan data train (RMSE 4.27; MAE 2.91; R² 0.78) dan data test (RMSE 4.25; MAE 3.08; R² 0.79). Kondisi ini menunjukkan model cukup sederhana, namun memiliki kemampuan generalisasi yang lebih baik.

Model kombinasi 2 sebelum pruning memiliki performa yang sangat baik pada data train (RMSE 3.59; MAE 2.39; R² 0.85), namun menurun pada data test (RMSE 4.03; MAE 2.67; R² 0.81). Perbedaan performa train-test tersebut menjadi indikasi adanya overfitting karena lebih menyesuaikan data train dibandingkan kemampuan generalisasi.

Model kombinasi 2 sesudah pruning memiliki performa yang kurang baik meskipun cukup stabil dengan nilai data train (RMSE 4.51; MAE 3.05; R² 0.76) dan data test (RMSE 4.48; MAE 3.27; R² 0.77). Kondisi ini menunjukkan pruning yang dilakukan terlalu agresif sehingga model menjadi terlalu sederhana dan kurang baik untuk prediksi.

Model kombinasi 3 sebelum pruning memiliki performa yang baik pada data train (RMSE 3.65; MAE 2.49; R² 0.84), namun menurun jauh pada data test (RMSE 4.44; MAE 3.07; R² 0.77). Perbedaan performa train-test tersebut menjadi indikasi adanya overfitting karena lebih menyesuaikan data train dibandingkan kemampuan generalisasi.

Model kombinasi 3 sesudah pruning memiliki performa yang kurang baik meskipun cukup stabil dengan nilai data train (RMSE 4.39; MAE 3.09; R² 0.77) dan data test (RMSE 4.42; MAE 3.24; R² 0.77). Kondisi ini menunjukkan pruning yang dilakukan terlalu agresif sehingga model menjadi terlalu sederhana dan kurang baik untuk prediksi.

Secara keseluruhan, model kombinasi 1 sebelum pruning menjadi model terbaik karena memiliki nilai RMSE terkecil pada data test (3.99) serta nilai R² tertinggi pada data test (0.82).