Lakukan tuning (cukup coba 3 kombinasi peubah) dan pruning, lalu jelaskan! 1. Pilih minimal 3 peubah penjelas 2. Membuat decision tree 3. Melakukan pruning 4. Mengulang dari langkah 1 sampai 3 iterasi
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.5.1
## Warning: package 'readr' was built under R version 4.5.1
## Warning: package 'stringr' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlbench)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.5.2
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(corrplot)
## corrplot 0.95 loaded
data("BostonHousing", package = "mlbench")
df <- BostonHousing
glimpse(df)
## Rows: 506
## Columns: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, 0.08829,…
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 1…
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.…
## $ chas <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524,…
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172, 5.631,…
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0, 85.9, 9…
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605, 5.9505…
## $ rad <dbl> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,…
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311, 311, 31…
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15…
## $ b <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60, 396.90…
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.93, 17.10…
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15…
summary(df$medv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.00 17.02 21.20 22.53 25.00 50.00
dim(df)
## [1] 506 14
summary(df)
## crim zn indus chas nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 0:471 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1: 35 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm age dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## tax ptratio b lstat
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
colSums(is.na(df))
## crim zn indus chas nox rm age dis rad tax
## 0 0 0 0 0 0 0 0 0 0
## ptratio b lstat medv
## 0 0 0 0
# Cek Distribusi Variabel Target
hist(df$medv, main = "Distribusi medv", xlab = "medv")
boxplot(df$medv, horizontal = TRUE, main = "Boxplot medv")
num_df <- df %>% dplyr::select(where(is.numeric))
corr_mat <- cor(num_df)
corrplot(
corr_mat,
method = "color",
type = "upper",
order = "hclust",
addCoef.col = "black",
tl.cex = 0.8,
number.cex = 0.6
)
boxplot(medv ~ chas, data = df, main = "medv per kategori chas", xlab = "chas", ylab = "medv")
set.seed(123)
idx <- createDataPartition(df$medv, p = 0.8, list = FALSE)
train <- df[idx, ]
test <- df[-idx, ]
dim(train); dim(test)
## [1] 407 14
## [1] 99 14
fit1 <- rpart(
medv ~ lstat + rm + tax,
data = train,
method = "anova", # kunci untuk regresi
control = rpart.control(
minsplit = 20,
minbucket = 7,
maxdepth = 30,
cp = 0 # grow dulu
)
)
fit1
## n= 407
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 407 34125.580000 22.51057
## 2) rm< 6.8375 334 12681.220000 19.62695
## 4) lstat>=14.4 146 2607.915000 15.05822
## 8) tax>=567.5 82 1147.139000 12.94146
## 16) lstat>=19.73 43 463.719100 10.99535
## 32) lstat>=26.425 16 165.764400 9.41875 *
## 33) lstat< 26.425 27 234.616300 11.92963
## 66) lstat< 21.15 7 56.848570 10.21429 *
## 67) lstat>=21.15 20 149.962000 12.53000
## 134) lstat>=23.315 10 50.461000 12.07000 *
## 135) lstat< 23.315 10 95.269000 12.99000 *
## 17) lstat< 19.73 39 341.003600 15.08718
## 34) rm>=6.176 23 105.204300 14.42609
## 68) rm< 6.4775 16 92.159380 14.15625 *
## 69) rm>=6.4775 7 9.217143 15.04286 *
## 35) rm< 6.176 16 211.297500 16.03750 *
## 9) tax< 567.5 64 622.613600 17.77031
## 18) tax>=305.5 45 384.056400 16.89111
## 36) rm< 5.705 14 61.729290 15.09286 *
## 37) rm>=5.705 31 256.609700 17.70323
## 74) tax< 309 9 53.462220 15.84444 *
## 75) tax>=309 22 159.330900 18.46364
## 150) tax>=434.5 9 32.980000 16.83333 *
## 151) tax< 434.5 13 85.869230 19.59231 *
## 19) tax< 305.5 19 121.387400 19.85263 *
## 5) lstat< 14.4 188 4659.132000 23.17500
## 10) lstat>=5.51 167 2966.036000 22.35329
## 20) lstat>=9.705 85 540.668900 20.85647
## 40) tax>=281.5 64 360.709800 20.50156
## 80) lstat>=11.67 39 224.383100 19.96923
## 160) tax< 417.5 23 85.856520 19.04348
## 320) rm< 5.9315 11 32.976360 18.01818 *
## 321) rm>=5.9315 12 30.716670 19.98333 *
## 161) tax>=417.5 16 90.480000 21.30000 *
## 81) lstat< 11.67 25 108.034400 21.33200
## 162) lstat< 10.755 16 62.737500 20.73750 *
## 163) lstat>=10.755 9 29.588890 22.38889 *
## 41) tax< 281.5 21 147.329500 21.93810
## 82) lstat< 13.04 14 53.437140 21.31429 *
## 83) lstat>=13.04 7 77.548570 23.18571 *
## 21) lstat< 9.705 82 2037.518000 23.90488
## 42) tax< 416.5 75 562.559500 23.30267
## 84) rm< 6.133 21 120.172400 20.55238
## 168) tax< 290 9 73.875560 19.57778 *
## 169) tax>=290 12 31.336670 21.28333 *
## 85) rm>=6.133 54 221.768300 24.37222
## 170) rm< 6.611 43 118.864200 23.88837
## 340) rm< 6.4815 31 64.576770 23.65484
## 680) lstat>=6.725 21 48.571430 23.34286
## 1360) lstat< 8.52 14 15.403570 22.92143 *
## 1361) lstat>=8.52 7 25.708570 24.18571 *
## 681) lstat< 6.725 10 9.669000 24.31000 *
## 341) rm>=6.4815 12 48.229170 24.49167 *
## 171) rm>=6.611 11 53.485450 26.26364 *
## 43) tax>=416.5 7 1156.337000 30.35714 *
## 11) lstat< 5.51 21 683.638100 29.70952
## 22) lstat>=4.44 14 178.268600 27.52857 *
## 23) lstat< 4.44 7 305.594300 34.07143 *
## 3) rm>=6.8375 73 5959.989000 35.70411
## 6) rm< 7.443 49 2037.207000 31.28367
## 12) lstat>=9.76 8 440.575000 23.02500 *
## 13) lstat< 9.76 41 944.519000 32.89512
## 26) lstat>=5.495 18 296.116100 30.27222 *
## 27) lstat< 5.495 23 427.657400 34.94783
## 54) tax< 384 16 117.830000 33.52500 *
## 55) tax>=384 7 203.400000 38.20000 *
## 7) rm>=7.443 24 1010.470000 44.72917
## 14) tax>=270 12 687.242500 43.02500 *
## 15) tax< 270 12 253.526700 46.43333 *
rpart.plot(fit1,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
cp_table1 <- fit1$cptable
min_xerr1 <- min(cp_table1[, "xerror"])
min_row1 <- which.min(cp_table1[, "xerror"])
xstd_min1 <- cp_table1[min_row1, "xstd"]
# ambil pohon paling sederhana yang masih dalam 1 std dari minimum
best_row1 <- which(cp_table1[, "xerror"] <= (min_xerr1 + xstd_min1))[1]
best_cp1 <- cp_table1[best_row1, "CP"]
fit1_pruned <- prune(fit1, cp = best_cp1)
best_cp1
## [1] 0.01910921
fit1_pruned <- prune(fit1, cp = best_cp1)
rpart.plot(
fit1_pruned,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
## Evaluasi Model Sebelum Pruning (Tree besar: cp=0)
### 1) Prediksi pada data train & test
pred_train1 <- predict(fit1, newdata = train)
pred_test1 <- predict(fit1, newdata = test)
### 2) Fungsi metrik (tanpa paket tambahan)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
### 3) Hitung metrik
train1_rmse <- rmse(train$medv, pred_train1)
train1_mae <- mae(train$medv, pred_train1)
train1_r2 <- r2(train$medv, pred_train1)
test1_rmse <- rmse(test$medv, pred_test1)
test1_mae <- mae(test$medv, pred_test1)
test1_r2 <- r2(test$medv, pred_test1)
data.frame(
dataset = c("train", "test"),
RMSE = c(train1_rmse, test1_rmse),
MAE = c(train1_mae, test1_mae),
R2 = c(train1_r2, test1_r2)
)
## dataset RMSE MAE R2
## 1 train 3.612199 2.430827 0.8443828
## 2 test 3.989146 2.896804 0.8165922
## 4) Prediksi vs Aktual (Test)
plot(test$medv, pred_test1,
xlab = "Aktual (medv)", ylab = "Prediksi",
main = "Prediksi vs Aktual (Test) - Sebelum Pruning")
abline(0, 1)
## 5) Residual (Test)
res_test1 <- test$medv - pred_test1
hist(res_test1, breaks = 30,
main = "Distribusi Residual (Test) - Sebelum Pruning",
xlab = "Residual (Aktual - Prediksi)")
plot(pred_test1, res_test1,
xlab = "Prediksi", ylab = "Residual",
main = "Residual vs Prediksi (Test) - Sebelum Pruning")
abline(h = 0, lty = 2)
## Prediksi
pred_train1_pr <- predict(fit1_pruned, newdata = train)
pred_test1_pr <- predict(fit1_pruned, newdata = test)
## Metrik
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
## Hasil (pruned)
res1_pruned <- data.frame(
dataset = c("train", "test"),
RMSE = c(rmse(train$medv, pred_train1_pr), rmse(test$medv, pred_test1_pr)),
MAE = c(mae(train$medv, pred_train1_pr), mae(test$medv, pred_test1_pr)),
R2 = c(r2(train$medv, pred_train1_pr), r2(test$medv, pred_test1_pr))
)
res1_pruned
## dataset RMSE MAE R2
## 1 train 4.561107 3.096522 0.7518840
## 2 test 4.491834 3.326420 0.7674558
## Prediksi vs Aktual (Test) - Pruned
plot(test$medv, pred_test1_pr,
xlab = "Aktual (medv)", ylab = "Prediksi",
main = "Prediksi vs Aktual (Test) - Setelah Pruning")
abline(0, 1)
## Residual - Pruned (Test)
res_test1_pr <- test$medv - pred_test1_pr
hist(res_test1_pr, breaks = 30,
main = "Distribusi Residual (Test) - Setelah Pruning",
xlab = "Residual (Aktual - Prediksi)")
plot(pred_test1_pr, res_test1_pr,
xlab = "Prediksi", ylab = "Residual",
main = "Residual vs Prediksi (Test) - Setelah Pruning")
abline(h = 0, lty = 2)
fit2 <- rpart(
medv ~ lstat + rm + crim,
data = train,
method = "anova", # kunci untuk regresi
control = rpart.control(
minsplit = 20,
minbucket = 7,
maxdepth = 30,
cp = 0 # grow dulu
)
)
fit2
## n= 407
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 407 34125.580000 22.510570
## 2) rm< 6.8375 334 12681.220000 19.626950
## 4) lstat>=14.4 146 2607.915000 15.058220
## 8) crim>=7.006285 60 659.084000 11.860000
## 16) lstat>=19.73 40 382.927700 10.807500
## 32) lstat>=26.425 15 163.909300 9.506667 *
## 33) lstat< 26.425 25 178.406400 11.588000
## 66) lstat< 21.15 7 56.848570 10.214290 *
## 67) lstat>=21.15 18 103.211100 12.122220 *
## 17) lstat< 19.73 20 143.225500 13.965000
## 34) lstat< 17.4 11 81.485450 13.463640 *
## 35) lstat>=17.4 9 55.595560 14.577780 *
## 9) crim< 7.006285 86 906.940600 17.289530
## 18) crim>=0.614845 42 246.529800 15.702380
## 36) lstat>=18.885 14 54.474290 14.357140 *
## 37) lstat< 18.885 28 154.052500 16.375000
## 74) lstat>=15.965 20 121.857500 15.875000
## 148) crim< 3.11083 8 8.388750 14.612500 *
## 149) crim>=3.11083 12 92.216670 16.716670 *
## 75) lstat< 15.965 8 14.695000 17.625000 *
## 19) crim< 0.614845 44 453.619100 18.804550
## 38) lstat>=21.23 8 137.328800 16.012500 *
## 39) lstat< 21.23 36 240.067500 19.425000
## 78) crim>=0.147705 23 84.324350 18.726090
## 156) crim>=0.25557 9 8.628889 18.088890 *
## 157) crim< 0.25557 14 69.692140 19.135710 *
## 79) crim< 0.147705 13 124.630800 20.661540 *
## 5) lstat< 14.4 188 4659.132000 23.175000
## 10) lstat>=5.51 167 2966.036000 22.353290
## 20) crim< 7.24712 160 1380.684000 21.990000
## 40) rm< 6.062 65 404.937500 20.286150
## 80) rm>=6.0205 7 59.557140 18.342860 *
## 81) rm< 6.0205 58 315.755200 20.520690
## 162) rm< 5.8455 22 181.367700 19.868180
## 324) lstat>=12.065 11 95.781820 18.972730 *
## 325) lstat< 12.065 11 67.945450 20.763640 *
## 163) rm>=5.8455 36 119.296400 20.919440
## 326) crim< 0.06758 11 41.487270 19.854550 *
## 327) crim>=0.06758 25 59.846400 21.388000
## 654) lstat>=12.675 7 6.194286 20.471430 *
## 655) lstat< 12.675 18 45.484440 21.744440 *
## 41) rm>=6.062 95 657.934300 23.155790
## 82) lstat>=9.98 31 181.749700 21.396770
## 164) rm>=6.158 24 93.020000 20.950000
## 328) lstat>=11.655 11 33.725450 20.063640 *
## 329) lstat< 11.655 13 43.340000 21.700000 *
## 165) rm< 6.158 7 67.514290 22.928570 *
## 83) lstat< 9.98 64 333.806100 24.007810
## 166) crim< 0.052275 22 88.538640 22.522730
## 332) lstat>=8.195 10 48.021000 21.430000 *
## 333) lstat< 8.195 12 18.626670 23.433330 *
## 167) crim>=0.052275 42 171.331400 24.785710
## 334) rm< 6.611 32 85.160000 24.275000
## 668) lstat< 7.03 14 12.263570 23.821430 *
## 669) lstat>=7.03 18 67.776110 24.627780 *
## 335) rm>=6.611 10 51.116000 26.420000 *
## 21) crim>=7.24712 7 1081.557000 30.657140 *
## 11) lstat< 5.51 21 683.638100 29.709520
## 22) lstat>=4.44 14 178.268600 27.528570 *
## 23) lstat< 4.44 7 305.594300 34.071430 *
## 3) rm>=6.8375 73 5959.989000 35.704110
## 6) rm< 7.443 49 2037.207000 31.283670
## 12) lstat>=9.76 8 440.575000 23.025000 *
## 13) lstat< 9.76 41 944.519000 32.895120
## 26) lstat>=5.495 18 296.116100 30.272220 *
## 27) lstat< 5.495 23 427.657400 34.947830
## 54) crim< 0.048955 9 38.915560 33.077780 *
## 55) crim>=0.048955 14 337.035000 36.150000 *
## 7) rm>=7.443 24 1010.470000 44.729170
## 14) lstat>=4.54 7 692.917100 42.242860 *
## 15) lstat< 4.54 17 256.462400 45.752940 *
rpart.plot(fit2,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
cp_table2 <- fit2$cptable
min_xerr2 <- min(cp_table2[, "xerror"])
min_row2 <- which.min(cp_table2[, "xerror"])
xstd_min2 <- cp_table2[min_row2, "xstd"]
# ambil pohon paling sederhana yang masih dalam 1 std dari minimum
best_row2 <- which(cp_table2[, "xerror"] <= (min_xerr2 + xstd_min2))[1]
best_cp2 <- cp_table2[best_row2, "CP"]
fit2_pruned <- prune(fit2, cp = best_cp2)
best_cp2
## [1] 0.006468622
fit2_pruned <- prune(fit2, cp = best_cp2)
rpart.plot(
fit2_pruned,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
## Evaluasi Model Sebelum Pruning (Tree besar: cp=0)
### 1) Prediksi pada data train & test
pred_train2 <- predict(fit2, newdata = train)
pred_test2 <- predict(fit2, newdata = test)
### 2) Fungsi metrik (tanpa paket tambahan)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
### 3) Hitung metrik
train2_rmse <- rmse(train$medv, pred_train2)
train2_mae <- mae(train$medv, pred_train2)
train2_r2 <- r2(train$medv, pred_train2)
test2_rmse <- rmse(test$medv, pred_test2)
test2_mae <- mae(test$medv, pred_test2)
test2_r2 <- r2(test$medv, pred_test2)
data.frame(
dataset = c("train", "test"),
RMSE = c(train2_rmse, test2_rmse),
MAE = c(train2_mae, test2_mae),
R2 = c(train2_r2, test2_r2)
)
## dataset RMSE MAE R2
## 1 train 3.594078 2.390291 0.8459402
## 2 test 4.028534 2.668065 0.8129524
## 4) Prediksi vs Aktual (Test)
plot(test$medv, pred_test2,
xlab = "Aktual (medv)", ylab = "Prediksi",
main = "Prediksi vs Aktual (Test) - Sebelum Pruning")
abline(0, 1)
## 5) Residual (Test)
res_test2 <- test$medv - pred_test2
hist(res_test2, breaks = 30,
main = "Distribusi Residual (Test) - Sebelum Pruning",
xlab = "Residual (Aktual - Prediksi)")
plot(pred_test2, res_test2,
xlab = "Prediksi", ylab = "Residual",
main = "Residual vs Prediksi (Test) - Sebelum Pruning")
abline(h = 0, lty = 2)
## Prediksi
pred_train2_pr <- predict(fit2_pruned, newdata = train)
pred_test2_pr <- predict(fit2_pruned, newdata = test)
## Metrik
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
## Hasil (pruned)
res2_pruned <- data.frame(
dataset = c("train", "test"),
RMSE = c(rmse(train$medv, pred_train2_pr), rmse(test$medv, pred_test2_pr)),
MAE = c(mae(train$medv, pred_train2_pr), mae(test$medv, pred_test2_pr)),
R2 = c(r2(train$medv, pred_train2_pr), r2(test$medv, pred_test2_pr))
)
res2_pruned
## dataset RMSE MAE R2
## 1 train 4.084385 2.85765 0.8010392
## 2 test 4.604186 3.25753 0.7556773
## Prediksi vs Aktual (Test) - Pruned
plot(test$medv, pred_test2_pr,
xlab = "Aktual (medv)", ylab = "Prediksi",
main = "Prediksi vs Aktual (Test) - Setelah Pruning")
abline(0, 1)
## Residual - Pruned (Test)
res_test2_pr <- test$medv - pred_test2_pr
hist(res_test2_pr, breaks = 30,
main = "Distribusi Residual (Test) - Setelah Pruning",
xlab = "Residual (Aktual - Prediksi)")
plot(pred_test2_pr, res_test2_pr,
xlab = "Prediksi", ylab = "Residual",
main = "Residual vs Prediksi (Test) - Setelah Pruning")
abline(h = 0, lty = 2)
fit3 <- rpart(
medv ~ lstat + rm + dis,
data = train,
method = "anova", # kunci untuk regresi
control = rpart.control(
minsplit = 20,
minbucket = 7,
maxdepth = 30,
cp = 0 # grow dulu
)
)
fit3
## n= 407
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 407 34125.58000 22.510570
## 2) rm< 6.8375 334 12681.22000 19.626950
## 4) lstat>=14.4 146 2607.91500 15.058220
## 8) dis< 2.0037 68 911.63220 12.516180
## 16) lstat>=19.11 48 534.88980 11.402080
## 32) lstat>=29.605 12 158.12920 9.491667 *
## 33) lstat< 29.605 36 318.36560 12.038890
## 66) dis>=1.62265 17 102.42470 10.517650 *
## 67) dis< 1.62265 19 141.40000 13.400000 *
## 17) lstat< 19.11 20 174.17800 15.190000
## 34) lstat>=15.965 12 56.98667 14.566670 *
## 35) lstat< 15.965 8 105.53500 16.125000 *
## 9) dis>=2.0037 78 873.78870 17.274360
## 18) dis< 2.44945 30 247.98970 15.703330
## 36) rm>=6.077 18 59.38000 14.600000 *
## 37) rm< 6.077 12 133.82920 17.358330 *
## 19) dis>=2.44945 48 505.47810 18.256250
## 38) lstat>=16.215 30 363.70970 17.396670
## 76) rm< 5.9505 14 130.57430 16.442860 *
## 77) rm>=5.9505 16 209.25440 18.231250 *
## 39) lstat< 16.215 18 82.65778 19.688890 *
## 5) lstat< 14.4 188 4659.13200 23.175000
## 10) lstat>=5.51 167 2966.03600 22.353290
## 20) dis>=1.7548 160 1307.66400 21.983120
## 40) rm< 6.062 63 325.52000 20.233330
## 80) rm>=6.0205 7 59.55714 18.342860 *
## 81) rm< 6.0205 56 237.81840 20.469640
## 162) dis>=4.5006 26 61.32462 19.453850
## 324) lstat>=10.465 11 28.42182 18.572730 *
## 325) lstat< 10.465 15 18.10000 20.100000 *
## 163) dis< 4.5006 30 126.41500 21.350000
## 326) rm< 5.852 13 72.76308 20.476920 *
## 327) rm>=5.852 17 36.16471 22.017650 *
## 41) rm>=6.062 97 663.97280 23.119590
## 82) lstat>=9.98 33 181.74970 21.396970
## 164) rm>=6.158 26 93.39385 20.984620
## 328) lstat>=11.655 13 36.74769 20.269230 *
## 329) lstat< 11.655 13 43.34000 21.700000 *
## 165) rm< 6.158 7 67.51429 22.928570 *
## 83) lstat< 9.98 64 333.80610 24.007810
## 166) rm< 6.611 52 212.70060 23.536540
## 332) dis>=6.2284 15 65.25733 22.553330 *
## 333) dis< 6.2284 37 127.06430 23.935140
## 666) rm< 6.168 7 20.72857 22.514290 *
## 667) rm>=6.168 30 88.90667 24.266670
## 1334) dis< 3.7917 9 10.44222 23.355560 *
## 1335) dis>=3.7917 21 67.79143 24.657140
## 2670) dis>=4.75025 12 29.24667 24.166670 *
## 2671) dis< 4.75025 9 31.80889 25.311110 *
## 167) rm>=6.611 12 59.51000 26.050000 *
## 21) dis< 1.7548 7 1135.32900 30.814290 *
## 11) lstat< 5.51 21 683.63810 29.709520
## 22) lstat>=4.44 14 178.26860 27.528570 *
## 23) lstat< 4.44 7 305.59430 34.071430 *
## 3) rm>=6.8375 73 5959.98900 35.704110
## 6) rm< 7.443 49 2037.20700 31.283670
## 12) lstat>=9.76 8 440.57500 23.025000 *
## 13) lstat< 9.76 41 944.51900 32.895120
## 26) lstat>=5.495 18 296.11610 30.272220 *
## 27) lstat< 5.495 23 427.65740 34.947830
## 54) dis>=4.417 16 97.83750 33.887500 *
## 55) dis< 4.417 7 270.71430 37.371430 *
## 7) rm>=7.443 24 1010.47000 44.729170
## 14) lstat>=4.54 7 692.91710 42.242860 *
## 15) lstat< 4.54 17 256.46240 45.752940 *
rpart.plot(fit3,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
cp_table3 <- fit3$cptable
min_xerr3 <- min(cp_table3[, "xerror"])
min_row3 <- which.min(cp_table3[, "xerror"])
xstd_min3 <- cp_table3[min_row3, "xstd"]
# ambil pohon paling sederhana yang masih dalam 1 std dari minimum
best_row3 <- which(cp_table3[, "xerror"] <= (min_xerr3 + xstd_min3))[1]
best_cp3 <- cp_table3[best_row3, "CP"]
fit3_pruned <- prune(fit3, cp = best_cp3)
best_cp3
## [1] 0.009323552
fit3_pruned <- prune(fit3, cp = best_cp3)
rpart.plot(
fit2_pruned,
type = 2,
extra = 1,
fallen.leaves = TRUE,
tweak = 1.0
)
## Evaluasi Model Sebelum Pruning (Tree besar: cp=0)
### 1) Prediksi pada data train & test
pred_train3 <- predict(fit3, newdata = train)
pred_test3 <- predict(fit3, newdata = test)
### 2) Fungsi metrik (tanpa paket tambahan)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
### 3) Hitung metrik
train3_rmse <- rmse(train$medv, pred_train3)
train3_mae <- mae(train$medv, pred_train3)
train3_r2 <- r2(train$medv, pred_train3)
test3_rmse <- rmse(test$medv, pred_test3)
test3_mae <- mae(test$medv, pred_test3)
test3_r2 <- r2(test$medv, pred_test3)
data.frame(
dataset = c("train", "test"),
RMSE = c(train3_rmse, test3_rmse),
MAE = c(train3_mae, test3_mae),
R2 = c(train3_r2, test3_r2)
)
## dataset RMSE MAE R2
## 1 train 3.653811 2.490784 0.8407767
## 2 test 4.444349 3.070000 0.7723465
## 4) Prediksi vs Aktual (Test)
plot(test$medv, pred_test3,
xlab = "Aktual (medv)", ylab = "Prediksi",
main = "Prediksi vs Aktual (Test) - Sebelum Pruning")
abline(0, 1)
## 5) Residual (Test)
res_test3 <- test$medv - pred_test3
hist(res_test3, breaks = 30,
main = "Distribusi Residual (Test) - Sebelum Pruning",
xlab = "Residual (Aktual - Prediksi)")
plot(pred_test3, res_test3,
xlab = "Prediksi", ylab = "Residual",
main = "Residual vs Prediksi (Test) - Sebelum Pruning")
abline(h = 0, lty = 2)
## Prediksi
pred_train3_pr <- predict(fit3_pruned, newdata = train)
pred_test3_pr <- predict(fit3_pruned, newdata = test)
## Metrik
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2 <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
## Hasil (pruned)
res3_pruned <- data.frame(
dataset = c("train", "test"),
RMSE = c(rmse(train$medv, pred_train3_pr), rmse(test$medv, pred_test3_pr)),
MAE = c(mae(train$medv, pred_train3_pr), mae(test$medv, pred_test3_pr)),
R2 = c(r2(train$medv, pred_train3_pr), r2(test$medv, pred_test3_pr))
)
res3_pruned
## dataset RMSE MAE R2
## 1 train 4.237314 3.090222 0.7858611
## 2 test 4.870643 3.466103 0.7265797
## Prediksi vs Aktual (Test) - Pruned
plot(test$medv, pred_test3_pr,
xlab = "Aktual (medv)", ylab = "Prediksi",
main = "Prediksi vs Aktual (Test) - Setelah Pruning")
abline(0, 1)
## Residual - Pruned (Test)
res_test3_pr <- test$medv - pred_test3_pr
hist(res_test3_pr, breaks = 30,
main = "Distribusi Residual (Test) - Setelah Pruning",
xlab = "Residual (Aktual - Prediksi)")
plot(pred_test3_pr, res_test3_pr,
xlab = "Prediksi", ylab = "Residual",
main = "Residual vs Prediksi (Test) - Setelah Pruning")
abline(h = 0, lty = 2)
# KOMBINASI 1
res1_before <- data.frame(
Model = "Kombinasi 1",
Tahap = "Before Pruning",
dataset = c("train", "test"),
RMSE = c(train1_rmse, test1_rmse),
MAE = c(train1_mae, test1_mae),
R2 = c(train1_r2, test1_r2)
)
res1_after <- data.frame(
Model = "Kombinasi 1",
Tahap = "After Pruning",
dataset = c("train", "test"),
RMSE = c(rmse(train$medv, pred_train1_pr),
rmse(test$medv, pred_test1_pr)),
MAE = c(mae(train$medv, pred_train1_pr),
mae(test$medv, pred_test1_pr)),
R2 = c(r2(train$medv, pred_train1_pr),
r2(test$medv, pred_test1_pr))
)
# KOMBINASI 2
res2_before <- data.frame(
Model = "Kombinasi 2",
Tahap = "Before Pruning",
dataset = c("train", "test"),
RMSE = c(train2_rmse, test2_rmse),
MAE = c(train2_mae, test2_mae),
R2 = c(train2_r2, test2_r2)
)
res2_after <- data.frame(
Model = "Kombinasi 2",
Tahap = "After Pruning",
dataset = c("train", "test"),
RMSE = c(rmse(train$medv, pred_train2_pr),
rmse(test$medv, pred_test2_pr)),
MAE = c(mae(train$medv, pred_train2_pr),
mae(test$medv, pred_test2_pr)),
R2 = c(r2(train$medv, pred_train2_pr),
r2(test$medv, pred_test2_pr))
)
# KOMBINASI 3
res3_before <- data.frame(
Model = "Kombinasi 3",
Tahap = "Before Pruning",
dataset = c("train", "test"),
RMSE = c(train3_rmse, test3_rmse),
MAE = c(train3_mae, test3_mae),
R2 = c(train3_r2, test3_r2)
)
res3_after <- data.frame(
Model = "Kombinasi 3",
Tahap = "After Pruning",
dataset = c("train", "test"),
RMSE = c(rmse(train$medv, pred_train3_pr),
rmse(test$medv, pred_test3_pr)),
MAE = c(mae(train$medv, pred_train3_pr),
mae(test$medv, pred_test3_pr)),
R2 = c(r2(train$medv, pred_train3_pr),
r2(test$medv, pred_test3_pr))
)
# GABUNGKAN SEMUA
res_all_models <- rbind(
res1_before, res1_after,
res2_before, res2_after,
res3_before, res3_after
)
res_all_models
## Model Tahap dataset RMSE MAE R2
## 1 Kombinasi 1 Before Pruning train 3.612199 2.430827 0.8443828
## 2 Kombinasi 1 Before Pruning test 3.989146 2.896804 0.8165922
## 3 Kombinasi 1 After Pruning train 4.561107 3.096522 0.7518840
## 4 Kombinasi 1 After Pruning test 4.491834 3.326420 0.7674558
## 5 Kombinasi 2 Before Pruning train 3.594078 2.390291 0.8459402
## 6 Kombinasi 2 Before Pruning test 4.028534 2.668065 0.8129524
## 7 Kombinasi 2 After Pruning train 4.084385 2.857650 0.8010392
## 8 Kombinasi 2 After Pruning test 4.604186 3.257530 0.7556773
## 9 Kombinasi 3 Before Pruning train 3.653811 2.490784 0.8407767
## 10 Kombinasi 3 Before Pruning test 4.444349 3.070000 0.7723465
## 11 Kombinasi 3 After Pruning train 4.237314 3.090222 0.7858611
## 12 Kombinasi 3 After Pruning test 4.870643 3.466103 0.7265797
Model kombinasi 1 sebelum pruning memiliki performa yang baik pada data train (RMSE 3.61; MAE 2.43; R² 0.84), namun menurun pada data test (RMSE 3.99; MAE 2.90; R² 0.82). Perbedaan performa train-test tersebut menjadi indikasi adanya overfitting karena lebih menyesuaikan data train dibandingkan kemampuan generalisasi.
Model kombinasi 1 sesudah pruning memiliki performa yang menurun tetapi lebih stabil dengan data train (RMSE 4.27; MAE 2.91; R² 0.78) dan data test (RMSE 4.25; MAE 3.08; R² 0.79). Kondisi ini menunjukkan model cukup sederhana, namun memiliki kemampuan generalisasi yang lebih baik.
Model kombinasi 2 sebelum pruning memiliki performa yang sangat baik pada data train (RMSE 3.59; MAE 2.39; R² 0.85), namun menurun pada data test (RMSE 4.03; MAE 2.67; R² 0.81). Perbedaan performa train-test tersebut menjadi indikasi adanya overfitting karena lebih menyesuaikan data train dibandingkan kemampuan generalisasi.
Model kombinasi 2 sesudah pruning memiliki performa yang kurang baik meskipun cukup stabil dengan nilai data train (RMSE 4.51; MAE 3.05; R² 0.76) dan data test (RMSE 4.48; MAE 3.27; R² 0.77). Kondisi ini menunjukkan pruning yang dilakukan terlalu agresif sehingga model menjadi terlalu sederhana dan kurang baik untuk prediksi.
Model kombinasi 3 sebelum pruning memiliki performa yang baik pada data train (RMSE 3.65; MAE 2.49; R² 0.84), namun menurun jauh pada data test (RMSE 4.44; MAE 3.07; R² 0.77). Perbedaan performa train-test tersebut menjadi indikasi adanya overfitting karena lebih menyesuaikan data train dibandingkan kemampuan generalisasi.
Model kombinasi 3 sesudah pruning memiliki performa yang kurang baik meskipun cukup stabil dengan nilai data train (RMSE 4.39; MAE 3.09; R² 0.77) dan data test (RMSE 4.42; MAE 3.24; R² 0.77). Kondisi ini menunjukkan pruning yang dilakukan terlalu agresif sehingga model menjadi terlalu sederhana dan kurang baik untuk prediksi.
Secara keseluruhan, model kombinasi 1 sebelum pruning menjadi model terbaik karena memiliki nilai RMSE terkecil pada data test (3.99) serta nilai R² tertinggi pada data test (0.82).