library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(recipes)
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(rsample)
library(yardstick)
##
## Attaching package: 'yardstick'
##
## The following object is masked from 'package:readr':
##
## spec
library(parsnip)
library(broom)
library(tictoc)
Skenario Pembangkitan Data Membuat data simulasi untuk memodelkan variabel fisiologis (misal triceps).
x1 (age) → dua kelompok (child dan teen/adult) dengan distribusi acak. y (respon) → berat badan
set.seed(123)
n <- 150
# Variabel utama
p_child <- 0.55
n_child <- round(n * p_child)
x1_child <- runif(n_child, 0, 12)
x1_teenadult <- runif(n - n_child, 12, 52)
x1 <- sample(c(x1_child, x1_teenadult)) # sebaran tidak merata
# Rata-rata triceps bergantung pada x1 (nonlinear) + pengaruh x2 & x3
mu <- 5.5 + 0.06 * x1 + 1.5 * pmax(x1 - 15, 0) / 20
# Heteroskedastisitas: makin tua makin bervariasi
sd_noise <- 1.0 + 0.10 * x1
# Bangkitkan respon (triceps-like)
y <- rnorm(n, mean = mu, sd = sd_noise)
y <- pmax(y, 3)
# Gabungkan ke data frame
dat <- data.frame(y=y,x1 = x1)
# Visualisasi
ggplot(dat,aes(x=x1, y=y)) +
geom_point(color="#03A9F4") +
theme_bw()
Berdasarkan pola hubungan yang terlihat pada scatterplot, kita akan mencoba untuk mencari model linear yang dapat merepresentasikan pola hubungan non linear dengan baik. Model-model yang akan kita bahas antara lain:
Regresi Polinomial Regresi Fungsi Tangga
Kode itu membuat dataset baru dengan menambahkan fitur polinomial (x1 dan x1²) menggunakan pipeline recipes; prep() menyiapkan transformasi, lalu bake() menerapkannya ke data sehingga siap dipakai untuk modeling.
dat_poly2 <- recipe(y~.,data = dat) %>%
step_poly(x1,degree = 2,options = list(raw=TRUE) ) %>%
prep() %>%
bake(new_data=NULL)
dat_poly2
## # A tibble: 150 × 3
## y x1_poly_1 x1_poly_2
## <dbl> <dbl> <dbl>
## 1 11.6 38.3 1465.
## 2 5.10 4.22 17.8
## 3 3.88 11.9 142.
## 4 9.45 14.4 208.
## 5 8.88 48.6 2360.
## 6 5.10 32.8 1079.
## 7 6.96 25.3 641.
## 8 9.45 45.7 2089.
## 9 8.54 11.3 127.
## 10 5.95 5.28 27.9
## # ℹ 140 more rows
mod_poly2 = lm(y ~ .,
data=dat_poly2)
Fungsi tidy() (dari paket broom) mengubah output lm() menjadi tabel koefisien yang mudah dibaca dan diolah lanjut (misalnya untuk laporan atau visualisasi).
tidy(mod_poly2)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.52 0.531 10.4 2.62e-19
## 2 x1_poly_1 0.0711 0.0624 1.14 2.56e- 1
## 3 x1_poly_2 0.000955 0.00126 0.759 4.49e- 1
augment(mod_poly2) (dari paket broom) membuat tabel berisi data asli + kolom tambahan seperti .fitted (nilai prediksi dari model) dan .resid (selisih antara nilai aktual dan prediksi).
mod_poly2_fitted <- augment(mod_poly2)
mod_poly2_fitted
## # A tibble: 150 × 9
## y x1_poly_1 x1_poly_2 .fitted .resid .hat .sigma .cooksd .std.resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 11.6 38.3 1465. 9.64 2.01 0.0191 3.00 2.97e-3 0.677
## 2 5.10 4.22 17.8 5.84 -0.738 0.0143 3.00 2.98e-4 -0.249
## 3 3.88 11.9 142. 6.51 -2.62 0.0107 3.00 2.79e-3 -0.881
## 4 9.45 14.4 208. 6.75 2.70 0.0130 2.99 3.64e-3 0.909
## 5 8.88 48.6 2360. 11.2 -2.34 0.0518 3.00 1.18e-2 -0.805
## 6 5.10 32.8 1079. 8.89 -3.79 0.0198 2.99 1.10e-2 -1.28
## 7 6.96 25.3 641. 7.93 -0.976 0.0215 3.00 7.95e-4 -0.330
## 8 9.45 45.7 2089. 10.8 -1.32 0.0349 3.00 2.43e-3 -0.449
## 9 8.54 11.3 127. 6.45 2.09 0.0102 3.00 1.70e-3 0.703
## 10 5.95 5.28 27.9 5.92 0.0215 0.0121 3.00 2.14e-7 0.00724
## # ℹ 140 more rows
# Visualisasi
ggplot(mod_poly2_fitted,aes(x=x1_poly_1, y=y)) +
geom_point(color="#03A9F4") +
geom_line(aes(x=x1_poly_1,y=.fitted),color="#E54D03",
linewidth=1.5) +
annotate("text",x = 5,y=33,
label=str_c("R-squared = ",100*round(glance(mod_poly2)$adj.r.squared,3)))+
theme_bw()
persiapkan datanya terlebih dahulu, untuk menambahkan variabel berderajat 3
# Poly 3
dat_poly3 <- recipe(y~.,data = dat) %>%
step_poly(x1,degree = 3,options = list(raw=TRUE) ) %>%
prep() %>%
bake(new_data=NULL)
dat_poly3
## # A tibble: 150 × 4
## y x1_poly_1 x1_poly_2 x1_poly_3
## <dbl> <dbl> <dbl> <dbl>
## 1 11.6 38.3 1465. 56051.
## 2 5.10 4.22 17.8 75.2
## 3 3.88 11.9 142. 1698.
## 4 9.45 14.4 208. 3004.
## 5 8.88 48.6 2360. 114632.
## 6 5.10 32.8 1079. 35434.
## 7 6.96 25.3 641. 16219.
## 8 9.45 45.7 2089. 95501.
## 9 8.54 11.3 127. 1437.
## 10 5.95 5.28 27.9 147.
## # ℹ 140 more rows
# Pemodelan
mod_poly3 = lm(y ~ .,
data=dat_poly3)
mod_poly3_fitted <- augment(mod_poly3)
mod_poly3_fitted
## # A tibble: 150 × 10
## y x1_poly_1 x1_poly_2 x1_poly_3 .fitted .resid .hat .sigma .cooksd
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 11.6 38.3 1465. 56051. 9.93 1.72 0.0349 3.00 0.00307
## 2 5.10 4.22 17.8 75.2 5.87 -0.768 0.0144 3.01 0.000244
## 3 3.88 11.9 142. 1698. 6.30 -2.42 0.0186 3.00 0.00314
## 4 9.45 14.4 208. 3004. 6.55 2.91 0.0206 3.00 0.00505
## 5 8.88 48.6 2360. 114632. 11.1 -2.17 0.0572 3.00 0.00848
## 6 5.10 32.8 1079. 35434. 9.15 -4.04 0.0321 2.99 0.0156
## 7 6.96 25.3 641. 16219. 7.99 -1.03 0.0221 3.01 0.000687
## 8 9.45 45.7 2089. 95501. 10.8 -1.37 0.0353 3.00 0.00197
## 9 8.54 11.3 127. 1437. 6.25 2.29 0.0178 3.00 0.00270
## 10 5.95 5.28 27.9 147. 5.90 0.0508 0.0123 3.01 0.000000905
## # ℹ 140 more rows
## # ℹ 1 more variable: .std.resid <dbl>
# Visualisasi
ggplot(mod_poly3_fitted,aes(x=x1_poly_1, y=y)) +
geom_point(color="#03A9F4") +
geom_line(aes(x=x1_poly_1,y=.fitted),color="#E54D03",
linewidth=1.5) +
annotate("text",x = 5,y=33,
label=str_c("R-squared = ",100*round(glance(mod_poly3)$adj.r.squared,3)))+
theme_bw()
mod_poly_n <- function(data,d=2){
dat_poly <- recipe(y~.,data = dat) %>%
step_poly(x1,degree = d,options = list(raw=TRUE) ) %>%
prep()
mod_poly = lm(y ~ .,data= bake(dat_poly,new_data=NULL) )
output <- list("model"=mod_poly,"recipe_result"=dat_poly)
return(output)
}
Isi dari fungsi mod_poly_n bisa digunakan untuk sembarang data hanya dengan mengganti syntax triceps~.,data = triceps dan step_poly(age,…) dengan data dan variabel respon yang diinginkan.
mod_poly_n(data = dat,d = 2)
## $model
##
## Call:
## lm(formula = y ~ ., data = bake(dat_poly, new_data = NULL))
##
## Coefficients:
## (Intercept) x1_poly_1 x1_poly_2
## 5.5227237 0.0711039 0.0009547
##
##
## $recipe_result
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 1
##
## ── Training information
## Training data contained 150 data points and no incomplete rows.
##
## ── Operations
## • Polynomial expansion on: x1 | Trained
# Bangkitkan Data
set.seed(2045)
cross_val <- vfold_cv(dat,v=10)
cross_val
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [135/15]> Fold01
## 2 <split [135/15]> Fold02
## 3 <split [135/15]> Fold03
## 4 <split [135/15]> Fold04
## 5 <split [135/15]> Fold05
## 6 <split [135/15]> Fold06
## 7 <split [135/15]> Fold07
## 8 <split [135/15]> Fold08
## 9 <split [135/15]> Fold09
## 10 <split [135/15]> Fold10
Untuk melakukan Cross Validation kita akan menggunakan fungsi mod_poly_n_train_test dibawah ini. Fungsi mod_poly_n_train_test() adalah wrapper function untuk workflow modeling: ambil data train–test → fit model polynomial → transform test → prediksi → kembalikan hasil perbandingan prediksi & aktual
# Lakukan Cross Validation
mod_poly_n_train_test <- function(split,derajat=2,y){
train_dt <- training(split)
test_dt <- testing(split)
model <- mod_poly_n(data = train_dt,d = derajat)
test_dt <- bake(model$recipe_result,new_data=test_dt)
pred_test_dt <- predict(model$model,newdata=test_dt)
response <- test_dt %>% pull(y)
result <- data.frame(degree=derajat,estimate=pred_test_dt,truth=response)
return(result)
}
Kemudian, fungsi mod_poly_n_train_test akan kita coba jalankan
mod_poly_n_train_test(cross_val$splits[[1]],derajat = 3,y="y")
## degree estimate truth
## 1 3 6.546075 9.451542
## 2 3 6.125392 5.266654
## 3 3 6.345608 4.020051
## 4 3 6.031335 8.031354
## 5 3 6.200623 6.268357
## 6 3 6.691449 5.021718
## 7 3 5.988831 5.631993
## 8 3 5.892825 5.591014
## 9 3 8.467500 11.459906
## 10 3 11.232018 12.341180
## 11 3 7.334609 10.708091
## 12 3 8.460018 5.461137
## 13 3 6.110999 4.238305
## 14 3 5.865963 5.937156
## 15 3 6.127333 5.390345
eval_metric_poly() melakukan cross-validation untuk beberapa derajat polynomial, menghitung metrik performa tiap model (misalnya RMSE), lalu merata-ratakannya per derajat untuk menentukan model terbaik.
# Menentukan model terbaik dengan eval_poly()
eval_metric_poly <- function(cross_val,metric,list_degree,y){
grid1 <- expand_grid(splits=cross_val$splits,list_degree)
cv_pred <- map2(.x=grid1$splits,.y=grid1$list_degree,mod_poly_n_train_test,y = y)
metric_custom <- function(x,truth,estimate){
cbind("degree"=x$degree[1],metric(x,truth,estimate))
}
result <- map(cv_pred,metric_custom,truth = truth,estimate = estimate) %>%
list_rbind() %>%
select(-.estimator) %>%
group_by(degree,.metric) %>%
summarise(mean_estimate=mean(.estimate),
num_folds = n(),
std_error = sd(.estimate) / sqrt(num_folds) )
return(result)
}
# MSE
mse <- function(data,truth,estimate){
mse <- mean((data$truth - data$estimate) ^ 2)
data.frame(.metric="mse",
.estimator="standard",
.estimate=mse)
}
Berikut kita jalankan fungsi eval_metric_poly untuk mendapatkan evaluasi model dari beberapa derajat polinomial. Evaluasi model dilakukan dengan metrik MSE. Fungsi tic dan toc merupakan fungsi untuk mengukur waktu running.
list_degree <- 2:10
list_degree
## [1] 2 3 4 5 6 7 8 9 10
multi_metric <- mse
tic()
mod_poly_cv_metric <- eval_metric_poly(cross_val=cross_val,
metric=multi_metric,
list_degree = list_degree,
y="y")
## `summarise()` has grouped output by 'degree'. You can override using the
## `.groups` argument.
toc()
## 3.86 sec elapsed
mod_poly_cv_metric
## # A tibble: 9 × 5
## # Groups: degree [9]
## degree .metric mean_estimate num_folds std_error
## <int> <chr> <dbl> <int> <dbl>
## 1 2 mse 8.78 10 1.19
## 2 3 mse 8.74 10 1.19
## 3 4 mse 8.69 10 1.12
## 4 5 mse 8.63 10 1.01
## 5 6 mse 8.62 10 1.00
## 6 7 mse 8.53 10 1.04
## 7 8 mse 8.53 10 1.03
## 8 9 mse 8.43 10 0.932
## 9 10 mse 8.00 10 0.783
Berdsarakan tabel diatas dapat disimpulkan bahwa degree=10 merupakan derajat polinomial terbaik berdasarkan metric MSE. Sehingga model terbaiknya adalah Regresi Polinomial dengan derajat 10. Namun, jika kita ingin mendapatkan model yang lebih sederhana kita dapat menggunakan one standard error rule dengan grafik dibawah ini.
# Visualisasi
ggplot(mod_poly_cv_metric,aes(degree,mean_estimate)) +
geom_point()+
geom_errorbar(aes(ymin = mean_estimate-std_error, ymax = mean_estimate+std_error))+
ylab("MSE") +
scale_x_continuous(n.breaks = nrow(mod_poly_cv_metric)) +
theme_bw()
Berdasarkan one standard error rule, maka model terbaik adalah degree=3 yang berarti model terbaik adalah Regresi Polinomial dengan derajat 3.
Mendefinisikan recipe (recipe(y~., data = dat)) → Membuat “resep” pemrosesan data, di mana y adalah variabel target dan semua kolom lainnya (.) menjadi prediktor.
Langkah step_discretize(x1, num_breaks = 5, options = list(prefix = NULL)) → Mengubah variabel kontinu x1 menjadi variabel kategorikal (diskrit) dengan 5 interval (bin) berdasarkan rentang nilainya. Jadi, x1 dibagi menjadi 5 “tangga” atau kelompok nilai.
prep() → Menjalankan resep untuk menghitung batas antar-bin (cut points) berdasarkan data dat.
bake(new_data = NULL) → Menerapkan hasil “resep” ke data asli, menghasilkan dataset baru (dat_tangga5) di mana x1 sudah berubah menjadi versi diskrit.
# Num Breaks = 5
dat_tangga5 <- recipe(y~.,data = dat) %>%
step_discretize(x1,num_breaks = 5,
options = list(prefix = NULL)) %>%
prep() %>%
bake(new_data=NULL)
dat_tangga5
## # A tibble: 150 × 2
## x1 y
## <fct> <dbl>
## 1 (34.3, Inf] 11.6
## 2 [-Inf,4.54] 5.10
## 3 (8.83,17.7] 3.88
## 4 (8.83,17.7] 9.45
## 5 (34.3, Inf] 8.88
## 6 (17.7,34.3] 5.10
## 7 (17.7,34.3] 6.96
## 8 (34.3, Inf] 9.45
## 9 (8.83,17.7] 8.54
## 10 (4.54,8.83] 5.95
## # ℹ 140 more rows
mod_tangga5 = lm(y ~ .,
data=dat_tangga5)
tidy(mod_tangga5)
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.16 0.558 11.1 5.45e-21
## 2 x1(4.54,8.83] -0.234 0.789 -0.297 7.67e- 1
## 3 x1(8.83,17.7] -0.378 0.789 -0.480 6.32e- 1
## 4 x1(17.7,34.3] 2.49 0.789 3.16 1.94e- 3
## 5 x1(34.3, Inf] 3.97 0.789 5.04 1.37e- 6
Karena model regresi tangga (piecewise constant) menggunakan dummy variabel (kategorikal). Artinya:
Dari 5 bin hasil step_discretize(),
R hanya membuat 4 kolom dummy di model,
Karena 1 bin pertama dijadikan baseline (acuan).
awal tadi ada fungsi step_discretize(x1, num_breaks = 5), yang mengubah variabel jadi bin-bin, akibatnya, setelah augment(mod_tangga5), kita tidak lagi punya nilai asli x1 (yang numerik).
cbind() di sini digunakan untuk menggabungkan hasil augment (prediksi, residual) dengan nilai asli x1, supaya kamu bisa menganalisis atau memvisualisasikan model tangga terhadap data aslinya.
mod_tangga5_fitted <- cbind(augment(mod_tangga5),"x1_init"=dat$x1)
mod_tangga5_fitted
## y x1 .fitted .resid .hat .sigma
## 1 11.648607 (34.3, Inf] 10.138827 1.50977995 0.03333333 3.062052
## 2 5.101427 [-Inf,4.54] 6.164829 -1.06340257 0.03333333 3.063399
## 3 3.883868 (8.83,17.7] 5.786473 -1.90260461 0.03333333 3.060479
## 4 9.451542 (8.83,17.7] 5.786473 3.66506899 0.03333333 3.048941
## 5 8.884801 (34.3, Inf] 10.138827 -1.25402627 0.03333333 3.062881
## 6 5.100806 (17.7,34.3] 8.654198 -3.55339188 0.03333333 3.049890
## 7 6.957874 (17.7,34.3] 8.654198 -1.69632336 0.03333333 3.061351
## 8 9.447288 (34.3, Inf] 10.138827 -0.69153924 0.03333333 3.064164
## 9 8.539669 (8.83,17.7] 5.786473 2.75319613 0.03333333 3.055828
## 10 5.946140 (4.54,8.83] 5.930910 0.01522988 0.03333333 3.064725
## 11 7.716147 (8.83,17.7] 5.786473 1.92967440 0.03333333 3.060358
## 12 5.157525 (8.83,17.7] 5.786473 -0.62894836 0.03333333 3.064261
## 13 6.596293 (8.83,17.7] 5.786473 0.80982011 0.03333333 3.063956
## 14 5.469197 (8.83,17.7] 5.786473 -0.31727573 0.03333333 3.064607
## 15 6.062134 (4.54,8.83] 5.930910 0.13122340 0.03333333 3.064705
## 16 4.436167 (4.54,8.83] 5.930910 -1.49474308 0.03333333 3.062105
## 17 3.176721 (17.7,34.3] 8.654198 -5.47747673 0.03333333 3.029357
## 18 8.488874 [-Inf,4.54] 6.164829 2.32404478 0.03333333 3.058388
## 19 6.276347 [-Inf,4.54] 6.164829 0.11151807 0.03333333 3.064710
## 20 3.884567 (4.54,8.83] 5.930910 -2.04634337 0.03333333 3.059813
## 21 4.883745 (4.54,8.83] 5.930910 -1.04716552 0.03333333 3.063439
## 22 3.872000 (34.3, Inf] 10.138827 -6.26682728 0.03333333 3.018345
## 23 8.392461 [-Inf,4.54] 6.164829 2.22763161 0.03333333 3.058903
## 24 10.397693 (17.7,34.3] 8.654198 1.74349545 0.03333333 3.061160
## 25 9.470339 (34.3, Inf] 10.138827 -0.66848763 0.03333333 3.064201
## 26 12.286498 (34.3, Inf] 10.138827 2.14767150 0.03333333 3.059314
## 27 5.266654 (8.83,17.7] 5.786473 -0.51981908 0.03333333 3.064408
## 28 5.066459 [-Inf,4.54] 6.164829 -1.09836993 0.03333333 3.063311
## 29 4.658975 [-Inf,4.54] 6.164829 -1.50585455 0.03333333 3.062066
## 30 4.906333 [-Inf,4.54] 6.164829 -1.25849639 0.03333333 3.062868
## 31 18.415276 (34.3, Inf] 10.138827 8.27644949 0.03333333 2.983361
## 32 6.675506 (17.7,34.3] 8.654198 -1.97869231 0.03333333 3.060133
## 33 11.467101 (34.3, Inf] 10.138827 1.32827434 0.03333333 3.062656
## 34 9.332378 (17.7,34.3] 8.654198 0.67818060 0.03333333 3.064186
## 35 17.931579 (34.3, Inf] 10.138827 7.79275182 0.03333333 2.992705
## 36 7.833123 (34.3, Inf] 10.138827 -2.30570421 0.03333333 3.058488
## 37 4.020051 (8.83,17.7] 5.786473 -1.76642246 0.03333333 3.061066
## 38 17.176263 (34.3, Inf] 10.138827 7.03743606 0.03333333 3.006119
## 39 5.240769 (8.83,17.7] 5.786473 -0.54570394 0.03333333 3.064376
## 40 6.797823 (34.3, Inf] 10.138827 -3.34100360 0.03333333 3.051614
## 41 3.645220 (8.83,17.7] 5.786473 -2.14125279 0.03333333 3.059347
## 42 3.258330 (17.7,34.3] 8.654198 -5.39586750 0.03333333 3.030409
## 43 4.940949 (4.54,8.83] 5.930910 -0.98996144 0.03333333 3.063576
## 44 6.764298 (4.54,8.83] 5.930910 0.83338751 0.03333333 3.063911
## 45 8.031354 (4.54,8.83] 5.930910 2.10044407 0.03333333 3.059550
## 46 6.624928 [-Inf,4.54] 6.164829 0.46009825 0.03333333 3.064477
## 47 5.253913 (4.54,8.83] 5.930910 -0.67699770 0.03333333 3.064188
## 48 6.268357 (8.83,17.7] 5.786473 0.48188421 0.03333333 3.064453
## 49 6.479705 (34.3, Inf] 10.138827 -3.65912167 0.03333333 3.048992
## 50 4.648192 (8.83,17.7] 5.786473 -1.13828113 0.03333333 3.063206
## 51 13.839264 (34.3, Inf] 10.138827 3.70043742 0.03333333 3.048634
## 52 4.039348 (8.83,17.7] 5.786473 -1.74712493 0.03333333 3.061145
## 53 7.880922 [-Inf,4.54] 6.164829 1.71609295 0.03333333 3.061271
## 54 6.314603 (8.83,17.7] 5.786473 0.52813013 0.03333333 3.064398
## 55 6.152940 (4.54,8.83] 5.930910 0.22202919 0.03333333 3.064667
## 56 6.057920 (34.3, Inf] 10.138827 -4.08090701 0.03333333 3.045144
## 57 5.021718 (8.83,17.7] 5.786473 -0.76475487 0.03333333 3.064039
## 58 3.138689 (17.7,34.3] 8.654198 -5.51550873 0.03333333 3.028861
## 59 5.631993 (4.54,8.83] 5.930910 -0.29891720 0.03333333 3.064620
## 60 6.369999 [-Inf,4.54] 6.164829 0.20516946 0.03333333 3.064676
## 61 11.974368 (34.3, Inf] 10.138827 1.83554107 0.03333333 3.060774
## 62 5.659794 (34.3, Inf] 10.138827 -4.47903273 0.03333333 3.041121
## 63 4.676873 [-Inf,4.54] 6.164829 -1.48795666 0.03333333 3.062129
## 64 5.065006 (4.54,8.83] 5.930910 -0.86590399 0.03333333 3.063846
## 65 15.539523 (17.7,34.3] 8.654198 6.88532549 0.03333333 3.008649
## 66 3.917354 (4.54,8.83] 5.930910 -2.01355616 0.03333333 3.059969
## 67 7.099060 (17.7,34.3] 8.654198 -1.55513755 0.03333333 3.061889
## 68 12.571977 (17.7,34.3] 8.654198 3.91777889 0.03333333 3.046682
## 69 5.672460 (4.54,8.83] 5.930910 -0.25845063 0.03333333 3.064647
## 70 3.000000 (34.3, Inf] 10.138827 -7.13882696 0.03333333 3.004401
## 71 4.792371 (4.54,8.83] 5.930910 -1.13853920 0.03333333 3.063205
## 72 6.673345 (4.54,8.83] 5.930910 0.74243442 0.03333333 3.064079
## 73 6.126403 (17.7,34.3] 8.654198 -2.52779434 0.03333333 3.057227
## 74 4.972586 (8.83,17.7] 5.786473 -0.81388683 0.03333333 3.063948
## 75 5.360472 (4.54,8.83] 5.930910 -0.57043819 0.03333333 3.064344
## 76 5.591014 [-Inf,4.54] 6.164829 -0.57381504 0.03333333 3.064339
## 77 21.067773 (34.3, Inf] 10.138827 10.92894641 0.03333333 2.921383
## 78 5.647085 (4.54,8.83] 5.930910 -0.28382495 0.03333333 3.064631
## 79 15.379259 (34.3, Inf] 10.138827 5.24043216 0.03333333 3.032368
## 80 7.076722 (4.54,8.83] 5.930910 1.14581139 0.03333333 3.063186
## 81 5.799881 (4.54,8.83] 5.930910 -0.13102916 0.03333333 3.064705
## 82 3.000000 (34.3, Inf] 10.138827 -7.13882696 0.03333333 3.004401
## 83 4.990955 [-Inf,4.54] 6.164829 -1.17387464 0.03333333 3.063110
## 84 5.048344 [-Inf,4.54] 6.164829 -1.11648579 0.03333333 3.063264
## 85 6.065957 (4.54,8.83] 5.930910 0.13504619 0.03333333 3.064704
## 86 8.621938 (8.83,17.7] 5.786473 2.83546473 0.03333333 3.055287
## 87 8.638851 [-Inf,4.54] 6.164829 2.47402205 0.03333333 3.057543
## 88 8.793746 (4.54,8.83] 5.930910 2.86283609 0.03333333 3.055104
## 89 5.598625 (4.54,8.83] 5.930910 -0.33228521 0.03333333 3.064596
## 90 3.000000 (34.3, Inf] 10.138827 -7.13882696 0.03333333 3.004401
## 91 5.277418 (4.54,8.83] 5.930910 -0.65349235 0.03333333 3.064224
## 92 7.522015 (17.7,34.3] 8.654198 -1.13218309 0.03333333 3.063222
## 93 11.459906 (17.7,34.3] 8.654198 2.80570840 0.03333333 3.055485
## 94 7.611494 (4.54,8.83] 5.930910 1.68058376 0.03333333 3.061413
## 95 6.761421 [-Inf,4.54] 6.164829 0.59659135 0.03333333 3.064308
## 96 3.384708 (8.83,17.7] 5.786473 -2.40176511 0.03333333 3.057957
## 97 10.683042 (17.7,34.3] 8.654198 2.02884399 0.03333333 3.059897
## 98 5.645174 (17.7,34.3] 8.654198 -3.00902347 0.03333333 3.054095
## 99 8.216825 (17.7,34.3] 8.654198 -0.43737331 0.03333333 3.064501
## 100 11.559355 (34.3, Inf] 10.138827 1.42052774 0.03333333 3.062359
## 101 6.065286 [-Inf,4.54] 6.164829 -0.09954371 0.03333333 3.064713
## 102 5.631719 [-Inf,4.54] 6.164829 -0.53311007 0.03333333 3.064392
## 103 3.000000 (8.83,17.7] 5.786473 -2.78647307 0.03333333 3.055611
## 104 11.884307 (17.7,34.3] 8.654198 3.23010962 0.03333333 3.052472
## 105 13.683904 (34.3, Inf] 10.138827 3.54507743 0.03333333 3.049960
## 106 5.252626 [-Inf,4.54] 6.164829 -0.91220291 0.03333333 3.063750
## 107 6.087006 (4.54,8.83] 5.930910 0.15609574 0.03333333 3.064696
## 108 8.116559 (17.7,34.3] 8.654198 -0.53763924 0.03333333 3.064386
## 109 5.991349 [-Inf,4.54] 6.164829 -0.17348073 0.03333333 3.064690
## 110 7.802580 [-Inf,4.54] 6.164829 1.63775050 0.03333333 3.061580
## 111 6.178516 (17.7,34.3] 8.654198 -2.47568190 0.03333333 3.057533
## 112 12.341180 (34.3, Inf] 10.138827 2.20235329 0.03333333 3.059035
## 113 9.882012 (8.83,17.7] 5.786473 4.09553914 0.03333333 3.045003
## 114 16.217275 (34.3, Inf] 10.138827 6.07844766 0.03333333 3.021111
## 115 10.708091 (17.7,34.3] 8.654198 2.05389328 0.03333333 3.059777
## 116 5.866338 (17.7,34.3] 8.654198 -2.78785952 0.03333333 3.055602
## 117 16.952729 (17.7,34.3] 8.654198 8.29853122 0.03333333 2.982921
## 118 9.459561 (34.3, Inf] 10.138827 -0.67926594 0.03333333 3.064184
## 119 7.491357 [-Inf,4.54] 6.164829 1.32652804 0.03333333 3.062662
## 120 3.020675 (34.3, Inf] 10.138827 -7.11815225 0.03333333 3.004754
## 121 6.819372 (8.83,17.7] 5.786473 1.03289899 0.03333333 3.063474
## 122 13.766832 (17.7,34.3] 8.654198 5.11263416 0.03333333 3.033935
## 123 5.461137 (17.7,34.3] 8.654198 -3.19306092 0.03333333 3.052752
## 124 4.556611 (8.83,17.7] 5.786473 -1.22986246 0.03333333 3.062952
## 125 4.238305 (8.83,17.7] 5.786473 -1.54816758 0.03333333 3.061915
## 126 4.225406 (4.54,8.83] 5.930910 -1.70550467 0.03333333 3.061314
## 127 6.510262 (17.7,34.3] 8.654198 -2.14393537 0.03333333 3.059333
## 128 5.937156 [-Inf,4.54] 6.164829 -0.22767382 0.03333333 3.064664
## 129 3.000000 (8.83,17.7] 5.786473 -2.78647307 0.03333333 3.055611
## 130 6.572372 (8.83,17.7] 5.786473 0.78589879 0.03333333 3.064001
## 131 10.657669 (17.7,34.3] 8.654198 2.00347103 0.03333333 3.060017
## 132 8.275148 [-Inf,4.54] 6.164829 2.11031827 0.03333333 3.059501
## 133 8.156658 (4.54,8.83] 5.930910 2.22574784 0.03333333 3.058913
## 134 11.593498 (17.7,34.3] 8.654198 2.93929989 0.03333333 3.054582
## 135 3.000000 (34.3, Inf] 10.138827 -7.13882696 0.03333333 3.004401
## 136 4.896942 (8.83,17.7] 5.786473 -0.88953077 0.03333333 3.063797
## 137 5.390345 (8.83,17.7] 5.786473 -0.39612806 0.03333333 3.064541
## 138 7.314412 (4.54,8.83] 5.930910 1.38350118 0.03333333 3.062481
## 139 10.194076 (34.3, Inf] 10.138827 0.05524932 0.03333333 3.064721
## 140 4.221897 [-Inf,4.54] 6.164829 -1.94293212 0.03333333 3.060297
## 141 9.361984 (8.83,17.7] 5.786473 3.57551139 0.03333333 3.049705
## 142 11.933911 (17.7,34.3] 8.654198 3.27971366 0.03333333 3.052092
## 143 5.783487 [-Inf,4.54] 6.164829 -0.38134261 0.03333333 3.064555
## 144 7.433543 [-Inf,4.54] 6.164829 1.26871382 0.03333333 3.062838
## 145 4.069980 [-Inf,4.54] 6.164829 -2.09484939 0.03333333 3.059577
## 146 7.617861 (8.83,17.7] 5.786473 1.83138783 0.03333333 3.060791
## 147 4.997450 [-Inf,4.54] 6.164829 -1.16737962 0.03333333 3.063127
## 148 6.517121 [-Inf,4.54] 6.164829 0.35229141 0.03333333 3.064580
## 149 7.094161 (17.7,34.3] 8.654198 -1.56003646 0.03333333 3.061871
## 150 6.803693 (4.54,8.83] 5.930910 0.87278215 0.03333333 3.063832
## .cooksd .std.resid x1_init
## 1 1.743432e-03 0.502789885 38.27032512
## 2 8.649145e-04 -0.354136412 4.22157491
## 3 2.768693e-03 -0.633609123 11.93123732
## 4 1.027406e-02 1.220548473 14.42882286
## 5 1.202793e-03 -0.417618292 48.57752748
## 6 9.657483e-03 -1.183357543 32.84542903
## 7 2.200873e-03 -0.564912935 25.31294161
## 8 3.657728e-04 -0.230297754 45.70917276
## 9 5.797651e-03 0.916874782 11.28560741
## 10 1.774069e-07 0.005071884 5.27798025
## 11 2.848038e-03 0.642623960 10.79789964
## 12 3.025574e-04 -0.209453618 15.06764661
## 13 5.015969e-04 0.269687884 10.82758854
## 14 7.699304e-05 -0.105659788 10.67447179
## 15 1.317044e-05 0.043700274 6.73137581
## 16 1.708877e-03 -0.497782277 5.70379889
## 17 2.294768e-02 -1.824120057 28.70587119
## 18 4.131110e-03 0.773957956 3.81817209
## 19 9.511921e-06 0.037137967 1.46279112
## 20 3.202836e-03 -0.681477286 5.59154940
## 21 8.387034e-04 -0.348729118 4.55779845
## 22 3.003815e-02 -2.086991127 41.48310952
## 23 3.795461e-03 0.741850252 2.47837668
## 24 2.324981e-03 0.580622280 17.69177182
## 25 3.417940e-04 -0.222621062 49.41199213
## 26 3.527877e-03 0.715221782 38.91996370
## 27 2.066723e-04 -0.173111172 9.74867411
## 28 9.227308e-04 -0.365781312 3.45093024
## 29 1.734378e-03 -0.501482638 2.77950942
## 30 1.211384e-03 -0.419106940 1.76536377
## 31 5.239211e-02 2.756239461 41.28540822
## 32 2.994569e-03 -0.658947998 18.16809203
## 33 1.349439e-03 0.442344528 47.45876243
## 34 3.517778e-04 0.225849034 29.57726145
## 35 4.644719e-02 2.595157513 47.72204458
## 36 4.066164e-03 -0.767850143 47.65576469
## 37 2.386530e-03 -0.588257478 12.41868447
## 38 3.787971e-02 2.343620776 36.77025934
## 39 2.277676e-04 -0.181731397 11.45404379
## 40 8.537518e-03 -1.112627580 50.17895310
## 41 3.506821e-03 -0.713084211 9.72077224
## 42 2.226898e-02 -1.796942391 25.74065889
## 43 7.495737e-04 -0.329678904 5.73355165
## 44 5.312167e-04 0.277536347 5.30640089
## 45 3.374427e-03 0.699494009 8.31364087
## 46 1.619120e-04 0.153222824 3.19167168
## 47 3.505517e-04 -0.225455104 4.97455603
## 48 1.776083e-04 0.160478027 10.74054431
## 49 1.024074e-02 -1.218567883 43.52783336
## 50 9.910072e-04 -0.379072616 11.48200014
## 51 1.047331e-02 1.232326936 38.39353799
## 52 2.334671e-03 -0.581830974 10.70902853
## 53 2.252472e-03 0.571496644 1.66567276
## 54 2.133338e-04 0.175878934 16.11458577
## 55 3.770491e-05 0.073940597 5.38219610
## 56 1.273770e-02 -1.359031665 39.60028406
## 57 4.473241e-04 -0.254680167 15.74379947
## 58 2.326746e-02 -1.836785547 24.46808808
## 59 6.834072e-05 -0.099545994 7.55065358
## 60 3.219609e-05 0.068325937 4.42614541
## 61 2.576950e-03 0.611275491 43.45126207
## 62 1.534427e-02 -1.491616274 36.34939929
## 63 1.693395e-03 -0.495522248 1.82933697
## 64 5.734786e-04 -0.288365046 6.87160082
## 65 3.625991e-02 2.292964615 33.97138624
## 66 3.101025e-03 -0.670558427 8.28846334
## 67 1.849760e-03 -0.517894961 24.79282468
## 68 1.173972e-02 1.304706419 19.35398096
## 69 5.108965e-05 -0.086069735 5.47937682
## 70 3.897907e-02 -2.377386173 38.12407700
## 71 9.914566e-04 -0.379158561 6.61722017
## 72 4.215935e-04 0.247246969 6.33726586
## 73 4.887213e-03 -0.841811036 21.82894712
## 74 5.066474e-04 -0.271042190 9.03969437
## 75 2.488828e-04 -0.189968447 7.98138234
## 76 2.518382e-04 -0.191093012 0.00749728
## 77 9.135543e-02 3.639579193 51.19287669
## 78 6.161393e-05 -0.094519944 4.60763565
## 79 2.100448e-02 1.745179010 40.82385094
## 80 1.004163e-03 0.381580358 7.68608177
## 81 1.313148e-05 -0.043635587 8.50236562
## 82 3.897907e-02 -2.377386173 43.29177205
## 83 1.053953e-03 -0.390926039 1.53037980
## 84 9.534198e-04 -0.371814290 3.46991685
## 85 1.394898e-05 0.044973347 8.01666705
## 86 6.149308e-03 0.944272033 9.58709815
## 87 4.681498e-03 0.823903682 2.92343367
## 88 6.268602e-03 0.953387296 8.13084763
## 89 8.445002e-05 -0.110658272 4.96469192
## 90 3.897907e-02 -2.377386173 35.99955837
## 91 3.266320e-04 -0.217627305 7.86846959
## 92 9.804176e-04 -0.377041837 21.24647128
## 93 6.020919e-03 0.934362521 28.42759106
## 94 2.160220e-03 0.559671304 7.35325204
## 95 2.722273e-04 0.198678021 4.49355331
## 96 4.412034e-03 -0.799840533 9.05370190
## 97 3.148292e-03 0.675649609 24.81492970
## 98 6.925146e-03 -1.002070906 19.00210601
## 99 1.463128e-04 -0.145654919 24.04915600
## 100 1.543395e-03 0.473066938 49.90907760
## 101 7.578886e-06 -0.033150241 1.33362509
## 102 2.173760e-04 -0.177537362 1.71360027
## 103 5.938646e-03 -0.927956733 9.10151445
## 104 7.980172e-03 1.075697450 32.46021840
## 105 9.612342e-03 1.180588648 51.39827920
## 106 6.364447e-04 -0.303783604 0.54667799
## 107 1.863631e-05 0.051983310 6.52879230
## 108 2.210852e-04 -0.179045675 24.30880043
## 109 2.301867e-05 -0.057772892 3.29260373
## 110 2.051508e-03 0.545406888 2.95305281
## 111 4.687782e-03 -0.824456449 17.88378764
## 112 3.709811e-03 0.733432019 51.36876814
## 113 1.282921e-02 1.363904487 17.22782766
## 114 2.825941e-02 2.024256576 44.87221841
## 115 3.226514e-03 0.683991575 20.79070525
## 116 5.944557e-03 -0.928418454 26.77955463
## 117 5.267205e-02 2.763593164 31.54452135
## 118 3.529047e-04 -0.226210477 35.41933412
## 119 1.345893e-03 0.441762972 0.50471440
## 120 3.875362e-02 -2.370501040 37.91573917
## 121 8.160062e-04 0.343978049 17.67627631
## 122 1.999250e-02 1.702619468 31.31609589
## 123 7.798160e-03 -1.063359418 28.37899810
## 124 1.156887e-03 -0.409571211 15.64176000
## 125 1.833216e-03 -0.515573809 9.54560901
## 126 2.224762e-03 -0.567970514 4.90772306
## 127 3.515614e-03 -0.713977566 28.18041127
## 128 3.964642e-05 -0.075820383 1.13808793
## 129 5.938646e-03 -0.927956733 9.45966163
## 130 4.724012e-04 0.261721558 10.59620885
## 131 3.070039e-03 0.667199857 19.50764477
## 132 3.406228e-03 0.702782334 2.79640919
## 133 3.789044e-03 0.741222917 7.12970425
## 134 6.607932e-03 0.978851421 30.67116166
## 135 3.897907e-02 -2.377386173 50.16364954
## 136 6.052011e-04 -0.296233285 10.29393258
## 137 1.200187e-04 -0.131919346 9.77568047
## 138 1.463986e-03 0.460736280 8.52218882
## 139 2.334703e-06 0.018399237 47.61400888
## 140 2.887307e-03 -0.647039071 0.29536421
## 141 9.778091e-03 1.190723826 9.53210785
## 142 8.227154e-03 1.092216685 29.39570966
## 143 1.112265e-04 -0.126995466 0.54997400
## 144 1.231133e-03 0.422509567 3.93504863
## 145 3.356475e-03 -0.697630858 1.23509619
## 146 2.565301e-03 0.609892371 11.55629079
## 147 1.042322e-03 -0.388763053 2.64142662
## 148 9.492527e-05 0.117320778 2.59689523
## 149 1.861432e-03 -0.519526405 21.56399823
## 150 5.826255e-04 0.290655627 5.44000987
# Visualisasi
ggplot(mod_tangga5_fitted,aes(x=x1_init, y=y)) +
geom_point(color="#03A9F4") +
geom_line(aes(x=x1_init,y=.fitted),color="#E54D03",
linewidth=1.5) +
annotate("text",x = 5,y=33,
label=str_c("R-squared = ",100*round(glance(mod_tangga5)$adj.r.squared,3)))+
theme_bw()
mod_tangga_n <- function(data,num_breaks=2){
dat_tangga <- recipe(y~.,data = dat) %>%
step_discretize(x1,num_breaks = num_breaks,
options = list(prefix = NULL)) %>%
prep()
mod_tangga = lm(y ~ .,data= bake(dat_tangga,new_data=NULL) )
output <- list("model"=mod_tangga,"recipe_result"=dat_tangga)
return(output)
}
# Isi dari fungsi mod_tangga_n tidak perlu dipahami terlalu serius karena bisa digunakan untuk sembarang data hanya dengan mengganti syntax triceps~.,data = triceps dan step_discretize(age,...) dengan data dan variabel respon yang diinginkan.
mod_tangga_n(data = dat,num_breaks=3)
## $model
##
## Call:
## lm(formula = y ~ ., data = bake(dat_tangga, new_data = NULL))
##
## Coefficients:
## (Intercept) x1(7.48,22.6] x1(22.6, Inf]
## 5.9921 0.4168 3.6122
##
##
## $recipe_result
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 1
##
## ── Training information
## Training data contained 150 data points and no incomplete rows.
##
## ── Operations
## • Discretize numeric variables from: x1 | Trained
# Kita bagi data menjadi 10 bagian berikut sintaksnya
set.seed(2045)
cross_val <- vfold_cv(dat,v=10)
cross_val
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [135/15]> Fold01
## 2 <split [135/15]> Fold02
## 3 <split [135/15]> Fold03
## 4 <split [135/15]> Fold04
## 5 <split [135/15]> Fold05
## 6 <split [135/15]> Fold06
## 7 <split [135/15]> Fold07
## 8 <split [135/15]> Fold08
## 9 <split [135/15]> Fold09
## 10 <split [135/15]> Fold10
mod_tangga_n_train_test <- function(split,num_breaks=2,y){
train_dt <- training(split)
test_dt <- testing(split)
model <- mod_tangga_n(data = train_dt,num_breaks = num_breaks)
test_dt <- bake(model$recipe_result,new_data=test_dt)
pred_test_dt <- predict(model$model,newdata=test_dt)
response <- test_dt %>% pull(y)
result <- data.frame(num_breaks=num_breaks,estimate=pred_test_dt,truth=response)
return(result)
}
mod_tangga_n_train_test(cross_val$splits[[1]],num_breaks = 3,y="y")
## num_breaks estimate truth
## 1 3 6.408814 9.451542
## 2 3 6.408814 5.266654
## 3 3 6.408814 4.020051
## 4 3 6.408814 8.031354
## 5 3 6.408814 6.268357
## 6 3 6.408814 5.021718
## 7 3 6.408814 5.631993
## 8 3 5.992058 5.591014
## 9 3 9.604271 11.459906
## 10 3 9.604271 12.341180
## 11 3 6.408814 10.708091
## 12 3 9.604271 5.461137
## 13 3 6.408814 4.238305
## 14 3 5.992058 5.937156
## 15 3 6.408814 5.390345
eval_metric_tangga <- function(cross_val,metric,list_num_breaks,y){
grid1 <- expand_grid(splits=cross_val$splits,list_num_breaks)
cv_pred <- map2(.x=grid1$splits,.y=grid1$list_num_breaks,mod_tangga_n_train_test,y = y)
metric_custom <- function(x,truth,estimate){
cbind("num_breaks"=x$num_breaks[1],metric(x,truth,estimate))
}
result <- map(cv_pred,metric_custom,truth = truth,estimate = estimate) %>%
list_rbind() %>%
select(-.estimator) %>%
group_by(num_breaks,.metric) %>%
summarise(mean_estimate=mean(.estimate),
num_folds = n(),
std_error = sd(.estimate) / sqrt(num_folds) )
return(result)
}
Isi dari fungsi eval_metric_tangga tidak perlu dipahami terlalu serius karena bisa digunakan untuk sembarang data asalkan fungsi mod_tangga_n sebelumnya sudah dimodifikasi sesuai dengan data.
list_num_breaks<- 2:10
list_num_breaks
## [1] 2 3 4 5 6 7 8 9 10
multi_metric <- mse
tic()
mod_tangga_cv_metric <- eval_metric_tangga(cross_val=cross_val,
metric=multi_metric,
list_num_breaks = list_num_breaks,
y="y")
## `summarise()` has grouped output by 'num_breaks'. You can override using the
## `.groups` argument.
toc()
## 3.96 sec elapsed
mod_tangga_cv_metric
## # A tibble: 9 × 5
## # Groups: num_breaks [9]
## num_breaks .metric mean_estimate num_folds std_error
## <int> <chr> <dbl> <int> <dbl>
## 1 2 mse 9.97 10 1.38
## 2 3 mse 9.48 10 1.16
## 3 4 mse 8.39 10 1.03
## 4 5 mse 9.02 10 1.30
## 5 6 mse 8.94 10 1.24
## 6 7 mse 8.61 10 1.15
## 7 8 mse 8.30 10 1.06
## 8 9 mse 8.71 10 1.19
## 9 10 mse 8.64 10 1.08
Berdsarakan tabel diatas dapat disimpulkan bahwa num_break=4 merupakan num_break terbaik berdasarkan metric MSE. Sehingga model terbaiknya adalah Regresi Fungsi Tangga dengan num_break 4.
# Visualisasi
ggplot(mod_tangga_cv_metric,aes(num_breaks,mean_estimate)) +
geom_point()+
geom_errorbar(aes(ymin = mean_estimate-std_error, ymax = mean_estimate+std_error))+
ylab("MSE") +
scale_x_continuous(n.breaks = nrow(mod_tangga_cv_metric)) +
theme_bw()
ganti num breaks jadi 6, 7, dan 8
# Num Breaks 6
# Membuat data hasil discretize (6 breaks)
dat_tangga6 <- recipe(y ~ ., data = dat) %>%
step_discretize(x1, num_breaks = 6, options = list(prefix = NULL)) %>%
prep() %>%
bake(new_data = NULL)
# Regresi linear untuk data tangga 6
mod_tangga6 <- lm(y ~ ., data = dat_tangga6)
# Ringkasan model
tidy(mod_tangga6)
## # A tibble: 6 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.03 0.610 9.88 6.65e-18
## 2 x1(3.76,7.48] -0.0791 0.863 -0.0916 9.27e- 1
## 3 x1(7.48,10.8] -0.376 0.863 -0.435 6.64e- 1
## 4 x1(10.8,22.6] 1.13 0.863 1.31 1.92e- 1
## 5 x1(22.6,38] 2.54 0.863 2.95 3.75e- 3
## 6 x1(38, Inf] 4.60 0.863 5.33 3.67e- 7
# Prediksi dan visualisasi
mod_tangga6_fitted <- cbind(augment(mod_tangga6), "x1_init" = dat$x1)
ggplot(mod_tangga6_fitted, aes(x = x1_init, y = y)) +
geom_point(color = "#03A9F4") +
geom_line(aes(x = x1_init, y = .fitted), color = "#E54D03", linewidth = 1.5) +
annotate(
"text", x = 5, y = 33,
label = str_c("R-squared = ", 100 * round(glance(mod_tangga6)$adj.r.squared, 3))
) +
theme_bw()
# Num Breaks 7
dat_tangga7 <- recipe(y ~ ., data = dat) %>%
step_discretize(x1, num_breaks = 7, options = list(prefix = NULL)) %>%
prep() %>%
bake(new_data = NULL)
mod_tangga7 <- lm(y ~ ., data = dat_tangga7)
tidy(mod_tangga7)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 6.12 0.641 9.55 4.93e-17
## 2 x1(3.22,6.08] -0.385 0.917 -0.420 6.75e- 1
## 3 x1(6.08,9.52] -0.309 0.917 -0.337 7.37e- 1
## 4 x1(9.52,15.7] -0.158 0.906 -0.175 8.62e- 1
## 5 x1(15.7,26.2] 1.45 0.917 1.59 1.15e- 1
## 6 x1(26.2,38.8] 3.18 0.917 3.47 6.87e- 4
## 7 x1(38.8, Inf] 4.67 0.906 5.15 8.46e- 7
mod_tangga7_fitted <- cbind(augment(mod_tangga7), "x1_init" = dat$x1)
ggplot(mod_tangga7_fitted, aes(x = x1_init, y = y)) +
geom_point(color = "#03A9F4") +
geom_line(aes(x = x1_init, y = .fitted), color = "#E54D03", linewidth = 1.5) +
annotate(
"text", x = 5, y = 33,
label = str_c("R-squared = ", 100 * round(glance(mod_tangga7)$adj.r.squared, 3))
) +
theme_bw()
# Num Breaks 8
dat_tangga8 <- recipe(y ~ ., data = dat) %>%
step_discretize(x1, num_breaks = 8, options = list(prefix = NULL)) %>%
prep() %>%
bake(new_data = NULL)
mod_tangga8 <- lm(y ~ ., data = dat_tangga8)
tidy(mod_tangga8)
## # A tibble: 8 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.87 0.679 8.65 1.02e-14
## 2 x1(2.88,5.4] 0.325 0.961 0.339 7.35e- 1
## 3 x1(5.4,8.27] 0.147 0.974 0.151 8.80e- 1
## 4 x1(8.27,10.8] -0.443 0.961 -0.462 6.45e- 1
## 5 x1(10.8,19] 0.671 0.961 0.698 4.86e- 1
## 6 x1(19,29.2] 1.60 0.974 1.65 1.02e- 1
## 7 x1(29.2,41] 4.47 0.961 4.66 7.32e- 6
## 8 x1(41, Inf] 4.84 0.961 5.04 1.39e- 6
mod_tangga8_fitted <- cbind(augment(mod_tangga8), "x1_init" = dat$x1)
ggplot(mod_tangga8_fitted, aes(x = x1_init, y = y)) +
geom_point(color = "#03A9F4") +
geom_line(aes(x = x1_init, y = .fitted), color = "#E54D03", linewidth = 1.5) +
annotate(
"text", x = 5, y = 33,
label = str_c("R-squared = ", 100 * round(glance(mod_tangga8)$adj.r.squared, 3))
) +
theme_bw()
# Buat tabel perbandingan performa
perbandingan_tangga <- tibble(
num_breaks = c(6, 7, 8),
mse = c(
mean((mod_tangga6_fitted$y - mod_tangga6_fitted$.fitted)^2),
mean((mod_tangga7_fitted$y - mod_tangga7_fitted$.fitted)^2),
mean((mod_tangga8_fitted$y - mod_tangga8_fitted$.fitted)^2)
),
adj_r2 = c(
glance(mod_tangga6)$adj.r.squared,
glance(mod_tangga7)$adj.r.squared,
glance(mod_tangga8)$adj.r.squared
)
)
perbandingan_tangga
## # A tibble: 3 × 3
## num_breaks mse adj_r2
## <dbl> <dbl> <dbl>
## 1 6 8.94 0.235
## 2 7 8.61 0.258
## 3 8 8.30 0.279
# Visualisasi
# Gabungkan fitted values dari ketiga model
fitted_compare <- bind_rows(
cbind(augment(mod_tangga6), x1_init = dat$x1, num_breaks = 6),
cbind(augment(mod_tangga7), x1_init = dat$x1, num_breaks = 7),
cbind(augment(mod_tangga8), x1_init = dat$x1, num_breaks = 8)
)
# Plot gabungan
ggplot(fitted_compare, aes(x = x1_init, y = y)) +
geom_point(alpha = 0.3, color = "gray40") +
geom_line(aes(y = .fitted, color = factor(num_breaks)), linewidth = 1.2) +
scale_color_manual(values = c("#E54D03", "#009688", "#3F51B5"),
name = "Num Breaks") +
labs(title = "Perbandingan Model Tangga",
x = "x1", y = "y (aktual & fitted)") +
theme_bw()