library(readxl)
# Load the rumah_clean.xlsx workbook into `rumah.clean2`.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# script only runs where this exact file exists — consider a relative path.
data_path <- "C:\\Users\\MUTHI'AH IFFA\\Downloads\\Semester 5\\PSD\\praktikum\\rumah_clean.xlsx"
rumah.clean2 <- read_excel(data_path)
# Print the structure (column names and types) of the loaded table.
str(rumah.clean2)
## tibble [1,025 × 16] (S3: tbl_df/tbl/data.frame)
##  $ harga_rp    : num [1:1025] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
##  $ jml_km_buah : num [1:1025] 4 2 11 5 2 3 3 3 4 4 ...
##  $ jml_kt_buah : num [1:1025] 3 2 4 5 1 2 3 3 4 4 ...
##  $ jml_gar_buah: num [1:1025] 2 1 1 2 1 1 0 2 2 0 ...
##  $ lt_m2       : num [1:1025] 90 102 90 294 66 60 60 98 288 80 ...
##  $ lb_m2       : num [1:1025] 130 90 150 350 50 80 112 90 400 200 ...
##  $ lokasi_2    : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lokasi_3    : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
##  $ lokasi_4    : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lokasi_5    : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lokasi_6    : num [1:1025] 0 0 0 0 0 0 1 0 0 0 ...
##  $ lokasi_7    : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lokasi_8    : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lokasi_9    : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lokasi_10   : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
##  $ lokasi_11   : num [1:1025] 0 0 0 0 0 0 0 0 1 0 ...
library(leaps)
library(MASS)

set.seed(123)  # fix the RNG seed so the random split is reproducible

# Name of the response (target) column.
y_var <- "harga_rp"

# 1. Split the rows 70:30 into a training set and a test set.
n <- nrow(rumah.clean2)
# Fail early with a clear message instead of producing an empty split later.
stopifnot(y_var %in% names(rumah.clean2), n >= 2)

# 0.7 * n is generally fractional; floor() makes the truncation to a whole
# number of training rows explicit. sample.int() draws row numbers from 1..n
# directly, so no `1:n` vector has to be built.
train_index <- sample.int(n, size = floor(0.7 * n))
train <- rumah.clean2[train_index, ]

# Pick the held-out rows with a logical mask: negative indexing with an empty
# index vector would select zero rows rather than all of them.
is_train <- seq_len(n) %in% train_index
test  <- rumah.clean2[!is_train, ]
y_test <- test[[y_var]]

# -----------------------------
# 2. Best Subset Selection
# Exhaustive search for the best predictor subset of each size, fitted on the
# training rows only.
best_subset <- regsubsets(
  as.formula(paste(y_var, "~ .")),
  data = train,
  nvmax = ncol(train) - 1,   # allow subsets up to all predictor columns
  method = "exhaustive"
)

p <- ncol(train) - 1
# regsubsets() may return fewer model sizes than requested (e.g. when it
# detects linear dependencies), so iterate only over the sizes that exist.
n_models <- min(p, nrow(summary(best_subset)$which))

# Preallocate instead of growing the vector inside the loop.
rmse_best <- numeric(n_models)

for (k in seq_len(n_models)) {
  coef_k <- coef(best_subset, k)
  vars <- names(coef_k)[-1]  # drop the intercept

  # Refit the size-k subset with lm() on the training data, then score it on
  # the test rows.
  model_k <- lm(reformulate(vars, response = y_var), data = train)
  pred_k  <- predict(model_k, newdata = test)

  rmse_best[k] <- sqrt(mean((y_test - pred_k)^2, na.rm = TRUE))
}

# NOTE(review): the subset size is chosen by minimising RMSE on the test set
# itself, so the RMSE reported for the chosen size is likely optimistic
# compared with the stepwise models, which never see the test rows.
best_k <- which.min(rmse_best)
cat("πŸ“Œ Best Subset - Model terbaik:", best_k, "variabel dengan RMSE =", rmse_best[best_k], "\n")
## πŸ“Œ Best Subset - Model terbaik: 13 variabel dengan RMSE = 2185405915
print(coef(best_subset, best_k))
##  (Intercept)  jml_km_buah jml_gar_buah        lt_m2        lb_m2     lokasi_2 
##    318085499   -337021086    567292764      9262047     17351213   -676623532 
##     lokasi_3     lokasi_4     lokasi_5     lokasi_6     lokasi_7     lokasi_8 
##  -1098488248  -1153779053  -1832341958  -1775752705  -1868067658  -2111178940 
##     lokasi_9    lokasi_11 
##  -8557097818  -7268927182
# -----------------------------
# 3. Forward Selection
# Start from the intercept-only model and let stepAIC() add predictors, with
# the upper scope containing every column of `train` except the response.
predictors <- names(train)[names(train) != y_var]
# Backquote each label so that column names which are not syntactic R names
# still parse into a valid formula.
upper_scope <- reformulate(sprintf("`%s`", predictors))

forward_model <- stepAIC(
  lm(as.formula(paste(y_var, "~ 1")), data = train),
  scope = list(lower = ~1, upper = upper_scope),
  direction = "forward",
  trace = FALSE
)

# Score the selected model on the held-out rows.
pred_fwd <- predict(forward_model, newdata = test)
rmse_fwd <- sqrt(mean((y_test - pred_fwd)^2, na.rm = TRUE))
cat("\nπŸ“Œ Forward Stepwise RMSE =", rmse_fwd, "\n")
## 
## πŸ“Œ Forward Stepwise RMSE = 2202975201
# -----------------------------
# 4. Backward Selection
# Fit the full model (response against all other columns of `train`), then
# let stepAIC() search in the backward direction from it.
full_formula_bwd <- as.formula(sprintf("%s ~ .", y_var))
full_fit_bwd <- lm(full_formula_bwd, data = train)
backward_model <- stepAIC(full_fit_bwd, direction = "backward", trace = FALSE)

# Test-set RMSE of the selected model; NA residuals are ignored.
pred_bwd <- predict(backward_model, newdata = test)
resid_bwd <- y_test - pred_bwd
rmse_bwd <- sqrt(mean(resid_bwd^2, na.rm = TRUE))
cat("\nπŸ“Œ Backward Stepwise RMSE =", rmse_bwd, "\n")
## 
## πŸ“Œ Backward Stepwise RMSE = 2202975201
# -----------------------------
# 5. Hybrid Stepwise (both)
# Same starting point as a backward search (the full model), but stepAIC() is
# run with direction = "both".
hybrid_start <- lm(formula = as.formula(paste0(y_var, " ~ .")), data = train)
hybrid_model <- stepAIC(hybrid_start, direction = "both", trace = FALSE)

# Test-set RMSE of the selected model; NA squared errors are ignored.
pred_hyb <- predict(hybrid_model, newdata = test)
sq_err_hyb <- (y_test - pred_hyb)^2
rmse_hyb <- sqrt(mean(sq_err_hyb, na.rm = TRUE))
cat("\nπŸ“Œ Hybrid Stepwise RMSE =", rmse_hyb, "\n")
## 
## πŸ“Œ Hybrid Stepwise RMSE = 2202975201
# -----------------------------
# 6. Compare all methods: print one "label : RMSE" line per selection method.
cat("\n===== RINGKASAN RMSE =====\n")
## 
## ===== RINGKASAN RMSE =====
rmse_labels <- c("Best Subset :", "Forward     :", "Backward    :", "Hybrid      :")
# A list keeps each method's value as its own element, paired with its label.
rmse_values <- list(rmse_best[best_k], rmse_fwd, rmse_bwd, rmse_hyb)
for (i in seq_along(rmse_labels)) {
  cat(rmse_labels[i], rmse_values[[i]], "\n")
}
## Best Subset : 2185405915
## Forward     : 2202975201
## Backward    : 2202975201
## Hybrid      : 2202975201
# --- Collect the predictors retained by each selection method ---
# Every coefficient vector here starts with the intercept, so dropping the
# first name leaves only the predictor names.
predictor_names <- function(coefs) names(coefs)[-1]

# Best subset: coefficients of the subset size chosen in `best_k`.
coef_best <- coef(best_subset, best_k)
vars_best <- predictor_names(coef_best)

# Stepwise fits: forward, backward and hybrid.
vars_fwd <- predictor_names(coef(forward_model))
vars_bwd <- predictor_names(coef(backward_model))
vars_hyb <- predictor_names(coef(hybrid_model))


library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(broom)

# --- Plain table (data.frame) ---
# One row per selection method: number of selected predictors, test RMSE, and
# the predictor names joined into a single comma-separated string.
# The list is deliberately unnamed so the derived columns carry no names that
# data.frame() could pick up as row names.
selected_vars <- list(vars_best, vars_fwd, vars_bwd, vars_hyb)

hasil_perbandingan <- data.frame(
  Metode = c("Best Subset", "Forward", "Backward", "Hybrid"),
  Jumlah_Variabel = lengths(selected_vars),
  RMSE = c(rmse_best[best_k], rmse_fwd, rmse_bwd, rmse_hyb),
  Variabel = vapply(selected_vars, paste, character(1), collapse = ", "),
  stringsAsFactors = FALSE
)

# Print the plain table
print(hasil_perbandingan)
##        Metode Jumlah_Variabel       RMSE
## 1 Best Subset              13 2185405915
## 2     Forward              11 2202975201
## 3    Backward              11 2202975201
## 4      Hybrid              11 2202975201
##                                                                                                                             Variabel
## 1 jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_4, lokasi_5, lokasi_6, lokasi_7, lokasi_8, lokasi_9, lokasi_11
## 2                     lb_m2, lt_m2, lokasi_11, jml_gar_buah, jml_km_buah, lokasi_6, lokasi_9, lokasi_3, lokasi_7, lokasi_2, lokasi_5
## 3                     jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11
## 4                     jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11
library(gt)

# The comparison table `hasil_perbandingan` is already built (and printed)
# earlier in this script from the same inputs, so it is reused here instead of
# being rebuilt with identical code.

# gt table: formatted rendering of the method comparison.
hasil_perbandingan %>%
  gt() %>%
  tab_header(
    title = "Perbandingan Metode Seleksi Variabel",
    subtitle = "Berdasarkan RMSE pada Data rumah.clean2"
  ) %>%
  fmt_number(
    # `columns = vars(...)` is deprecated since gt v0.3.0; select with c().
    columns = c(RMSE),
    decimals = 0,
    use_seps = TRUE
  ) %>%
  cols_width(
    Metode ~ px(120),
    Jumlah_Variabel ~ px(120),
    RMSE ~ px(180),
    Variabel ~ px(600)
  ) %>%
  tab_options(
    table.border.top.width = px(2),
    table.border.bottom.width = px(2),
    table.font.size = "small"
  )
## Perbandingan Metode Seleksi Variabel
## Berdasarkan RMSE pada Data rumah.clean2
## Metode Jumlah_Variabel RMSE Variabel
## Best Subset 13 2,185,405,915 jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_4, lokasi_5, lokasi_6, lokasi_7, lokasi_8, lokasi_9, lokasi_11
## Forward 11 2,202,975,201 lb_m2, lt_m2, lokasi_11, jml_gar_buah, jml_km_buah, lokasi_6, lokasi_9, lokasi_3, lokasi_7, lokasi_2, lokasi_5
## Backward 11 2,202,975,201 jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11
## Hybrid 11 2,202,975,201 jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11