library(readxl)
# Load the cleaned housing data set from the Excel workbook and print its
# structure (dimensions, column names, column types, first values).
# NOTE(review): the path is absolute and machine-specific; the script only runs
# on a machine where this exact file exists.
rumah.clean2 <- read_excel("C:\\Users\\MUTHI'AH IFFA\\Downloads\\Semester 5\\PSD\\praktikum\\rumah_clean.xlsx")
str(rumah.clean2)
## tibble [1,025 × 16] (S3: tbl_df/tbl/data.frame)
## $ harga_rp : num [1:1025] 2.00e+09 7.99e+08 2.30e+09 6.25e+09 6.00e+08 1.10e+09 5.50e+08 2.60e+09 4.50e+09 3.90e+09 ...
## $ jml_km_buah : num [1:1025] 4 2 11 5 2 3 3 3 4 4 ...
## $ jml_kt_buah : num [1:1025] 3 2 4 5 1 2 3 3 4 4 ...
## $ jml_gar_buah: num [1:1025] 2 1 1 2 1 1 0 2 2 0 ...
## $ lt_m2 : num [1:1025] 90 102 90 294 66 60 60 98 288 80 ...
## $ lb_m2 : num [1:1025] 130 90 150 350 50 80 112 90 400 200 ...
## $ lokasi_2 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_3 : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
## $ lokasi_4 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_5 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_6 : num [1:1025] 0 0 0 0 0 0 1 0 0 0 ...
## $ lokasi_7 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_8 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_9 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_10 : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ lokasi_11 : num [1:1025] 0 0 0 0 0 0 0 0 1 0 ...
library(leaps)
library(MASS)
set.seed(123) # fixed seed so the random split is reproducible
# Name of the response (target) column
y_var <- "harga_rp"
# 1. Split the data 70:30 into a training set and a test set
n <- nrow(rumah.clean2)
# floor() makes the integer training size explicit instead of relying on the
# sampler silently truncating a fractional `size`.
train_index <- sample.int(n, size = floor(0.7 * n))
train <- rumah.clean2[train_index, ]
# Pick the held-out rows by positive index: `-train_index` would select zero
# rows (not all rows) if `train_index` were ever empty.
test <- rumah.clean2[setdiff(seq_len(n), train_index), ]
# Observed target values of the test set, used by every RMSE computation below
y_test <- test[[y_var]]
# -----------------------------
# 2. Best Subset Selection
# Exhaustive search: for every subset size up to `p`, regsubsets() keeps the
# best subset of predictors found on the training data.
p <- ncol(train) - 1 # number of candidate predictors (all columns but the target)
best_subset <- regsubsets(
  as.formula(paste(y_var, "~ .")),
  data = train,
  nvmax = p,
  method = "exhaustive"
)
# Iterate over the subset sizes regsubsets() actually returned rather than
# assuming all `p` exist; coef(best_subset, k) fails for a size that was not
# produced (e.g. when predictors are dropped for linear dependence).
n_models <- nrow(summary(best_subset)$which)
# Preallocate instead of growing the vector inside the loop
rmse_best <- numeric(n_models)
for (k in seq_len(n_models)) {
  coef_k <- coef(best_subset, k)
  vars <- names(coef_k)[-1] # drop the intercept
  # Refit the size-k subset with lm() on the training data ...
  model_k <- lm(reformulate(vars, response = y_var), data = train)
  # ... and score it by RMSE on the test set
  pred_k <- predict(model_k, newdata = test)
  rmse_best[k] <- sqrt(mean((y_test - pred_k)^2, na.rm = TRUE))
}
# NOTE(review): the subset size is chosen by minimising RMSE on the test set,
# so the RMSE reported for that size is an optimistic estimate.
best_k <- which.min(rmse_best)
cat("π Best Subset - Model terbaik:", best_k, "variabel dengan RMSE =", rmse_best[best_k], "\n")
## π Best Subset - Model terbaik: 13 variabel dengan RMSE = 2185405915
print(coef(best_subset, best_k))
## (Intercept) jml_km_buah jml_gar_buah lt_m2 lb_m2 lokasi_2
## 318085499 -337021086 567292764 9262047 17351213 -676623532
## lokasi_3 lokasi_4 lokasi_5 lokasi_6 lokasi_7 lokasi_8
## -1098488248 -1153779053 -1832341958 -1775752705 -1868067658 -2111178940
## lokasi_9 lokasi_11
## -8557097818 -7268927182
# -----------------------------
# 3. Forward Selection
# Start from the intercept-only model and let stepAIC() add predictors, with
# the model containing every non-target column as the upper bound.
candidate_terms <- names(train)[names(train) != y_var]
forward_scope <- list(
  lower = ~1,
  upper = as.formula(paste("~", paste(candidate_terms, collapse = "+")))
)
forward_model <- stepAIC(
  lm(as.formula(paste(y_var, "~ 1")), data = train),
  scope = forward_scope,
  direction = "forward",
  trace = FALSE
)
# Test-set predictions and their RMSE (missing squared errors are skipped)
pred_fwd <- predict(forward_model, newdata = test)
sq_err_fwd <- (y_test - pred_fwd)^2
rmse_fwd <- sqrt(mean(sq_err_fwd, na.rm = TRUE))
cat("\nπ Forward Stepwise RMSE =", rmse_fwd, "\n")
##
## π Forward Stepwise RMSE = 2202975201
# -----------------------------
# 4. Backward Selection
# Start from the full model (target regressed on every other column) and let
# stepAIC() remove predictors.
full_fit_bwd <- lm(as.formula(paste(y_var, "~ .")), data = train)
backward_model <- stepAIC(
  full_fit_bwd,
  direction = "backward",
  trace = FALSE
)
# Test-set predictions and their RMSE (missing residuals are skipped)
pred_bwd <- predict(backward_model, newdata = test)
resid_bwd <- y_test - pred_bwd
rmse_bwd <- sqrt(mean(resid_bwd^2, na.rm = TRUE))
cat("\nπ Backward Stepwise RMSE =", rmse_bwd, "\n")
##
## π Backward Stepwise RMSE = 2202975201
# -----------------------------
# 5. Hybrid Stepwise (both directions)
# Start from the full model; stepAIC() may both drop and re-add predictors.
full_fit_hyb <- lm(as.formula(paste(y_var, "~ .")), data = train)
hybrid_model <- stepAIC(full_fit_hyb, direction = "both", trace = FALSE)
# Test-set predictions, then mean squared error -> RMSE
pred_hyb <- predict(hybrid_model, newdata = test)
mse_hyb <- mean((y_test - pred_hyb)^2, na.rm = TRUE)
rmse_hyb <- sqrt(mse_hyb)
cat("\nπ Hybrid Stepwise RMSE =", rmse_hyb, "\n")
##
## π Hybrid Stepwise RMSE = 2202975201
# -----------------------------
# 6. Compare all methods
cat("\n===== RINGKASAN RMSE =====\n")
##
## ===== RINGKASAN RMSE =====
# Print one "<label> <rmse>" line per method, in the order listed here
rmse_labels <- c("Best Subset :", "Forward :", "Backward :", "Hybrid :")
rmse_values <- c(rmse_best[best_k], rmse_fwd, rmse_bwd, rmse_hyb)
for (i in seq_along(rmse_labels)) {
  cat(rmse_labels[i], rmse_values[i], "\n")
}
## Best Subset : 2185405915
## Forward : 2202975201
## Backward : 2202975201
## Hybrid : 2202975201
# --- Collect the variables selected by each method ---
# Drops the first coefficient name, which is the intercept in the printed
# best-subset coefficients, leaving only the predictor names.
predictor_names <- function(coefs) {
  names(coefs)[-1]
}
# Best Subset
coef_best <- coef(best_subset, best_k)
vars_best <- predictor_names(coef_best)
# Forward
vars_fwd <- predictor_names(coef(forward_model))
# Backward
vars_bwd <- predictor_names(coef(backward_model))
# Hybrid
vars_hyb <- predictor_names(coef(hybrid_model))
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
# --- Plain table (data.frame) ---
# One row per selection method: number of selected variables, test-set RMSE
# and the selected variable names joined with ", ".
vars_per_method <- list(vars_best, vars_fwd, vars_bwd, vars_hyb)
hasil_perbandingan <- data.frame(
  Metode = c("Best Subset", "Forward", "Backward", "Hybrid"),
  Jumlah_Variabel = lengths(vars_per_method),
  RMSE = c(rmse_best[best_k], rmse_fwd, rmse_bwd, rmse_hyb),
  Variabel = vapply(vars_per_method, paste, character(1), collapse = ", "),
  stringsAsFactors = FALSE
)
# Print the plain table
print(hasil_perbandingan)
## Metode Jumlah_Variabel RMSE
## 1 Best Subset 13 2185405915
## 2 Forward 11 2202975201
## 3 Backward 11 2202975201
## 4 Hybrid 11 2202975201
## Variabel
## 1 jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_4, lokasi_5, lokasi_6, lokasi_7, lokasi_8, lokasi_9, lokasi_11
## 2 lb_m2, lt_m2, lokasi_11, jml_gar_buah, jml_km_buah, lokasi_6, lokasi_9, lokasi_3, lokasi_7, lokasi_2, lokasi_5
## 3 jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11
## 4 jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11
library(gt)
# Comparison data used as input for the gt table.
# NOTE(review): this rebuilds the same columns as the plain comparison table
# printed earlier in the script.
selected_by_method <- list(
  "Best Subset" = vars_best,
  "Forward" = vars_fwd,
  "Backward" = vars_bwd,
  "Hybrid" = vars_hyb
)
hasil_perbandingan <- data.frame(
  Metode = names(selected_by_method),
  # use.names / USE.NAMES = FALSE keep the columns unnamed so data.frame()
  # does not turn the method labels into row names
  Jumlah_Variabel = lengths(selected_by_method, use.names = FALSE),
  RMSE = c(rmse_best[best_k], rmse_fwd, rmse_bwd, rmse_hyb),
  Variabel = vapply(
    selected_by_method, toString, character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
# gt table
# Render the comparison table with gt: title/subtitle header, RMSE shown as a
# whole number with thousands separators, fixed column widths and thicker
# top/bottom borders.
hasil_perbandingan %>%
  gt() %>%
  tab_header(
    title = "Perbandingan Metode Seleksi Variabel",
    subtitle = "Berdasarkan RMSE pada Data rumah.clean2"
  ) %>%
  fmt_number(
    # Bare column name: `columns = vars(...)` is deprecated since gt v0.3.0
    # and triggers a warning on every render.
    columns = RMSE,
    decimals = 0,
    use_seps = TRUE
  ) %>%
  cols_width(
    Metode ~ px(120),
    Jumlah_Variabel ~ px(120),
    RMSE ~ px(180),
    Variabel ~ px(600)
  ) %>%
  tab_options(
    table.border.top.width = px(2),
    table.border.bottom.width = px(2),
    table.font.size = "small"
  )
## Warning: Since gt v0.3.0, `columns = vars(...)` has been deprecated.
## • Please use `columns = c(...)` instead.
Perbandingan Metode Seleksi Variabel |
Berdasarkan RMSE pada Data rumah.clean2 |
Metode |
Jumlah_Variabel |
RMSE |
Variabel |
Best Subset |
13 |
2,185,405,915 |
jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_4, lokasi_5, lokasi_6, lokasi_7, lokasi_8, lokasi_9, lokasi_11 |
Forward |
11 |
2,202,975,201 |
lb_m2, lt_m2, lokasi_11, jml_gar_buah, jml_km_buah, lokasi_6, lokasi_9, lokasi_3, lokasi_7, lokasi_2, lokasi_5 |
Backward |
11 |
2,202,975,201 |
jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11 |
Hybrid |
11 |
2,202,975,201 |
jml_km_buah, jml_gar_buah, lt_m2, lb_m2, lokasi_2, lokasi_3, lokasi_5, lokasi_6, lokasi_7, lokasi_9, lokasi_11 |