Ringkasan analisis: Membandingkan 12 skenario estimasi SAE pada data Kabupaten/Kota Sulawesi menggunakan tiga pendekatan model (EBLUP, EBLUP Logit, HB Beta) dan empat strategi segmentasi domain (Pulau, Provinsi, RSE Natural Break, Clustering).
# ── PENTING: MASS & car di-load DULU sebelum dplyr ───────────────────
# supaya dplyr::select dan dplyr::recode tidak tertimpa oleh car/MASS
library(MASS) # stepAIC (masking dplyr::select dicegah dengan load order)
library(car) # vif (car::recode tidak akan tertimpa dplyr)
# ── Manipulasi data ──────────────────────────────────────────────────
library(readxl)
library(dplyr) # load SETELAH MASS & car → dplyr::select & ::recode menang
library(tidyr)
# ── Visualisasi ──────────────────────────────────────────────────────
library(ggplot2)
library(scales)
library(patchwork)
# ── SAE ──────────────────────────────────────────────────────────────
library(sae) # mseFH (EBLUP)
# ── Segmentasi ───────────────────────────────────────────────────────
library(classInt) # Jenks Natural Breaks
library(factoextra) # visualisasi clustering (opsional)
# ── Tabel output ─────────────────────────────────────────────────────
library(knitr)
library(kableExtra)
# Pastikan fungsi-fungsi kritis memakai namespace yang benar
select <- dplyr::select
recode <- dplyr::recode# ============================================================
# JAGS adalah software eksternal yang HARUS diinstal terpisah
# sebelum package saeHB / rjags bisa digunakan.
#
# Langkah instalasi JAGS:
# 1. Download installer dari:
# https://sourceforge.net/projects/mcmc-jags/files/JAGS/4.x/
# 2. Pilih file: JAGS-4.x.y.exe (Windows)
# atau via brew/apt (Mac/Linux)
# 3. Instal seperti biasa, RESTART R sesudahnya
# 4. Lalu jalankan: install.packages("rjags"); install.packages("saeHB")
# ============================================================
# Deteksi otomatis apakah JAGS + saeHB tersedia
has_jags <- tryCatch({
library(rjags)
library(saeHB)
TRUE
}, error = function(e) {
message("⚠ saeHB/JAGS tidak tersedia — skenario HB Beta akan dilewati.")
message(" Instal JAGS dari: https://sourceforge.net/projects/mcmc-jags/files/")
FALSE
})
cat("Status JAGS/saeHB:", ifelse(has_jags, "✔ Tersedia", "✘ Tidak tersedia"), "\n")## Status JAGS/saeHB: ✔ Tersedia
# ── Raw data ─────────────────────────────────────────────────────────
Estimasi_531 <- read_excel("C:/Users/User/Downloads/Hasil_sulawesi_KabKota.xlsx")
Podes_kab2024 <- read_excel("C:/Users/User/Downloads/Podes_kab_lengkap.xlsx")
Data_Penduduk <- read_excel("C:/Users/User/Downloads/Data_Penduduk.xlsx")
# ── Samakan format kode ───────────────────────────────────────────────
Estimasi_531 <- Estimasi_531 %>% mutate(Kako = sprintf("%04d", Kako))
Podes_kab2024 <- Podes_kab2024 %>% mutate(kode_kab = sprintf("%04s", kode_kab))
# ── Gabung Estimasi + PODES ───────────────────────────────────────────
df_gabungan <- Estimasi_531 %>%
left_join(Podes_kab2024, by = c("Kako" = "kode_kab")) %>%
dplyr::select(Kako, Estimasi, RSE, starts_with("X"))
# ── Gabung dengan data penduduk ───────────────────────────────────────
df_gabungan$Kako <- as.numeric(df_gabungan$Kako)
df_final <- df_gabungan %>%
left_join(Data_Penduduk, by = "Kako")
cat("Jumlah domain:", nrow(df_final), "\n")## Jumlah domain: 81
## Jumlah variabel: 112
df_final <- df_final %>%
mutate(
# Per 1.000 penduduk
X6 = X6_jumlah / Jumlah_Penduduk * 1000,
X7 = X7_jumlah / Jumlah_Penduduk * 1000,
X10 = X10_jumlah / Jumlah_Penduduk * 1000,
X33 = X33_jumlah / Jumlah_Penduduk * 1000,
# Per 1.000 perempuan
X11 = X11_jumlah / Jumlah_P * 1000,
X12 = X12_jumlah / Jumlah_P * 1000,
X27 = X27_jumlah / Jumlah_P * 1000,
X28 = X28_jumlah / Jumlah_P * 1000,
X29 = X29_jumlah / Jumlah_P * 1000,
X30 = X30_jumlah / Jumlah_P * 1000,
X31 = X31_jumlah / Jumlah_P * 1000,
X35 = X35_jumlah / Jumlah_P * 1000,
X36 = X36_jumlah / Jumlah_P * 1000
)Pipeline seleksi: Korelasi tertinggi per kelompok → Backward Stepwise (AIC) → VIF iteratif (≤10) → Anti-overfit (R² ≤ 0.8)
df <- df_final
exclude_vars <- c("Kako","Estimasi","RSE",
"y_eblup","mse","RMSE_direct","RMSE_eblup",
"RSE_direct","RSE_eblup","y_hb","sd_hb",
"RSE_hb","y_hb_percent","vardir",
"Jumlah_L","Jumlah_P","Jumlah_Penduduk")
all_vars <- setdiff(names(df), exclude_vars)
# Ambil basis nama (X1, X2, dst)
get_base <- function(x) sub("(_jumlah|_mean)$", "", x)
base_vars <- unique(sapply(all_vars, get_base))
hasil_vars <- c()
for (b in base_vars) {
group_vars <- all_vars[get_base(all_vars) == b]
cor_vals <- sapply(group_vars, function(v)
cor(df[[v]], df$Estimasi, use = "complete.obs"))
best_var <- names(which.max(abs(cor_vals)))
hasil_vars <- c(hasil_vars, best_var)
}
# Data kandidat
df_gabungan <- df %>%
dplyr::select(Kako, Estimasi, RSE, all_of(hasil_vars))
cat("Variabel kandidat terpilih (", length(hasil_vars), "):\n")## Variabel kandidat terpilih ( 40 ):
## X1_mean, X2, X8, X9, X13_jumlah, X14_jumlah, X15_mean, X16, X17, X18_mean, X19, X21_mean, X22_jumlah, X23_mean, X24_mean, X25_mean, X26, X32, X37_mean, X38_mean, X39_mean, X40_mean, X41_mean, X43, X44, X45, X34_jumlah, X6_mean, X7_jumlah, X10_mean, X11_mean, X12, X27_mean, X28, X29_mean, X30_mean, X31_mean, X33_mean, X35_mean, X36_mean
df_model <- df_gabungan %>%
dplyr::select(-Kako, -RSE) %>%
as.data.frame()
# Backward stepwise
full_model <- lm(Estimasi ~ ., data = df_model)
step_model <- stepAIC(full_model, direction = "backward", trace = FALSE)
vars_global <- names(coef(step_model))[-1]
# VIF iteratif
repeat {
if (length(vars_global) <= 1) break
form_tmp <- as.formula(paste("Estimasi ~", paste(vars_global, collapse = " + ")))
model_tmp <- lm(form_tmp, data = df_model)
vif_values <- vif(model_tmp)
if (all(vif_values <= 10)) break
var_remove <- names(which.max(vif_values))
cat("Hapus (VIF):", var_remove, "=", round(max(vif_values),2), "\n")
vars_global <- vars_global[vars_global != var_remove]
}## Hapus (VIF): X17 = 21.14
##
## Variabel global final: X8 X9 X13_jumlah X15_mean X19 X21_mean X22_jumlah X40_mean X43 X44 X45 X33_mean
Fungsi-fungsi berikut dipakai berulang di seluruh skenario untuk menjaga konsistensi kode.
#' Seleksi variabel untuk satu segmen data
#'
#' @param df data.frame segmen
#' @param y_col nama kolom response (karakter)
#' @param excl_cols kolom yang tidak boleh masuk sebagai prediktor
#' @param max_top jumlah variabel korelasi awal (default 5)
#' @param r2_cap batas R² (default 0.8)
#' @return karakter vektor nama variabel terpilih
seleksi_vars <- function(df, y_col = "y_logit",
excl_cols = c("Estimasi","Kako","RSE","vardir",
"kdprov","y_prop","y_logit"),
max_top = 5, r2_cap = 0.8) {
# ── 1. Buang kolom konstan ─────────────────────────────────────────
keep <- sapply(df, function(x) length(unique(na.omit(x))) > 1)
df <- df[, keep]
kandidat <- setdiff(names(df), excl_cols)
if (length(kandidat) == 0) return(character(0))
# ── 2. Korelasi ────────────────────────────────────────────────────
cor_vals <- sapply(df[kandidat], function(x)
abs(cor(x, df[[y_col]], use = "complete.obs")))
cor_vals <- cor_vals[!is.na(cor_vals)]
if (length(cor_vals) == 0) return(character(0))
vars_awal <- names(sort(cor_vals, decreasing = TRUE))[1:min(max_top, length(cor_vals))]
vars_awal <- vars_awal[!is.na(vars_awal)]
if (length(vars_awal) == 0) return(character(0))
# ── 3. Stepwise ────────────────────────────────────────────────────
vars <- vars_awal
step <- try(stepAIC(
lm(as.formula(paste(y_col, "~", paste(vars_awal, collapse = "+"))), data = df),
direction = "backward", trace = FALSE), silent = TRUE)
if (!inherits(step, "try-error"))
vars <- names(coef(step))[-1]
vars <- vars[!is.na(vars)]
if (length(vars) == 0) vars <- vars_awal[1]
# ── 4. VIF iteratif ────────────────────────────────────────────────
repeat {
if (length(vars) <= 1) break
m <- try(lm(as.formula(paste(y_col, "~", paste(vars, collapse = "+"))), data = df),
silent = TRUE)
if (inherits(m, "try-error")) { vars <- vars[1]; break }
v <- try(vif(m), silent = TRUE)
if (inherits(v, "try-error")) { vars <- vars[1]; break }
# Hapus NA sebelum evaluasi — vif() bisa return NA pada kolom near-constant
v_clean <- v[!is.na(v)]
if (length(v_clean) == 0 || all(v_clean <= 10)) break
vars <- vars[vars != names(which.max(v_clean))]
}
# ── 5. Anti-overfit ────────────────────────────────────────────────
if (length(vars) > 1) {
m <- lm(as.formula(paste(y_col, "~", paste(vars, collapse = "+"))), data = df)
if (summary(m)$r.squared > r2_cap) vars <- vars[1]
}
return(vars)
}#' Jalankan EBLUP Fay-Herriot untuk satu segmen
#'
#' @param df data.frame segmen (harus punya kolom Estimasi, RSE, vardir)
#' @param vars karakter vektor prediktor
#' @return df dengan kolom tambahan: y_eblup, mse, RMSE_eblup, RSE_eblup
run_eblup <- function(df, vars) {
df$vardir <- pmax(df$vardir, 1e-10) # stabilisasi, hindari 0 atau negatif
# scale() aman untuk 1 atau >1 variabel
df[vars] <- as.data.frame(scale(df[vars]))
# Ganti NaN dari scale (kolom konstan) dengan 0
df[vars][is.nan(as.matrix(df[vars]))] <- 0
form <- as.formula(paste("Estimasi ~", paste(vars, collapse = "+")))
m <- try(mseFH(form, vardir = vardir, data = df), silent = TRUE)
if (inherits(m, "try-error") || is.null(m$mse) || is.null(m$est)) {
df$y_eblup <- df$Estimasi
df$mse <- df$vardir
} else {
df$y_eblup <- as.numeric(m$est$eblup)
df$mse <- as.numeric(m$mse)
}
# Cegah NaN dari mse negatif kecil (floating point)
df$mse <- pmax(df$mse, 0)
df$RMSE_direct <- sqrt(df$vardir)
df$RMSE_eblup <- sqrt(df$mse)
df$RSE_direct <- df$RMSE_direct / pmax(df$Estimasi, 1e-6) * 100
df$RSE_eblup <- df$RMSE_eblup / pmax(df$y_eblup, 1e-6) * 100
return(df)
}#' Jalankan EBLUP dengan transformasi logit untuk satu segmen
#'
#' @param df data.frame segmen
#' @param vars karakter vektor prediktor (diseleksi pada skala logit)
#' @return df dengan kolom tambahan: y_logit_bt, mse_logit, RMSE_logit, RSE_logit
run_eblup_logit <- function(df, vars) {
eps <- 1e-4
df$y_prop <- df$Estimasi / 100
df$y_prop <- pmax(pmin(df$y_prop, 1 - eps), eps)
df$y_logit <- log(df$y_prop / (1 - df$y_prop))
# Vardir pada skala logit (delta method)
df$vardir <- pmax(df$vardir, 1e-10)
df$vardir_logit <- df$vardir / (df$y_prop * (1 - df$y_prop))^2
df$vardir_logit <- pmax(df$vardir_logit, 1e-10)
# scale() aman untuk 1 atau >1 variabel
df[vars] <- as.data.frame(scale(df[vars]))
df[vars][is.nan(as.matrix(df[vars]))] <- 0
form <- as.formula(paste("y_logit ~", paste(vars, collapse = "+")))
m <- try(mseFH(form, vardir = vardir_logit, data = df), silent = TRUE)
if (inherits(m, "try-error") || is.null(m$mse) || is.null(m$est)) {
df$eblup_logit <- df$y_logit
df$mse_logit <- df$vardir_logit
} else {
df$eblup_logit <- as.numeric(m$est$eblup)
df$mse_logit <- as.numeric(m$mse)
}
df$mse_logit <- pmax(df$mse_logit, 0) # cegah NaN
# Back-transform ke skala proporsi → persen
df$y_logit_bt <- plogis(df$eblup_logit) * 100
# MSE back-transform (delta method: g'(x) = p*(1-p))
p_bt <- plogis(df$eblup_logit)
df$mse_logit_bt <- pmax(df$mse_logit * (p_bt * (1 - p_bt))^2, 0)
df$RMSE_logit <- sqrt(df$mse_logit_bt)
df$RSE_logit <- df$RMSE_logit / pmax(df$y_logit_bt, 1e-6) * 100
# ── RSE_direct (wajib ada untuk plot & summary) ──────────────────────
df$RMSE_direct <- sqrt(df$vardir)
df$RSE_direct <- df$RMSE_direct / pmax(df$Estimasi, 1e-6) * 100
return(df)
}#' Jalankan HB Beta untuk satu segmen
#'
#' Membutuhkan JAGS + saeHB. Jika tidak tersedia, fallback ke direct estimator.
#' @param df data.frame segmen
#' @param vars karakter vektor prediktor
#' @param has_jags logical flag — deteksi otomatis dari chunk cek-jags
#' @return df dengan kolom tambahan: y_hb_pct, sd_hb, RSE_hb
run_hbbeta <- function(df, vars, has_jags = get("has_jags", envir = .GlobalEnv)) {
# ── Fallback jika JAGS tidak terinstal ──────────────────────────────
if (!has_jags) {
df$y_hb_pct <- df$Estimasi
df$sd_hb <- NA_real_
df$RSE_hb <- df$RSE
return(df)
}
eps <- 1e-4
df$y_prop <- df$Estimasi / 100
df$y_prop <- pmax(pmin(df$y_prop, 1 - eps), eps)
df[vars] <- as.data.frame(scale(df[vars]))
df[vars][is.nan(as.matrix(df[vars]))] <- 0
form_hb <- as.formula(paste("y_prop ~", paste(vars, collapse = "+")))
hb <- try(saeHB::Beta(
formula = form_hb,
data = df,
iter.update = 10, # ↓ dari 75 → default sudah cukup
iter.mcmc = 4000, # ↓ dari 10000
thin = 10, # ↓ dari 40 → ini penghematan terbesar
burn.in = 1000 # ↓ dari 2000
), silent = TRUE)
if (inherits(hb, "try-error") || is.null(hb$Est)) {
message(" HB Beta gagal konvergensi → fallback ke direct")
df$y_hb_pct <- df$Estimasi
df$sd_hb <- NA_real_
df$RSE_hb <- df$RSE
} else {
df$y_hb_pct <- hb$Est$MEAN * 100
df$sd_hb <- hb$Est$SD
df$RSE_hb <- df$sd_hb / pmax(df$y_hb_pct / 100, 1e-6) * 100
}
return(df)
}#' Plot perbandingan RSE antar metode
#'
#' @param data data.frame dengan kolom Kako dan dua kolom RSE
#' @param col_rse1 nama kolom RSE pertama
#' @param col_rse2 nama kolom RSE kedua
#' @param lab1 label metode 1
#' @param lab2 label metode 2
#' @param title judul plot
plot_rse <- function(data, col_rse1, col_rse2, lab1, lab2, title) {
# ── Guard: pastikan kedua kolom ada ──────────────────────────────────
missing_cols <- setdiff(c(col_rse1, col_rse2), names(data))
if (length(missing_cols) > 0) {
message("⚠ plot_rse(): kolom tidak ditemukan → ", paste(missing_cols, collapse = ", "))
return(invisible(NULL))
}
df_plt <- data %>%
dplyr::select(Kako, dplyr::all_of(c(col_rse1, col_rse2))) %>%
pivot_longer(-Kako, names_to = "Metode", values_to = "RSE") %>%
mutate(
Metode = case_when( # case_when: bebas konflik namespace
Metode == col_rse1 ~ lab1,
Metode == col_rse2 ~ lab2,
TRUE ~ Metode
),
Kako = as.character(Kako)
) %>%
filter(!is.na(RSE))
ggplot(df_plt, aes(x = Kako, y = RSE, group = Metode, color = Metode)) +
geom_line(linewidth = 0.8, alpha = 0.85) +
geom_point(size = 1.6) +
geom_hline(yintercept = 25, linetype = "dashed",
color = "#c0392b", linewidth = 0.9) +
annotate("text", x = 1, y = 26.5, label = "Batas RSE 25%",
color = "#c0392b", hjust = 0, size = 3.2) +
scale_color_manual(values = c("#2980b9","#27ae60","#e67e22",
"#8e44ad","#c0392b")[1:2]) +
labs(title = title,
x = "Kabupaten/Kota", y = "RSE (%)", color = NULL) +
theme_minimal(base_size = 12) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7),
legend.position = "top",
plot.title = element_text(face = "bold", size = 13)
)
}
#' Hitung % domain dengan RSE < 25
pct_rse_ok <- function(x) round(mean(x < 25, na.rm = TRUE) * 100, 1)
#' Tabel distribusi RSE
tabel_rse <- function(data, var_rse, label_metode) {
data.frame(
Metode = label_metode,
`RSE >= 25` = sum(data[[var_rse]] >= 25, na.rm = TRUE),
`RSE < 25` = sum(data[[var_rse]] < 25, na.rm = TRUE),
`% OK` = pct_rse_ok(data[[var_rse]]),
check.names = FALSE
)
}# Variance sampling
df_s1 <- df_gabungan %>%
mutate(vardir = (RSE / 100 * Estimasi)^2) %>%
as.data.frame()
# Seleksi variabel & run EBLUP
vars_s1 <- seleksi_vars(df_s1, y_col = "Estimasi",
excl_cols = c("Kako","RSE","vardir","Estimasi"))
df_s1 <- run_eblup(df_s1, vars_s1)
df_eblup_global <- df_s1
cat("Variabel terpilih:", vars_s1, "\n")## Variabel terpilih: X45 X28
## Estimasi y_eblup RSE_direct RSE_eblup
## Min. : 0.08 Min. : 0.0836 Min. : 22.9 Min. :21.7
## 1st Qu.: 6.18 1st Qu.: 6.8055 1st Qu.: 38.3 1st Qu.:25.9
## Median : 9.32 Median : 8.8513 Median : 44.4 Median :28.2
## Mean :10.08 Mean : 8.2477 Mean : 47.6 Mean :32.6
## 3rd Qu.:13.93 3rd Qu.:10.1615 3rd Qu.: 53.5 3rd Qu.:32.3
## Max. :24.05 Max. :13.6089 Max. :101.7 Max. :97.3
plot_rse(df_eblup_global, "RSE_direct", "RSE_eblup",
"Direct", "EBLUP Global",
"Skenario 1: RSE Direct vs EBLUP Global (Pulau)")Skenario 1 — EBLUP Global vs Direct
df_all_s2 <- df_gabungan %>%
mutate(kdprov = substr(Kako, 1, 2),
vardir = (RSE / 100 * Estimasi)^2) %>%
as.data.frame()
list_prov <- sort(unique(df_all_s2$kdprov))
hasil_list_s2 <- list()
for (p in list_prov) {
df_p <- filter(df_all_s2, kdprov == p)
if (nrow(df_p) < 5) next
vars_p <- seleksi_vars(df_p, y_col = "Estimasi",
excl_cols = c("Kako","RSE","vardir","Estimasi","kdprov"),
max_top = 3)
if (length(vars_p) == 0) next
hasil_list_s2[[p]] <- run_eblup(df_p, vars_p)
}
df_eblup_prov <- bind_rows(hasil_list_s2)
summary(df_eblup_prov[, c("Estimasi","y_eblup","RSE_direct","RSE_eblup")])## Estimasi y_eblup RSE_direct RSE_eblup
## Min. : 0.08 Min. : 0.0817 Min. : 22.9 Min. : 18.6
## 1st Qu.: 6.18 1st Qu.: 6.6901 1st Qu.: 38.3 1st Qu.: 20.9
## Median : 9.32 Median : 8.7997 Median : 44.4 Median : 27.4
## Mean :10.08 Mean : 8.6514 Mean : 47.6 Mean : 35.6
## 3rd Qu.:13.93 3rd Qu.:11.8395 3rd Qu.: 53.5 3rd Qu.: 36.3
## Max. :24.05 Max. :16.5036 Max. :101.7 Max. :231.1
## NA's :13 NA's :13
plot_rse(df_eblup_prov, "RSE_direct", "RSE_eblup",
"Direct", "EBLUP Provinsi",
"Skenario 2: RSE Direct vs EBLUP per Provinsi")Skenario 2 — EBLUP Provinsi vs Direct
Transformasi logit: \(y_i^* = \log\!\left(\frac{y_i/100}{1 - y_i/100}\right)\), kemudian EBLUP pada skala \(y^*\), lalu back-transform \(\hat{y}_i = \text{logistic}(\hat{y}_i^*) \times 100\). Vardir pada skala logit dihitung dengan delta method: \(\sigma^2_{\text{logit}} = \sigma^2 / (p(1-p))^2\).
df_s5 <- df_gabungan %>%
mutate(vardir = (RSE / 100 * Estimasi)^2,
y_prop = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
y_logit = log(y_prop / (1 - y_prop))) %>%
as.data.frame()
vars_s5 <- seleksi_vars(df_s5, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir",
"Estimasi","y_prop","y_logit"))
df_s5 <- run_eblup_logit(df_s5, vars_s5)
df_eblup_logit_global <- df_s5
cat("Variabel terpilih:", vars_s5, "\n")## Variabel terpilih: X8 X45 X44
## Estimasi y_logit_bt RSE_direct RSE_logit
## Min. : 0.08 Min. : 2.84 Min. : 22.9 Min. : 8.3
## 1st Qu.: 6.18 1st Qu.: 8.98 1st Qu.: 38.3 1st Qu.:10.2
## Median : 9.32 Median :11.03 Median : 44.4 Median :11.5
## Mean :10.08 Mean :10.10 Mean : 47.6 Mean :13.0
## 3rd Qu.:13.93 3rd Qu.:12.12 3rd Qu.: 53.5 3rd Qu.:13.4
## Max. :24.05 Max. :13.94 Max. :101.7 Max. :36.2
plot_rse(df_eblup_logit_global, "RSE_direct", "RSE_logit",
"Direct", "EBLUP Logit Global",
"Skenario 5: RSE Direct vs EBLUP Logit Global (Pulau)")Skenario 5 — EBLUP Logit Global vs Direct
df_all_s6 <- df_gabungan %>%
mutate(kdprov = substr(Kako, 1, 2),
vardir = (RSE / 100 * Estimasi)^2,
y_prop = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
y_logit = log(y_prop / (1 - y_prop))) %>%
as.data.frame()
hasil_list_s6 <- list()
for (p in sort(unique(df_all_s6$kdprov))) {
df_p <- filter(df_all_s6, kdprov == p)
if (nrow(df_p) < 5) next
vars_p <- seleksi_vars(df_p, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir","Estimasi",
"y_prop","y_logit","kdprov"),
max_top = 3)
if (length(vars_p) == 0) next
hasil_list_s6[[p]] <- run_eblup_logit(df_p, vars_p)
}
df_eblup_logit_prov <- bind_rows(hasil_list_s6)
summary(df_eblup_logit_prov[, c("Estimasi","y_logit_bt","RSE_direct","RSE_logit")])## Estimasi y_logit_bt RSE_direct RSE_logit
## Min. : 0.08 Min. : 0.34 Min. : 22.9 Min. : 15.7
## 1st Qu.: 6.18 1st Qu.: 7.29 1st Qu.: 38.3 1st Qu.: 21.2
## Median : 9.32 Median :10.50 Median : 44.4 Median : 26.0
## Mean :10.08 Mean :10.27 Mean : 47.6 Mean : 28.5
## 3rd Qu.:13.93 3rd Qu.:13.20 3rd Qu.: 53.5 3rd Qu.: 34.8
## Max. :24.05 Max. :26.02 Max. :101.7 Max. :102.5
plot_rse(df_eblup_logit_prov, "RSE_direct", "RSE_logit",
"Direct", "EBLUP Logit Provinsi",
"Skenario 6: RSE Direct vs EBLUP Logit per Provinsi")Skenario 6 — EBLUP Logit Provinsi vs Direct
Prasyarat JAGS: Skenario HB Beta memerlukan software
JAGS yang diinstal terpisah di sistem operasi.
Download: https://sourceforge.net/projects/mcmc-jags/files/JAGS/4.x/
Pilih JAGS-4.x.y.exe (Windows), instal, lalu
restart R, kemudian jalankan:
install.packages("rjags"); install.packages("saeHB")
Jika JAGS belum tersedia, chunk di bawah akan otomatis fallback
ke direct estimator.
if (!has_jags) {
message("⚠ JAGS tidak tersedia — Skenario 3 dilewati, menggunakan direct estimator sebagai placeholder.")
df_hb_global <- df_gabungan %>%
filter(substr(Kako, 1, 2) %in% c("71","72","73","74","75","76")) %>%
mutate(vardir = (RSE / 100 * Estimasi)^2,
y_hb_pct = Estimasi,
sd_hb = NA_real_,
RSE_hb = RSE) %>%
as.data.frame()
} else {
df_s3 <- df_gabungan %>%
filter(substr(Kako, 1, 2) %in% c("71","72","73","74","75","76")) %>%
mutate(vardir = (RSE / 100 * Estimasi)^2,
y_prop = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
y_logit = log(y_prop / (1 - y_prop))) %>%
as.data.frame()
vars_s3 <- seleksi_vars(df_s3, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir",
"Estimasi","y_prop","y_logit"))
df_s3 <- run_hbbeta(df_s3, vars_s3)
df_hb_global <- df_s3
cat("Variabel terpilih:", vars_s3, "\n")
}## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.3071 0.0226 -2.3519 -2.3232 -2.3075 -2.2914 -2.26
## X8 0.3229 0.0283 0.2686 0.3027 0.3237 0.3436 0.38
## X45 -0.2677 0.0266 -0.3191 -0.2865 -0.2676 -0.2480 -0.22
## X44 0.1449 0.0318 0.0860 0.1239 0.1459 0.1662 0.21
## Variabel terpilih: X8 X45 X44
## Estimasi y_hb_pct RSE RSE_hb
## Min. : 0.08 Min. : 2.13 Min. : 22.9 Min. :19.7
## 1st Qu.: 6.18 1st Qu.: 7.56 1st Qu.: 38.3 1st Qu.:22.1
## Median : 9.32 Median :10.07 Median : 44.4 Median :23.9
## Mean :10.08 Mean :10.11 Mean : 47.6 Mean :25.6
## 3rd Qu.:13.93 3rd Qu.:12.60 3rd Qu.: 53.5 3rd Qu.:28.0
## Max. :24.05 Max. :18.00 Max. :101.7 Max. :43.5
# Pastikan RSE_direct tersedia untuk scatter plot komparatif
if (!"RSE_direct" %in% names(df_hb_global)) {
df_hb_global <- df_hb_global %>%
mutate(vardir_tmp = (RSE / 100 * Estimasi)^2,
RSE_direct = sqrt(vardir_tmp) / pmax(Estimasi, 1e-6) * 100) %>%
dplyr::select(-vardir_tmp)
}plot_rse(df_hb_global, "RSE", "RSE_hb",
"Direct", "HB Beta Global",
"Skenario 3: RSE Direct vs HB Beta Global (Pulau)")Skenario 3 — HB Beta Global vs Direct
if (!has_jags) {
message("⚠ JAGS tidak tersedia — Skenario 4 dilewati, menggunakan direct estimator sebagai placeholder.")
df_hb_prov <- df_gabungan %>%
mutate(kdprov = substr(Kako, 1, 2),
vardir = (RSE / 100 * Estimasi)^2,
y_hb_pct = Estimasi,
sd_hb = NA_real_,
RSE_hb = RSE) %>%
as.data.frame()
} else {
df_all_s4 <- df_gabungan %>%
mutate(kdprov = substr(Kako, 1, 2),
vardir = (RSE / 100 * Estimasi)^2,
y_prop = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
y_logit = log(y_prop / (1 - y_prop))) %>%
as.data.frame()
hasil_list_s4 <- list()
for (p in sort(unique(df_all_s4$kdprov))) {
df_p <- filter(df_all_s4, kdprov == p)
if (nrow(df_p) < 5) next
vars_p <- tryCatch(
seleksi_vars(df_p, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir","Estimasi",
"y_prop","y_logit","kdprov"), max_top = 5),
error = function(e) {
message(" seleksi_vars gagal untuk provinsi ", p, ": ", conditionMessage(e))
character(0)
}
)
if (length(vars_p) == 0) next
hasil <- tryCatch(
run_hbbeta(df_p, vars_p),
error = function(e) {
message(" run_hbbeta gagal untuk provinsi ", p, ": ", conditionMessage(e))
df_p %>%
mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
}
)
hasil_list_s4[[p]] <- hasil
}
# Pastikan df_hb_prov selalu terbentuk meski semua provinsi gagal
if (length(hasil_list_s4) > 0) {
df_hb_prov <- bind_rows(hasil_list_s4)
} else {
message(" Semua provinsi gagal — df_hb_prov menggunakan direct estimator.")
df_hb_prov <- df_all_s4 %>%
mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
}
}## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.2460 0.0516 -2.3466 -2.2802 -2.2471 -2.2095 -2.15
## X30_mean -0.8004 0.0829 -0.9584 -0.8600 -0.8014 -0.7393 -0.65
## X8 0.4034 0.0682 0.2559 0.3591 0.4076 0.4530 0.52
## X44 0.4906 0.0808 0.3358 0.4330 0.4921 0.5416 0.66
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.3647 0.0421 -2.4435 -2.3921 -2.3632 -2.3369 -2.28
## X28 0.3973 0.0417 0.3221 0.3672 0.3977 0.4259 0.48
## X33_mean -0.3473 0.0477 -0.4424 -0.3821 -0.3467 -0.3153 -0.26
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.5070 0.0441 -2.5816 -2.5424 -2.5070 -2.4769 -2.42
## X26 -0.3025 0.0353 -0.3710 -0.3270 -0.3043 -0.2761 -0.24
## X25_mean -0.1892 0.0430 -0.2671 -0.2180 -0.1886 -0.1620 -0.10
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.2502 0.0378 -2.3215 -2.2763 -2.2524 -2.2272 -2.17
## X29_mean -0.4161 0.0359 -0.4838 -0.4380 -0.4168 -0.3937 -0.34
## X21_mean -0.2108 0.0413 -0.2837 -0.2378 -0.2117 -0.1838 -0.13
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.7863 0.0240 -2.8332 -2.8018 -2.7851 -2.7710 -2.74
## X24_mean -1.5214 0.0519 -1.6271 -1.5590 -1.5217 -1.4850 -1.43
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.1182 0.0391 -2.1835 -2.1447 -2.1195 -2.0954 -2.03
## X24_mean -0.5063 0.0475 -0.5938 -0.5384 -0.5068 -0.4750 -0.42
## Estimasi y_hb_pct RSE RSE_hb
## Min. : 0.08 Min. : 0.367 Min. : 22.9 Min. : 0.721
## 1st Qu.: 6.18 1st Qu.: 6.565 1st Qu.: 38.3 1st Qu.:10.281
## Median : 9.32 Median : 9.673 Median : 44.4 Median :18.301
## Mean :10.08 Mean :10.109 Mean : 47.6 Mean :17.275
## 3rd Qu.:13.93 3rd Qu.:13.559 3rd Qu.: 53.5 3rd Qu.:23.691
## Max. :24.05 Max. :23.953 Max. :101.7 Max. :39.792
# Pastikan RSE_direct tersedia untuk scatter plot komparatif
if (!"RSE_direct" %in% names(df_hb_prov)) {
df_hb_prov <- df_hb_prov %>%
mutate(vardir_tmp = (RSE / 100 * Estimasi)^2,
RSE_direct = sqrt(vardir_tmp) / pmax(Estimasi, 1e-6) * 100) %>%
dplyr::select(-vardir_tmp)
}plot_rse(df_hb_prov, "RSE", "RSE_hb",
"Direct", "HB Beta Provinsi",
"Skenario 4: RSE Direct vs HB Beta per Provinsi")Skenario 4 — HB Beta Provinsi vs Direct
Jenks Natural Breaks: Domain dibagi ke dalam 2 kelompok berdasarkan RSE direct estimator. Kelompok 1 = RSE rendah (data lebih kuat), Kelompok 2 = RSE tinggi (data lebih lemah). Tiga model dijalankan pada masing-masing kelompok.
library(classInt)
# Hitung RSE direct
df_seg_rse <- df_gabungan %>%
mutate(vardir = (RSE / 100 * Estimasi)^2,
RSE_direct = sqrt(vardir) / Estimasi * 100) %>%
as.data.frame()
# Jenks Natural Breaks (k = 2)
breaks_jenks <- classIntervals(df_seg_rse$RSE_direct,
n = 2, style = "jenks")
df_seg_rse$grup_rse <- cut(df_seg_rse$RSE_direct,
breaks = breaks_jenks$brks,
labels = c("Grup 1 (RSE Rendah)", "Grup 2 (RSE Tinggi)"),
include.lowest = TRUE)
# Ringkasan grup
table_grup <- df_seg_rse %>%
group_by(grup_rse) %>%
summarise(
N = n(),
RSE_min = round(min(RSE_direct), 2),
RSE_max = round(max(RSE_direct), 2),
RSE_median = round(median(RSE_direct), 2),
.groups = "drop"
)
kable(table_grup, caption = "Distribusi Domain per Grup RSE (Jenks Natural Breaks)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"))| grup_rse | N | RSE_min | RSE_max | RSE_median |
|---|---|---|---|---|
| Grup 1 (RSE Rendah) | 62 | 22.94 | 53.58 | 41.78 |
| Grup 2 (RSE Tinggi) | 19 | 55.28 | 101.66 | 63.27 |
ggplot(df_seg_rse, aes(x = RSE_direct, fill = grup_rse)) +
geom_histogram(bins = 30, color = "white", alpha = 0.85) +
geom_vline(xintercept = breaks_jenks$brks[2], linetype = "dashed",
color = "#c0392b", linewidth = 1) +
scale_fill_manual(values = c("#2980b9","#e67e22")) +
labs(title = "Distribusi RSE Direct — Segmentasi Jenks Natural Breaks",
subtitle = paste0("Break point: ", round(breaks_jenks$brks[2], 2), "%"),
x = "RSE Direct (%)", y = "Jumlah Domain", fill = NULL) +
theme_minimal(base_size = 12) +
theme(legend.position = "top")Distribusi RSE Direct per Grup Jenks
list_grup <- levels(df_seg_rse$grup_rse)
hasil_s7 <- list()
for (g in list_grup) {
cat("\n===", g, "===\n")
df_g <- filter(df_seg_rse, grup_rse == g)
if (nrow(df_g) < 4) { cat("Skip: domain terlalu sedikit\n"); next }
vars_g <- seleksi_vars(df_g, y_col = "Estimasi",
excl_cols = c("Kako","RSE","vardir","Estimasi",
"RSE_direct","grup_rse"),
max_top = 5)
if (length(vars_g) == 0) next
cat("Variabel:", vars_g, "\n")
hasil_s7[[g]] <- run_eblup(df_g, vars_g)
}##
## === Grup 1 (RSE Rendah) ===
## Variabel: X19 X27_mean
##
## === Grup 2 (RSE Tinggi) ===
## Variabel: X45
df_eblup_rse <- bind_rows(hasil_s7)
summary(df_eblup_rse[, c("Estimasi","y_eblup","RSE_direct","RSE_eblup")])## Estimasi y_eblup RSE_direct RSE_eblup
## Min. : 0.08 Min. : 0.087 Min. : 22.9 Min. : 12.1
## 1st Qu.: 6.18 1st Qu.: 4.702 1st Qu.: 38.3 1st Qu.: 13.1
## Median : 9.32 Median : 9.354 Median : 44.4 Median : 15.7
## Mean :10.08 Mean : 8.017 Mean : 47.6 Mean : 26.3
## 3rd Qu.:13.93 3rd Qu.:10.854 3rd Qu.: 53.5 3rd Qu.: 39.1
## Max. :24.05 Max. :12.622 Max. :101.7 Max. :108.7
plot_rse(df_eblup_rse, "RSE_direct", "RSE_eblup",
"Direct", "EBLUP (RSE group)",
"Skenario 7: RSE Direct vs EBLUP per Grup RSE")Skenario 7 — EBLUP per Grup RSE
df_seg_rse_logit <- df_seg_rse %>%
mutate(y_prop = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
y_logit = log(y_prop / (1 - y_prop)))
hasil_s8 <- list()
for (g in list_grup) {
cat("\n===", g, "===\n")
df_g <- filter(df_seg_rse_logit, grup_rse == g)
if (nrow(df_g) < 4) next
vars_g <- seleksi_vars(df_g, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir","Estimasi",
"RSE_direct","grup_rse",
"y_prop","y_logit"),
max_top = 5)
if (length(vars_g) == 0) next
cat("Variabel:", vars_g, "\n")
hasil_s8[[g]] <- run_eblup_logit(df_g, vars_g)
}##
## === Grup 1 (RSE Rendah) ===
## Variabel: X8 X29_mean
##
## === Grup 2 (RSE Tinggi) ===
## Variabel: X22_jumlah X28
df_eblup_logit_rse <- bind_rows(hasil_s8)
summary(df_eblup_logit_rse[, c("Estimasi","y_logit_bt","RSE_direct","RSE_logit")])## Estimasi y_logit_bt RSE_direct RSE_logit
## Min. : 0.08 Min. : 1.22 Min. : 22.9 Min. : 9.24
## 1st Qu.: 6.18 1st Qu.: 6.65 1st Qu.: 38.3 1st Qu.:10.81
## Median : 9.32 Median :11.36 Median : 44.4 Median :12.36
## Mean :10.08 Mean : 9.83 Mean : 47.6 Mean :18.88
## 3rd Qu.:13.93 3rd Qu.:12.95 3rd Qu.: 53.5 3rd Qu.:25.78
## Max. :24.05 Max. :14.25 Max. :101.7 Max. :45.23
plot_rse(df_eblup_logit_rse, "RSE_direct", "RSE_logit",
"Direct", "EBLUP Logit (RSE group)",
"Skenario 8: RSE Direct vs EBLUP Logit per Grup RSE")Skenario 8 — EBLUP Logit per Grup RSE
if (!has_jags) {
message("⚠ JAGS tidak tersedia — Skenario 9 dilewati.")
df_hb_rse <- df_seg_rse_logit %>%
mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
} else {
hasil_s9 <- list()
for (g in list_grup) {
cat("\n===", g, "===\n")
df_g <- filter(df_seg_rse_logit, grup_rse == g)
if (nrow(df_g) < 5) next
vars_g <- seleksi_vars(df_g, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir","Estimasi",
"RSE_direct","grup_rse","y_prop","y_logit"),
max_top = 5)
if (length(vars_g) == 0) next
cat("Variabel:", vars_g, "\n")
hasil_s9[[g]] <- run_hbbeta(df_g, vars_g)
}
df_hb_rse <- bind_rows(hasil_s9)
}##
## === Grup 1 (RSE Rendah) ===
## Variabel: X8 X29_mean
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.1044 0.0181 -2.1394 -2.1165 -2.1043 -2.0913 -2.07
## X8 0.1669 0.0197 0.1270 0.1534 0.1665 0.1800 0.21
## X29_mean -0.1376 0.0204 -0.1794 -0.1495 -0.1368 -0.1245 -0.10
##
## === Grup 2 (RSE Tinggi) ===
## Variabel: X22_jumlah X28
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -3.2395 0.0546 -3.3488 -3.2770 -3.2385 -3.2051 -3.13
## X22_jumlah 0.3645 0.0553 0.2565 0.3253 0.3709 0.3987 0.47
## X28 0.3481 0.0555 0.2420 0.3108 0.3453 0.3874 0.45
## Estimasi y_hb_pct RSE RSE_hb
## Min. : 0.08 Min. : 1.14 Min. : 22.9 Min. : 5.49
## 1st Qu.: 6.18 1st Qu.: 6.57 1st Qu.: 38.3 1st Qu.: 7.14
## Median : 9.32 Median : 9.41 Median : 44.4 Median : 8.90
## Mean :10.08 Mean :10.09 Mean : 47.6 Mean :14.23
## 3rd Qu.:13.93 3rd Qu.:13.82 3rd Qu.: 53.5 3rd Qu.:14.10
## Max. :24.05 Max. :23.63 Max. :101.7 Max. :42.66
plot_rse(df_hb_rse, "RSE", "RSE_hb",
"Direct", "HB Beta (RSE group)",
"Skenario 9: RSE Direct vs HB Beta per Grup RSE")Skenario 9 — HB Beta per Grup RSE
K-Means Clustering (k = 2): Domain dikelompokkan berdasarkan karakteristik wilayah menggunakan variabel auxiliary yang terpilih secara global. Data distandardisasi sebelum clustering. Jumlah cluster optimal dikonfirmasi dengan elbow method.
# Gunakan variabel yang terpilih di seleksi global
df_seg_clust <- df_gabungan %>%
mutate(vardir = (RSE / 100 * Estimasi)^2) %>%
as.data.frame()
# Standardisasi variabel auxiliary
vars_clust <- vars_global # variabel hasil seleksi global
mat_clust <- scale(df_seg_clust[, vars_clust])
mat_clust[is.nan(mat_clust)] <- 0
# ── Elbow method (opsional, matikan jika lambat) ─────────────────────
set.seed(42)
wss <- sapply(1:6, function(k)
kmeans(mat_clust, centers = k, nstart = 25, iter.max = 100)$tot.withinss)
ggplot(data.frame(k = 1:6, wss = wss), aes(x = k, y = wss)) +
geom_line(color = "#2980b9", linewidth = 1) +
geom_point(color = "#2980b9", size = 3) +
scale_x_continuous(breaks = 1:6) +
labs(title = "Elbow Method — Penentuan Jumlah Cluster",
x = "Jumlah Cluster (k)", y = "Total Within-Cluster SS") +
theme_minimal(base_size = 12)# ── K-Means k = 2 ───────────────────────────────────────────────────
set.seed(42)
km_result <- kmeans(mat_clust, centers = 2,
nstart = 50, iter.max = 200)
df_seg_clust$cluster <- paste0("Cluster ", km_result$cluster)
# Ringkasan cluster
table_clust <- df_seg_clust %>%
mutate(RSE_direct = sqrt(vardir) / Estimasi * 100) %>%
group_by(cluster) %>%
summarise(
N = n(),
Est_mean = round(mean(Estimasi), 2),
Est_sd = round(sd(Estimasi), 2),
RSE_direct_med = round(median(RSE_direct), 2),
.groups = "drop"
)
kable(table_clust,
caption = "Karakteristik Domain per Cluster (K-Means, k=2)") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed"))| cluster | N | Est_mean | Est_sd | RSE_direct_med |
|---|---|---|---|---|
| Cluster 1 | 71 | 10.95 | 5.32 | 43.59 |
| Cluster 2 | 10 | 3.85 | 3.14 | 57.45 |
df_seg_clust %>%
ggplot(aes(x = cluster, y = Estimasi, fill = cluster)) +
geom_boxplot(alpha = 0.8, outlier.color = "#c0392b") +
scale_fill_manual(values = c("#2980b9","#27ae60")) +
labs(title = "Distribusi Direct Estimasi per Cluster",
x = NULL, y = "Estimasi (%)", fill = NULL) +
theme_minimal(base_size = 12) +
theme(legend.position = "none")Distribusi Estimasi per Cluster
list_clust <- unique(df_seg_clust$cluster)
hasil_s10 <- list()
for (cl in list_clust) {
cat("\n===", cl, "===\n")
df_cl <- filter(df_seg_clust, cluster == cl)
if (nrow(df_cl) < 4) next
vars_cl <- seleksi_vars(df_cl, y_col = "Estimasi",
excl_cols = c("Kako","RSE","vardir",
"Estimasi","cluster"),
max_top = 5)
if (length(vars_cl) == 0) next
cat("Variabel:", vars_cl, "\n")
hasil_s10[[cl]] <- run_eblup(df_cl, vars_cl)
}##
## === Cluster 1 ===
## Variabel: X28 X45
##
## === Cluster 2 ===
## Variabel: X34_jumlah X25_mean
df_eblup_clust <- bind_rows(hasil_s10)
summary(df_eblup_clust[, c("Estimasi","y_eblup","RSE_direct","RSE_eblup")])## Estimasi y_eblup RSE_direct RSE_eblup
## Min. : 0.08 Min. : 1.33 Min. : 22.9 Min. :22.9
## 1st Qu.: 6.18 1st Qu.: 7.73 1st Qu.: 38.3 1st Qu.:26.7
## Median : 9.32 Median : 9.46 Median : 44.4 Median :28.7
## Mean :10.08 Mean : 9.12 Mean : 47.6 Mean :30.1
## 3rd Qu.:13.93 3rd Qu.:10.79 3rd Qu.: 53.5 3rd Qu.:31.0
## Max. :24.05 Max. :13.94 Max. :101.7 Max. :54.6
## NA's :10 NA's :10
plot_rse(df_eblup_clust, "RSE_direct", "RSE_eblup",
"Direct", "EBLUP (Cluster)",
"Skenario 10: RSE Direct vs EBLUP per Cluster")Skenario 10 — EBLUP per Cluster
df_seg_clust_logit <- df_seg_clust %>%
mutate(y_prop = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
y_logit = log(y_prop / (1 - y_prop)))
hasil_s11 <- list()
for (cl in list_clust) {
cat("\n===", cl, "===\n")
df_cl <- filter(df_seg_clust_logit, cluster == cl)
if (nrow(df_cl) < 4) next
vars_cl <- seleksi_vars(df_cl, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir","Estimasi",
"cluster","y_prop","y_logit"),
max_top = 5)
if (length(vars_cl) == 0) next
cat("Variabel:", vars_cl, "\n")
hasil_s11[[cl]] <- run_eblup_logit(df_cl, vars_cl)
}##
## === Cluster 1 ===
## Variabel: X45 X33_mean
##
## === Cluster 2 ===
## Variabel: X34_jumlah X11_mean
df_eblup_logit_clust <- bind_rows(hasil_s11)
summary(df_eblup_logit_clust[, c("Estimasi","y_logit_bt","RSE_direct","RSE_logit")])## Estimasi y_logit_bt RSE_direct RSE_logit
## Min. : 0.08 Min. : 2.10 Min. : 22.9 Min. : 8.74
## 1st Qu.: 6.18 1st Qu.: 9.09 1st Qu.: 38.3 1st Qu.:10.41
## Median : 9.32 Median :11.09 Median : 44.4 Median :11.76
## Mean :10.08 Mean :10.23 Mean : 47.6 Mean :16.41
## 3rd Qu.:13.93 3rd Qu.:12.15 3rd Qu.: 53.5 3rd Qu.:13.95
## Max. :24.05 Max. :15.35 Max. :101.7 Max. :54.65
plot_rse(df_eblup_logit_clust, "RSE_direct", "RSE_logit",
"Direct", "EBLUP Logit (Cluster)",
"Skenario 11: RSE Direct vs EBLUP Logit per Cluster")Skenario 11 — EBLUP Logit per Cluster
if (!has_jags) {
message("⚠ JAGS tidak tersedia — Skenario 12 dilewati.")
df_hb_clust <- df_seg_clust_logit %>%
mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
} else {
hasil_s12 <- list()
for (cl in list_clust) {
cat("\n===", cl, "===\n")
df_cl <- filter(df_seg_clust_logit, cluster == cl)
if (nrow(df_cl) < 5) next
vars_cl <- seleksi_vars(df_cl, y_col = "y_logit",
excl_cols = c("Kako","RSE","vardir","Estimasi",
"cluster","y_prop","y_logit"),
max_top = 5)
if (length(vars_cl) == 0) next
cat("Variabel:", vars_cl, "\n")
hasil_s12[[cl]] <- run_hbbeta(df_cl, vars_cl)
}
df_hb_clust <- bind_rows(hasil_s12)
}##
## === Cluster 1 ===
## Variabel: X45 X33_mean
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -2.1794 0.0194 -2.2213 -2.1921 -2.1788 -2.1649 -2.15
## X45 -0.1869 0.0221 -0.2302 -0.2008 -0.1865 -0.1740 -0.14
## X33_mean -0.1660 0.0209 -0.2059 -0.1792 -0.1665 -0.1522 -0.13
##
## === Cluster 2 ===
## Variabel: X34_jumlah X11_mean
## Mean SD 2.5% 25% 50% 75% 97.5%
## intercept -3.4338 0.0881 -3.5978 -3.4954 -3.4326 -3.3770 -3.26
## X34_jumlah 0.4964 0.0951 0.3161 0.4353 0.4979 0.5562 0.69
## X11_mean 0.1400 0.0950 -0.0352 0.0731 0.1470 0.2076 0.30
## Estimasi y_hb_pct RSE RSE_hb
## Min. : 0.08 Min. : 1.13 Min. : 22.9 Min. :16.5
## 1st Qu.: 6.18 1st Qu.: 7.22 1st Qu.: 38.3 1st Qu.:20.4
## Median : 9.32 Median : 9.67 Median : 44.4 Median :22.4
## Mean :10.08 Mean :10.09 Mean : 47.6 Mean :24.7
## 3rd Qu.:13.93 3rd Qu.:12.63 3rd Qu.: 53.5 3rd Qu.:25.9
## Max. :24.05 Max. :20.71 Max. :101.7 Max. :49.5
plot_rse(df_hb_clust, "RSE", "RSE_hb",
"Direct", "HB Beta (Cluster)",
"Skenario 12: RSE Direct vs HB Beta per Cluster")Skenario 12 — HB Beta per Cluster
# ── Kumpulkan distribusi RSE semua skenario ──────────────────────────
eval_rows <- list(
tabel_rse(df_gabungan %>% mutate(RSE_direct = RSE), "RSE_direct", "0. Direct"),
tabel_rse(df_eblup_global, "RSE_eblup", "1. EBLUP Global"),
tabel_rse(df_eblup_prov, "RSE_eblup", "2. EBLUP Provinsi"),
tabel_rse(df_hb_global, "RSE_hb", "3. HB Beta Global"),
tabel_rse(df_hb_prov, "RSE_hb", "4. HB Beta Provinsi"),
tabel_rse(df_eblup_logit_global, "RSE_logit", "5. EBLUP Logit Global"),
tabel_rse(df_eblup_logit_prov, "RSE_logit", "6. EBLUP Logit Provinsi"),
tabel_rse(df_eblup_rse, "RSE_eblup", "7. EBLUP RSE-group"),
tabel_rse(df_eblup_logit_rse, "RSE_logit", "8. EBLUP Logit RSE-group"),
tabel_rse(df_hb_rse, "RSE_hb", "9. HB Beta RSE-group"),
tabel_rse(df_eblup_clust, "RSE_eblup", "10. EBLUP Cluster"),
tabel_rse(df_eblup_logit_clust, "RSE_logit", "11. EBLUP Logit Cluster"),
tabel_rse(df_hb_clust, "RSE_hb", "12. HB Beta Cluster")
)
tabel_eval <- bind_rows(eval_rows)
kable(tabel_eval,
caption = "Perbandingan Distribusi RSE — 12 Skenario Model SAE",
col.names = c("Skenario","RSE ≥ 25","RSE < 25","% Domain OK")) %>%
kable_styling(bootstrap_options = c("striped","hover","condensed","bordered"),
full_width = FALSE) %>%
row_spec(which(tabel_eval$`% OK` == max(tabel_eval$`% OK`)),
bold = TRUE, background = "#d4edda") %>%
row_spec(1, background = "#fff3cd") # highlight direct| Skenario | RSE ≥ 25 | RSE < 25 | % Domain OK |
|---|---|---|---|
|
80 | 1 | 1.2 |
|
66 | 15 | 18.5 |
|
38 | 30 | 44.1 |
|
35 | 46 | 56.8 |
|
15 | 66 | 81.5 |
|
3 | 78 | 96.3 |
|
49 | 32 | 39.5 |
|
24 | 57 | 70.4 |
|
21 | 60 | 74.1 |
|
18 | 63 | 77.8 |
|
62 | 9 | 12.7 |
|
11 | 70 | 86.4 |
|
23 | 58 | 71.6 |
# Helper: hitung metrik akurasi terhadap direct
metrik <- function(est_model, est_direct, label) {
rb <- mean((est_model - est_direct) / est_direct, na.rm = TRUE) * 100
rmse <- sqrt(mean((est_model - est_direct)^2, na.rm = TRUE))
data.frame(
Skenario = label,
`RB (%)` = round(rb, 3),
`RMSE` = round(rmse, 4),
check.names = FALSE
)
}
metrik_rows <- list(
metrik(df_eblup_global$y_eblup, df_gabungan$Estimasi, "1. EBLUP Global"),
metrik(df_eblup_prov$y_eblup, df_eblup_prov$Estimasi, "2. EBLUP Provinsi"),
metrik(df_hb_global$y_hb_pct, df_hb_global$Estimasi, "3. HB Beta Global"),
metrik(df_hb_prov$y_hb_pct, df_hb_prov$Estimasi, "4. HB Beta Provinsi"),
metrik(df_eblup_logit_global$y_logit_bt, df_eblup_logit_global$Estimasi, "5. EBLUP Logit Global"),
metrik(df_eblup_logit_prov$y_logit_bt, df_eblup_logit_prov$Estimasi, "6. EBLUP Logit Provinsi"),
metrik(df_eblup_rse$y_eblup, df_eblup_rse$Estimasi, "7. EBLUP RSE-group"),
metrik(df_eblup_logit_rse$y_logit_bt, df_eblup_logit_rse$Estimasi, "8. EBLUP Logit RSE-group"),
metrik(df_hb_rse$y_hb_pct, df_hb_rse$Estimasi, "9. HB Beta RSE-group"),
metrik(df_eblup_clust$y_eblup, df_eblup_clust$Estimasi, "10. EBLUP Cluster"),
metrik(df_eblup_logit_clust$y_logit_bt, df_eblup_logit_clust$Estimasi, "11. EBLUP Logit Cluster"),
metrik(df_hb_clust$y_hb_pct, df_hb_clust$Estimasi, "12. HB Beta Cluster")
)
tabel_metrik <- bind_rows(metrik_rows)
kable(tabel_metrik,
caption = "Metrik Akurasi — Relative Bias & RMSE Terhadap Direct Estimator") %>%
kable_styling(bootstrap_options = c("striped","hover","condensed","bordered"),
full_width = FALSE) %>%
row_spec(which(abs(tabel_metrik$`RB (%)`) == min(abs(tabel_metrik$`RB (%)`))),
bold = TRUE, background = "#d4edda")| Skenario | RB (%) | RMSE |
|---|---|---|
|
-6.336 | 3.8559 |
|
-4.530 | 3.4774 |
|
66.831 | 2.3347 |
|
28.979 | 1.0322 |
|
98.898 | 4.7280 |
|
55.022 | 3.9476 |
|
-10.971 | 4.3171 |
|
36.878 | 4.1330 |
|
22.162 | 0.5763 |
|
-5.383 | 3.8960 |
|
71.278 | 4.7474 |
|
31.295 | 1.5462 |
tabel_eval_plot <- tabel_eval[-1, ] %>% # buang baris Direct
mutate(
Skenario = factor(Metode, levels = rev(Metode)),
Segmentasi = case_when(
grepl("Global|Provinsi", Metode) & !grepl("RSE|Cluster", Metode) ~ "Pulau / Provinsi",
grepl("RSE", Metode) ~ "RSE Natural Break",
grepl("Cluster", Metode) ~ "Clustering",
TRUE ~ "Lainnya"
)
)
ggplot(tabel_eval_plot, aes(x = Skenario, y = `% OK`, fill = Segmentasi)) +
geom_col(alpha = 0.88, width = 0.7) +
geom_text(aes(label = paste0(`% OK`, "%")),
hjust = -0.15, size = 3.5, fontface = "bold") +
geom_vline(xintercept = tabel_eval[1, "% OK"],
linetype = "dashed", color = "#c0392b", linewidth = 0.9) +
scale_fill_manual(values = c(
"Pulau / Provinsi" = "#2980b9",
"RSE Natural Break" = "#e67e22",
"Clustering" = "#27ae60"
)) +
scale_y_continuous(limits = c(0, 115), expand = c(0, 0)) +
coord_flip() +
labs(title = "Persentase Domain dengan RSE < 25% — 12 Skenario SAE",
subtitle = "Garis merah = baseline Direct Estimator",
x = NULL, y = "% Domain RSE < 25%", fill = "Segmentasi") +
theme_minimal(base_size = 12) +
theme(
legend.position = "top",
plot.title = element_text(face = "bold"),
panel.grid.major.y = element_blank()
)Persentase Domain dengan RSE < 25% per Skenario
# Gabung semua skenario dalam format panjang
df_scatter <- bind_rows(
df_eblup_global %>% mutate(Skenario = "1. EBLUP Global", RSE_model = RSE_eblup),
df_eblup_prov %>% mutate(Skenario = "2. EBLUP Provinsi", RSE_model = RSE_eblup),
df_hb_global %>% mutate(Skenario = "3. HB Beta Global", RSE_model = RSE_hb, RSE_direct = RSE),
df_hb_prov %>% mutate(Skenario = "4. HB Beta Provinsi", RSE_model = RSE_hb, RSE_direct = RSE),
df_eblup_logit_global %>% mutate(Skenario = "5. EBLUP Logit Global", RSE_model = RSE_logit),
df_eblup_logit_prov %>% mutate(Skenario = "6. EBLUP Logit Provinsi", RSE_model = RSE_logit),
df_eblup_rse %>% mutate(Skenario = "7. EBLUP RSE-group", RSE_model = RSE_eblup),
df_eblup_logit_rse %>% mutate(Skenario = "8. EBLUP Logit RSE-group",RSE_model = RSE_logit),
df_hb_rse %>% mutate(Skenario = "9. HB Beta RSE-group", RSE_model = RSE_hb, RSE_direct = RSE),
df_eblup_clust %>% mutate(Skenario = "10. EBLUP Cluster", RSE_model = RSE_eblup),
df_eblup_logit_clust %>% mutate(Skenario = "11. EBLUP Logit Cluster", RSE_model = RSE_logit),
df_hb_clust %>% mutate(Skenario = "12. HB Beta Cluster", RSE_model = RSE_hb, RSE_direct = RSE)
) %>%
dplyr::select(Skenario, RSE_direct, RSE_model) %>%
filter(!is.na(RSE_direct) & !is.na(RSE_model))
lim_max <- quantile(c(df_scatter$RSE_direct, df_scatter$RSE_model), 0.99, na.rm = TRUE)
ggplot(df_scatter, aes(x = RSE_direct, y = RSE_model)) +
geom_point(alpha = 0.55, size = 1.5, color = "#2980b9") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "#c0392b") +
geom_hline(yintercept = 25, linetype = "dotted", color = "#e67e22") +
geom_vline(xintercept = 25, linetype = "dotted", color = "#e67e22") +
facet_wrap(~ Skenario, ncol = 4, scales = "free") +
coord_cartesian(xlim = c(0, lim_max), ylim = c(0, lim_max)) +
labs(title = "RSE Direct vs RSE Model — 12 Skenario SAE",
subtitle = "Garis merah = RSE sama dengan direct | Titik di bawah diagonal = estimasi lebih presisi",
x = "RSE Direct (%)", y = "RSE Model (%)") +
theme_minimal(base_size = 11) +
theme(
strip.text = element_text(face = "bold", size = 9),
plot.title = element_text(face = "bold")
)RSE Direct vs RSE Model — 12 Skenario
best_pct <- tabel_eval %>%
filter(Metode != "0. Direct") %>%
slice_max(`% OK`, n = 1)
cat("Model terbaik (% domain RSE < 25%):\n")## Model terbaik (% domain RSE < 25%):
## → 5. EBLUP Logit Global : 96.3 %
best_rb <- tabel_metrik %>%
slice_min(abs(`RB (%)`), n = 1)
cat("Model dengan Relative Bias terkecil:\n")## Model dengan Relative Bias terkecil:
## → 2. EBLUP Provinsi : RB = -4.53 %
Interpretasi kunci: