Ringkasan analisis: Membandingkan 12 skenario estimasi SAE pada data Kabupaten/Kota Sulawesi menggunakan tiga pendekatan model (EBLUP, EBLUP Logit, HB Beta) dan empat strategi segmentasi domain (Pulau, Provinsi, RSE Natural Break, Clustering).

Load Library & Data

Library

# ── PENTING: MASS & car di-load DULU sebelum dplyr ───────────────────
# supaya dplyr::select dan dplyr::recode tidak tertimpa oleh car/MASS
library(MASS)       # stepAIC  (masking dplyr::select dicegah dengan load order)
library(car)        # vif      (car::recode tidak akan tertimpa dplyr)

# ── Manipulasi data ──────────────────────────────────────────────────
library(readxl)
library(dplyr)      # load SETELAH MASS & car → dplyr::select & ::recode menang
library(tidyr)

# ── Visualisasi ──────────────────────────────────────────────────────
library(ggplot2)
library(scales)
library(patchwork)

# ── SAE ──────────────────────────────────────────────────────────────
library(sae)        # mseFH  (EBLUP)

# ── Segmentasi ───────────────────────────────────────────────────────
library(classInt)   # Jenks Natural Breaks
library(factoextra) # visualisasi clustering (opsional)

# ── Tabel output ─────────────────────────────────────────────────────
library(knitr)
library(kableExtra)

# Pastikan fungsi-fungsi kritis memakai namespace yang benar
select    <- dplyr::select
recode    <- dplyr::recode

Cek JAGS & saeHB

# ============================================================
# JAGS adalah software eksternal yang HARUS diinstal terpisah
# sebelum package saeHB / rjags bisa digunakan.
#
# Langkah instalasi JAGS:
#   1. Download installer dari:
#      https://sourceforge.net/projects/mcmc-jags/files/JAGS/4.x/
#   2. Pilih file: JAGS-4.x.y.exe  (Windows)
#              atau via brew/apt  (Mac/Linux)
#   3. Instal seperti biasa, RESTART R sesudahnya
#   4. Lalu jalankan: install.packages("rjags"); install.packages("saeHB")
# ============================================================

# Deteksi otomatis apakah JAGS + saeHB tersedia
has_jags <- tryCatch({
  library(rjags)
  library(saeHB)
  TRUE
}, error = function(e) {
  message("⚠ saeHB/JAGS tidak tersedia — skenario HB Beta akan dilewati.")
  message("  Instal JAGS dari: https://sourceforge.net/projects/mcmc-jags/files/")
  FALSE
})

cat("Status JAGS/saeHB:", ifelse(has_jags, "✔ Tersedia", "✘ Tidak tersedia"), "\n")

## Status JAGS/saeHB: ✔ Tersedia

Load Data

# ── Raw data ─────────────────────────────────────────────────────────
Estimasi_531  <- read_excel("C:/Users/User/Downloads/Hasil_sulawesi_KabKota.xlsx")
Podes_kab2024 <- read_excel("C:/Users/User/Downloads/Podes_kab_lengkap.xlsx")
Data_Penduduk <- read_excel("C:/Users/User/Downloads/Data_Penduduk.xlsx")

# ── Samakan format kode ───────────────────────────────────────────────
Estimasi_531  <- Estimasi_531  %>% mutate(Kako = sprintf("%04d", Kako))
Podes_kab2024 <- Podes_kab2024 %>% mutate(kode_kab = sprintf("%04s", kode_kab))

# ── Gabung Estimasi + PODES ───────────────────────────────────────────
df_gabungan <- Estimasi_531 %>%
  left_join(Podes_kab2024, by = c("Kako" = "kode_kab")) %>%
  dplyr::select(Kako, Estimasi, RSE, starts_with("X"))

# ── Gabung dengan data penduduk ───────────────────────────────────────
df_gabungan$Kako <- as.numeric(df_gabungan$Kako)

df_final <- df_gabungan %>%
  left_join(Data_Penduduk, by = "Kako")

cat("Jumlah domain:", nrow(df_final), "\n")

## Jumlah domain: 81

cat("Jumlah variabel:", ncol(df_final), "\n")

## Jumlah variabel: 112

Pembentukan Variabel Rasio

df_final <- df_final %>%
  mutate(
    # Per 1.000 penduduk
    X6  = X6_jumlah  / Jumlah_Penduduk * 1000,
    X7  = X7_jumlah  / Jumlah_Penduduk * 1000,
    X10 = X10_jumlah / Jumlah_Penduduk * 1000,
    X33 = X33_jumlah / Jumlah_Penduduk * 1000,

    # Per 1.000 perempuan
    X11 = X11_jumlah / Jumlah_P * 1000,
    X12 = X12_jumlah / Jumlah_P * 1000,
    X27 = X27_jumlah / Jumlah_P * 1000,
    X28 = X28_jumlah / Jumlah_P * 1000,
    X29 = X29_jumlah / Jumlah_P * 1000,
    X30 = X30_jumlah / Jumlah_P * 1000,
    X31 = X31_jumlah / Jumlah_P * 1000,
    X35 = X35_jumlah / Jumlah_P * 1000,
    X36 = X36_jumlah / Jumlah_P * 1000
  )

Seleksi Variabel Auxiliary

Pipeline seleksi: Korelasi tertinggi per kelompok → Backward Stepwise (AIC) → VIF iteratif (≤10) → Anti-overfit (R² ≤ 0.8)

Seleksi Awal: Korelasi per Kelompok Variabel

df <- df_final

exclude_vars <- c("Kako","Estimasi","RSE",
                  "y_eblup","mse","RMSE_direct","RMSE_eblup",
                  "RSE_direct","RSE_eblup","y_hb","sd_hb",
                  "RSE_hb","y_hb_percent","vardir",
                  "Jumlah_L","Jumlah_P","Jumlah_Penduduk")

all_vars <- setdiff(names(df), exclude_vars)

# Ambil basis nama (X1, X2, dst)
get_base <- function(x) sub("(_jumlah|_mean)$", "", x)
base_vars <- unique(sapply(all_vars, get_base))

hasil_vars <- c()

for (b in base_vars) {
  group_vars <- all_vars[get_base(all_vars) == b]
  cor_vals   <- sapply(group_vars, function(v)
    cor(df[[v]], df$Estimasi, use = "complete.obs"))
  best_var   <- names(which.max(abs(cor_vals)))
  hasil_vars <- c(hasil_vars, best_var)
}

# Data kandidat
df_gabungan <- df %>%
  dplyr::select(Kako, Estimasi, RSE, all_of(hasil_vars))

cat("Variabel kandidat terpilih (", length(hasil_vars), "):\n")

## Variabel kandidat terpilih ( 40 ):

cat(paste(hasil_vars, collapse = ", "), "\n")

## X1_mean, X2, X8, X9, X13_jumlah, X14_jumlah, X15_mean, X16, X17, X18_mean, X19, X21_mean, X22_jumlah, X23_mean, X24_mean, X25_mean, X26, X32, X37_mean, X38_mean, X39_mean, X40_mean, X41_mean, X43, X44, X45, X34_jumlah, X6_mean, X7_jumlah, X10_mean, X11_mean, X12, X27_mean, X28, X29_mean, X30_mean, X31_mean, X33_mean, X35_mean, X36_mean

Stepwise + VIF (Level Global)

df_model <- df_gabungan %>%
  dplyr::select(-Kako, -RSE) %>%
  as.data.frame()

# Backward stepwise
full_model  <- lm(Estimasi ~ ., data = df_model)
step_model  <- stepAIC(full_model, direction = "backward", trace = FALSE)
vars_global <- names(coef(step_model))[-1]

# VIF iteratif
repeat {
  if (length(vars_global) <= 1) break
  form_tmp    <- as.formula(paste("Estimasi ~", paste(vars_global, collapse = " + ")))
  model_tmp   <- lm(form_tmp, data = df_model)
  vif_values  <- vif(model_tmp)
  if (all(vif_values <= 10)) break
  var_remove  <- names(which.max(vif_values))
  cat("Hapus (VIF):", var_remove, "=", round(max(vif_values),2), "\n")
  vars_global <- vars_global[vars_global != var_remove]
}

## Hapus (VIF): X17 = 21.14

cat("\nVariabel global final:", vars_global, "\n")

## 
## Variabel global final: X8 X9 X13_jumlah X15_mean X19 X21_mean X22_jumlah X40_mean X43 X44 X45 X33_mean

Fungsi Pembantu

Fungsi-fungsi berikut dipakai berulang di seluruh skenario untuk menjaga konsistensi kode.

Seleksi Variabel per Segmen

#' Seleksi variabel untuk satu segmen data
#'
#' @param df        data.frame segmen
#' @param y_col     nama kolom response (karakter)
#' @param excl_cols kolom yang tidak boleh masuk sebagai prediktor
#' @param max_top   jumlah variabel korelasi awal (default 5)
#' @param r2_cap    batas R² (default 0.8)
#' @return karakter vektor nama variabel terpilih
seleksi_vars <- function(df, y_col = "y_logit",
                         excl_cols = c("Estimasi","Kako","RSE","vardir",
                                       "kdprov","y_prop","y_logit"),
                         max_top = 5, r2_cap = 0.8) {

  # ── 1. Buang kolom konstan ─────────────────────────────────────────
  keep <- sapply(df, function(x) length(unique(na.omit(x))) > 1)
  df   <- df[, keep]

  kandidat <- setdiff(names(df), excl_cols)
  if (length(kandidat) == 0) return(character(0))

  # ── 2. Korelasi ────────────────────────────────────────────────────
  cor_vals <- sapply(df[kandidat], function(x)
    abs(cor(x, df[[y_col]], use = "complete.obs")))
  cor_vals  <- cor_vals[!is.na(cor_vals)]
  if (length(cor_vals) == 0) return(character(0))
  vars_awal <- names(sort(cor_vals, decreasing = TRUE))[1:min(max_top, length(cor_vals))]
  vars_awal <- vars_awal[!is.na(vars_awal)]
  if (length(vars_awal) == 0) return(character(0))

  # ── 3. Stepwise ────────────────────────────────────────────────────
  vars <- vars_awal
  step <- try(stepAIC(
    lm(as.formula(paste(y_col, "~", paste(vars_awal, collapse = "+"))), data = df),
    direction = "backward", trace = FALSE), silent = TRUE)
  if (!inherits(step, "try-error"))
    vars <- names(coef(step))[-1]
  vars <- vars[!is.na(vars)]
  if (length(vars) == 0) vars <- vars_awal[1]

  # ── 4. VIF iteratif ────────────────────────────────────────────────
  repeat {
    if (length(vars) <= 1) break
    m <- try(lm(as.formula(paste(y_col, "~", paste(vars, collapse = "+"))), data = df),
             silent = TRUE)
    if (inherits(m, "try-error")) { vars <- vars[1]; break }
    v <- try(vif(m), silent = TRUE)
    if (inherits(v, "try-error")) { vars <- vars[1]; break }
    # Hapus NA sebelum evaluasi — vif() bisa return NA pada kolom near-constant
    v_clean <- v[!is.na(v)]
    if (length(v_clean) == 0 || all(v_clean <= 10)) break
    vars <- vars[vars != names(which.max(v_clean))]
  }

  # ── 5. Anti-overfit ────────────────────────────────────────────────
  if (length(vars) > 1) {
    m <- lm(as.formula(paste(y_col, "~", paste(vars, collapse = "+"))), data = df)
    if (summary(m)$r.squared > r2_cap) vars <- vars[1]
  }
  return(vars)
}

EBLUP per Segmen

#' Jalankan EBLUP Fay-Herriot untuk satu segmen
#'
#' @param df    data.frame segmen (harus punya kolom Estimasi, RSE, vardir)
#' @param vars  karakter vektor prediktor
#' @return df dengan kolom tambahan: y_eblup, mse, RMSE_eblup, RSE_eblup
run_eblup <- function(df, vars) {

  df$vardir <- pmax(df$vardir, 1e-10)   # stabilisasi, hindari 0 atau negatif

  # scale() aman untuk 1 atau >1 variabel
  df[vars] <- as.data.frame(scale(df[vars]))
  # Ganti NaN dari scale (kolom konstan) dengan 0
  df[vars][is.nan(as.matrix(df[vars]))] <- 0

  form <- as.formula(paste("Estimasi ~", paste(vars, collapse = "+")))

  m <- try(mseFH(form, vardir = vardir, data = df), silent = TRUE)

  if (inherits(m, "try-error") || is.null(m$mse) || is.null(m$est)) {
    df$y_eblup <- df$Estimasi
    df$mse     <- df$vardir
  } else {
    df$y_eblup <- as.numeric(m$est$eblup)
    df$mse     <- as.numeric(m$mse)
  }

  # Cegah NaN dari mse negatif kecil (floating point)
  df$mse         <- pmax(df$mse, 0)
  df$RMSE_direct <- sqrt(df$vardir)
  df$RMSE_eblup  <- sqrt(df$mse)
  df$RSE_direct  <- df$RMSE_direct / pmax(df$Estimasi, 1e-6) * 100
  df$RSE_eblup   <- df$RMSE_eblup  / pmax(df$y_eblup,  1e-6) * 100
  return(df)
}

EBLUP Logit per Segmen

#' Jalankan EBLUP dengan transformasi logit untuk satu segmen
#'
#' @param df    data.frame segmen
#' @param vars  karakter vektor prediktor (diseleksi pada skala logit)
#' @return df dengan kolom tambahan: y_logit_bt, mse_logit, RMSE_logit, RSE_logit
run_eblup_logit <- function(df, vars) {

  eps       <- 1e-4
  df$y_prop <- df$Estimasi / 100
  df$y_prop <- pmax(pmin(df$y_prop, 1 - eps), eps)
  df$y_logit <- log(df$y_prop / (1 - df$y_prop))

  # Vardir pada skala logit (delta method)
  df$vardir       <- pmax(df$vardir, 1e-10)
  df$vardir_logit <- df$vardir / (df$y_prop * (1 - df$y_prop))^2
  df$vardir_logit <- pmax(df$vardir_logit, 1e-10)

  # scale() aman untuk 1 atau >1 variabel
  df[vars] <- as.data.frame(scale(df[vars]))
  df[vars][is.nan(as.matrix(df[vars]))] <- 0

  form <- as.formula(paste("y_logit ~", paste(vars, collapse = "+")))

  m <- try(mseFH(form, vardir = vardir_logit, data = df), silent = TRUE)

  if (inherits(m, "try-error") || is.null(m$mse) || is.null(m$est)) {
    df$eblup_logit <- df$y_logit
    df$mse_logit   <- df$vardir_logit
  } else {
    df$eblup_logit <- as.numeric(m$est$eblup)
    df$mse_logit   <- as.numeric(m$mse)
  }

  df$mse_logit <- pmax(df$mse_logit, 0)   # cegah NaN

  # Back-transform ke skala proporsi → persen
  df$y_logit_bt <- plogis(df$eblup_logit) * 100

  # MSE back-transform (delta method: g'(x) = p*(1-p))
  p_bt            <- plogis(df$eblup_logit)
  df$mse_logit_bt <- pmax(df$mse_logit * (p_bt * (1 - p_bt))^2, 0)

  df$RMSE_logit  <- sqrt(df$mse_logit_bt)
  df$RSE_logit   <- df$RMSE_logit / pmax(df$y_logit_bt, 1e-6) * 100

  # ── RSE_direct (wajib ada untuk plot & summary) ──────────────────────
  df$RMSE_direct <- sqrt(df$vardir)
  df$RSE_direct  <- df$RMSE_direct / pmax(df$Estimasi, 1e-6) * 100

  return(df)
}

HB Beta per Segmen

#' Jalankan HB Beta untuk satu segmen
#'
#' Membutuhkan JAGS + saeHB. Jika tidak tersedia, fallback ke direct estimator.
#' @param df       data.frame segmen
#' @param vars     karakter vektor prediktor
#' @param has_jags logical flag — deteksi otomatis dari chunk cek-jags
#' @return df dengan kolom tambahan: y_hb_pct, sd_hb, RSE_hb
run_hbbeta <- function(df, vars, has_jags = get("has_jags", envir = .GlobalEnv)) {

  # ── Fallback jika JAGS tidak terinstal ──────────────────────────────
  if (!has_jags) {
    df$y_hb_pct <- df$Estimasi
    df$sd_hb    <- NA_real_
    df$RSE_hb   <- df$RSE
    return(df)
  }

  eps       <- 1e-4
  df$y_prop <- df$Estimasi / 100
  df$y_prop <- pmax(pmin(df$y_prop, 1 - eps), eps)

  df[vars] <- as.data.frame(scale(df[vars]))
  df[vars][is.nan(as.matrix(df[vars]))] <- 0

  form_hb <- as.formula(paste("y_prop ~", paste(vars, collapse = "+")))

  hb <- try(saeHB::Beta(
  formula     = form_hb,
  data        = df,
  iter.update = 10,      # ↓ dari 75  → default sudah cukup
  iter.mcmc   = 4000,   # ↓ dari 10000
  thin        = 10,     # ↓ dari 40  → ini penghematan terbesar
  burn.in     = 1000    # ↓ dari 2000
), silent = TRUE)
  if (inherits(hb, "try-error") || is.null(hb$Est)) {
    message("  HB Beta gagal konvergensi → fallback ke direct")
    df$y_hb_pct <- df$Estimasi
    df$sd_hb    <- NA_real_
    df$RSE_hb   <- df$RSE
  } else {
    df$y_hb_pct <- hb$Est$MEAN * 100
    df$sd_hb    <- hb$Est$SD
    df$RSE_hb   <- df$sd_hb / pmax(df$y_hb_pct / 100, 1e-6) * 100
  }
  return(df)
}

Plot RSE

#' Plot perbandingan RSE antar metode
#'
#' @param data      data.frame dengan kolom Kako dan dua kolom RSE
#' @param col_rse1  nama kolom RSE pertama
#' @param col_rse2  nama kolom RSE kedua
#' @param lab1      label metode 1
#' @param lab2      label metode 2
#' @param title     judul plot
plot_rse <- function(data, col_rse1, col_rse2, lab1, lab2, title) {

  # ── Guard: pastikan kedua kolom ada ──────────────────────────────────
  missing_cols <- setdiff(c(col_rse1, col_rse2), names(data))
  if (length(missing_cols) > 0) {
    message("⚠ plot_rse(): kolom tidak ditemukan → ", paste(missing_cols, collapse = ", "))
    return(invisible(NULL))
  }

  df_plt <- data %>%
    dplyr::select(Kako, dplyr::all_of(c(col_rse1, col_rse2))) %>%
    pivot_longer(-Kako, names_to = "Metode", values_to = "RSE") %>%
    mutate(
      Metode = case_when(          # case_when: bebas konflik namespace
        Metode == col_rse1 ~ lab1,
        Metode == col_rse2 ~ lab2,
        TRUE               ~ Metode
      ),
      Kako = as.character(Kako)
    ) %>%
    filter(!is.na(RSE))

  ggplot(df_plt, aes(x = Kako, y = RSE, group = Metode, color = Metode)) +
    geom_line(linewidth = 0.8, alpha = 0.85) +
    geom_point(size = 1.6) +
    geom_hline(yintercept = 25, linetype = "dashed",
               color = "#c0392b", linewidth = 0.9) +
    annotate("text", x = 1, y = 26.5, label = "Batas RSE 25%",
             color = "#c0392b", hjust = 0, size = 3.2) +
    scale_color_manual(values = c("#2980b9","#27ae60","#e67e22",
                                  "#8e44ad","#c0392b")[1:2]) +
    labs(title = title,
         x = "Kabupaten/Kota", y = "RSE (%)", color = NULL) +
    theme_minimal(base_size = 12) +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, size = 7),
      legend.position = "top",
      plot.title      = element_text(face = "bold", size = 13)
    )
}

#' Hitung % domain dengan RSE < 25
pct_rse_ok <- function(x) round(mean(x < 25, na.rm = TRUE) * 100, 1)

#' Tabel distribusi RSE
tabel_rse <- function(data, var_rse, label_metode) {
  data.frame(
    Metode   = label_metode,
    `RSE >= 25` = sum(data[[var_rse]] >= 25, na.rm = TRUE),
    `RSE < 25`  = sum(data[[var_rse]] < 25,  na.rm = TRUE),
    `% OK`      = pct_rse_ok(data[[var_rse]]),
    check.names = FALSE
  )
}

Skenario 1 & 2 — EBLUP Existing

Skenario 1: EBLUP Global (Pulau)

# Variance sampling
df_s1 <- df_gabungan %>%
  mutate(vardir = (RSE / 100 * Estimasi)^2) %>%
  as.data.frame()

# Seleksi variabel & run EBLUP
vars_s1   <- seleksi_vars(df_s1, y_col = "Estimasi",
                          excl_cols = c("Kako","RSE","vardir","Estimasi"))
df_s1     <- run_eblup(df_s1, vars_s1)
df_eblup_global <- df_s1

cat("Variabel terpilih:", vars_s1, "\n")

## Variabel terpilih: X45 X28

summary(df_eblup_global[, c("Estimasi","y_eblup","RSE_direct","RSE_eblup")])

##     Estimasi        y_eblup          RSE_direct      RSE_eblup   
##  Min.   : 0.08   Min.   : 0.0836   Min.   : 22.9   Min.   :21.7  
##  1st Qu.: 6.18   1st Qu.: 6.8055   1st Qu.: 38.3   1st Qu.:25.9  
##  Median : 9.32   Median : 8.8513   Median : 44.4   Median :28.2  
##  Mean   :10.08   Mean   : 8.2477   Mean   : 47.6   Mean   :32.6  
##  3rd Qu.:13.93   3rd Qu.:10.1615   3rd Qu.: 53.5   3rd Qu.:32.3  
##  Max.   :24.05   Max.   :13.6089   Max.   :101.7   Max.   :97.3

plot_rse(df_eblup_global, "RSE_direct", "RSE_eblup",
         "Direct", "EBLUP Global",
         "Skenario 1: RSE Direct vs EBLUP Global (Pulau)")

Skenario 1 — EBLUP Global vs Direct

Skenario 2: EBLUP per Provinsi

df_all_s2 <- df_gabungan %>%
  mutate(kdprov = substr(Kako, 1, 2),
         vardir = (RSE / 100 * Estimasi)^2) %>%
  as.data.frame()

list_prov      <- sort(unique(df_all_s2$kdprov))
hasil_list_s2  <- list()

for (p in list_prov) {
  df_p <- filter(df_all_s2, kdprov == p)
  if (nrow(df_p) < 5) next

  vars_p <- seleksi_vars(df_p, y_col = "Estimasi",
                         excl_cols = c("Kako","RSE","vardir","Estimasi","kdprov"),
                         max_top = 3)
  if (length(vars_p) == 0) next

  hasil_list_s2[[p]] <- run_eblup(df_p, vars_p)
}

df_eblup_prov <- bind_rows(hasil_list_s2)
summary(df_eblup_prov[, c("Estimasi","y_eblup","RSE_direct","RSE_eblup")])

##     Estimasi        y_eblup          RSE_direct      RSE_eblup    
##  Min.   : 0.08   Min.   : 0.0817   Min.   : 22.9   Min.   : 18.6  
##  1st Qu.: 6.18   1st Qu.: 6.6901   1st Qu.: 38.3   1st Qu.: 20.9  
##  Median : 9.32   Median : 8.7997   Median : 44.4   Median : 27.4  
##  Mean   :10.08   Mean   : 8.6514   Mean   : 47.6   Mean   : 35.6  
##  3rd Qu.:13.93   3rd Qu.:11.8395   3rd Qu.: 53.5   3rd Qu.: 36.3  
##  Max.   :24.05   Max.   :16.5036   Max.   :101.7   Max.   :231.1  
##                  NA's   :13                        NA's   :13

plot_rse(df_eblup_prov, "RSE_direct", "RSE_eblup",
         "Direct", "EBLUP Provinsi",
         "Skenario 2: RSE Direct vs EBLUP per Provinsi")

Skenario 2 — EBLUP Provinsi vs Direct

Skenario 5 & 6 — EBLUP Logit Baru

Transformasi logit: \(y_i^* = \log\!\left(\frac{y_i/100}{1 - y_i/100}\right)\), kemudian EBLUP pada skala \(y^*\), lalu back-transform \(\hat{y}_i = \text{logistic}(\hat{y}_i^*) \times 100\). Vardir pada skala logit dihitung dengan delta method: \(\sigma^2_{\text{logit}} = \sigma^2 / (p(1-p))^2\).

Skenario 5: EBLUP Logit Global (Pulau)

df_s5 <- df_gabungan %>%
  mutate(vardir  = (RSE / 100 * Estimasi)^2,
         y_prop  = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
         y_logit = log(y_prop / (1 - y_prop))) %>%
  as.data.frame()

vars_s5         <- seleksi_vars(df_s5, y_col = "y_logit",
                                excl_cols = c("Kako","RSE","vardir",
                                              "Estimasi","y_prop","y_logit"))
df_s5           <- run_eblup_logit(df_s5, vars_s5)
df_eblup_logit_global <- df_s5

cat("Variabel terpilih:", vars_s5, "\n")

## Variabel terpilih: X8 X45 X44

summary(df_eblup_logit_global[, c("Estimasi","y_logit_bt","RSE_direct","RSE_logit")])

##     Estimasi       y_logit_bt      RSE_direct      RSE_logit   
##  Min.   : 0.08   Min.   : 2.84   Min.   : 22.9   Min.   : 8.3  
##  1st Qu.: 6.18   1st Qu.: 8.98   1st Qu.: 38.3   1st Qu.:10.2  
##  Median : 9.32   Median :11.03   Median : 44.4   Median :11.5  
##  Mean   :10.08   Mean   :10.10   Mean   : 47.6   Mean   :13.0  
##  3rd Qu.:13.93   3rd Qu.:12.12   3rd Qu.: 53.5   3rd Qu.:13.4  
##  Max.   :24.05   Max.   :13.94   Max.   :101.7   Max.   :36.2

plot_rse(df_eblup_logit_global, "RSE_direct", "RSE_logit",
         "Direct", "EBLUP Logit Global",
         "Skenario 5: RSE Direct vs EBLUP Logit Global (Pulau)")

Skenario 5 — EBLUP Logit Global vs Direct

Skenario 6: EBLUP Logit per Provinsi

df_all_s6 <- df_gabungan %>%
  mutate(kdprov  = substr(Kako, 1, 2),
         vardir  = (RSE / 100 * Estimasi)^2,
         y_prop  = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
         y_logit = log(y_prop / (1 - y_prop))) %>%
  as.data.frame()

hasil_list_s6 <- list()

for (p in sort(unique(df_all_s6$kdprov))) {
  df_p <- filter(df_all_s6, kdprov == p)
  if (nrow(df_p) < 5) next

  vars_p <- seleksi_vars(df_p, y_col = "y_logit",
                         excl_cols = c("Kako","RSE","vardir","Estimasi",
                                       "y_prop","y_logit","kdprov"),
                         max_top = 3)
  if (length(vars_p) == 0) next

  hasil_list_s6[[p]] <- run_eblup_logit(df_p, vars_p)
}

df_eblup_logit_prov <- bind_rows(hasil_list_s6)
summary(df_eblup_logit_prov[, c("Estimasi","y_logit_bt","RSE_direct","RSE_logit")])

##     Estimasi       y_logit_bt      RSE_direct      RSE_logit    
##  Min.   : 0.08   Min.   : 0.34   Min.   : 22.9   Min.   : 15.7  
##  1st Qu.: 6.18   1st Qu.: 7.29   1st Qu.: 38.3   1st Qu.: 21.2  
##  Median : 9.32   Median :10.50   Median : 44.4   Median : 26.0  
##  Mean   :10.08   Mean   :10.27   Mean   : 47.6   Mean   : 28.5  
##  3rd Qu.:13.93   3rd Qu.:13.20   3rd Qu.: 53.5   3rd Qu.: 34.8  
##  Max.   :24.05   Max.   :26.02   Max.   :101.7   Max.   :102.5

plot_rse(df_eblup_logit_prov, "RSE_direct", "RSE_logit",
         "Direct", "EBLUP Logit Provinsi",
         "Skenario 6: RSE Direct vs EBLUP Logit per Provinsi")

Skenario 6 — EBLUP Logit Provinsi vs Direct

Skenario 3 & 4 — HB Beta Existing

Prasyarat JAGS: Skenario HB Beta memerlukan software JAGS yang diinstal terpisah di sistem operasi.
Download: https://sourceforge.net/projects/mcmc-jags/files/JAGS/4.x/
Pilih JAGS-4.x.y.exe (Windows), instal, lalu restart R, kemudian jalankan:
install.packages("rjags"); install.packages("saeHB")
Jika JAGS belum tersedia, chunk di bawah akan otomatis fallback ke direct estimator.

Skenario 3: HB Beta Global (Pulau)

if (!has_jags) {
  message("⚠ JAGS tidak tersedia — Skenario 3 dilewati, menggunakan direct estimator sebagai placeholder.")
  df_hb_global <- df_gabungan %>%
    filter(substr(Kako, 1, 2) %in% c("71","72","73","74","75","76")) %>%
    mutate(vardir  = (RSE / 100 * Estimasi)^2,
           y_hb_pct = Estimasi,
           sd_hb    = NA_real_,
           RSE_hb   = RSE) %>%
    as.data.frame()
} else {
  df_s3 <- df_gabungan %>%
    filter(substr(Kako, 1, 2) %in% c("71","72","73","74","75","76")) %>%
    mutate(vardir  = (RSE / 100 * Estimasi)^2,
           y_prop  = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
           y_logit = log(y_prop / (1 - y_prop))) %>%
    as.data.frame()
  vars_s3      <- seleksi_vars(df_s3, y_col = "y_logit",
                               excl_cols = c("Kako","RSE","vardir",
                                             "Estimasi","y_prop","y_logit"))
  df_s3        <- run_hbbeta(df_s3, vars_s3)
  df_hb_global <- df_s3
  cat("Variabel terpilih:", vars_s3, "\n")
}

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.3071  0.0226 -2.3519 -2.3232 -2.3075 -2.2914 -2.26
## X8         0.3229  0.0283  0.2686  0.3027  0.3237  0.3436  0.38
## X45       -0.2677  0.0266 -0.3191 -0.2865 -0.2676 -0.2480 -0.22
## X44        0.1449  0.0318  0.0860  0.1239  0.1459  0.1662  0.21
## Variabel terpilih: X8 X45 X44

summary(df_hb_global[, c("Estimasi","y_hb_pct","RSE","RSE_hb")])

##     Estimasi        y_hb_pct          RSE            RSE_hb    
##  Min.   : 0.08   Min.   : 2.13   Min.   : 22.9   Min.   :19.7  
##  1st Qu.: 6.18   1st Qu.: 7.56   1st Qu.: 38.3   1st Qu.:22.1  
##  Median : 9.32   Median :10.07   Median : 44.4   Median :23.9  
##  Mean   :10.08   Mean   :10.11   Mean   : 47.6   Mean   :25.6  
##  3rd Qu.:13.93   3rd Qu.:12.60   3rd Qu.: 53.5   3rd Qu.:28.0  
##  Max.   :24.05   Max.   :18.00   Max.   :101.7   Max.   :43.5

# Pastikan RSE_direct tersedia untuk scatter plot komparatif
if (!"RSE_direct" %in% names(df_hb_global)) {
  df_hb_global <- df_hb_global %>%
    mutate(vardir_tmp = (RSE / 100 * Estimasi)^2,
           RSE_direct  = sqrt(vardir_tmp) / pmax(Estimasi, 1e-6) * 100) %>%
    dplyr::select(-vardir_tmp)
}

plot_rse(df_hb_global, "RSE", "RSE_hb",
         "Direct", "HB Beta Global",
         "Skenario 3: RSE Direct vs HB Beta Global (Pulau)")

Skenario 3 — HB Beta Global vs Direct

Skenario 4: HB Beta per Provinsi

if (!has_jags) {
  message("⚠ JAGS tidak tersedia — Skenario 4 dilewati, menggunakan direct estimator sebagai placeholder.")
  df_hb_prov <- df_gabungan %>%
    mutate(kdprov   = substr(Kako, 1, 2),
           vardir   = (RSE / 100 * Estimasi)^2,
           y_hb_pct = Estimasi,
           sd_hb    = NA_real_,
           RSE_hb   = RSE) %>%
    as.data.frame()
} else {
  df_all_s4 <- df_gabungan %>%
    mutate(kdprov  = substr(Kako, 1, 2),
           vardir  = (RSE / 100 * Estimasi)^2,
           y_prop  = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
           y_logit = log(y_prop / (1 - y_prop))) %>%
    as.data.frame()

  hasil_list_s4 <- list()
  for (p in sort(unique(df_all_s4$kdprov))) {
    df_p <- filter(df_all_s4, kdprov == p)
    if (nrow(df_p) < 5) next
    vars_p <- tryCatch(
      seleksi_vars(df_p, y_col = "y_logit",
                   excl_cols = c("Kako","RSE","vardir","Estimasi",
                                 "y_prop","y_logit","kdprov"), max_top = 5),
      error = function(e) {
        message("  seleksi_vars gagal untuk provinsi ", p, ": ", conditionMessage(e))
        character(0)
      }
    )
    if (length(vars_p) == 0) next
    hasil <- tryCatch(
      run_hbbeta(df_p, vars_p),
      error = function(e) {
        message("  run_hbbeta gagal untuk provinsi ", p, ": ", conditionMessage(e))
        df_p %>%
          mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
      }
    )
    hasil_list_s4[[p]] <- hasil
  }
  # Pastikan df_hb_prov selalu terbentuk meski semua provinsi gagal
  if (length(hasil_list_s4) > 0) {
    df_hb_prov <- bind_rows(hasil_list_s4)
  } else {
    message("  Semua provinsi gagal — df_hb_prov menggunakan direct estimator.")
    df_hb_prov <- df_all_s4 %>%
      mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
  }
}

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.2460  0.0516 -2.3466 -2.2802 -2.2471 -2.2095 -2.15
## X30_mean  -0.8004  0.0829 -0.9584 -0.8600 -0.8014 -0.7393 -0.65
## X8         0.4034  0.0682  0.2559  0.3591  0.4076  0.4530  0.52
## X44        0.4906  0.0808  0.3358  0.4330  0.4921  0.5416  0.66

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.3647  0.0421 -2.4435 -2.3921 -2.3632 -2.3369 -2.28
## X28        0.3973  0.0417  0.3221  0.3672  0.3977  0.4259  0.48
## X33_mean  -0.3473  0.0477 -0.4424 -0.3821 -0.3467 -0.3153 -0.26

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.5070  0.0441 -2.5816 -2.5424 -2.5070 -2.4769 -2.42
## X26       -0.3025  0.0353 -0.3710 -0.3270 -0.3043 -0.2761 -0.24
## X25_mean  -0.1892  0.0430 -0.2671 -0.2180 -0.1886 -0.1620 -0.10

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.2502  0.0378 -2.3215 -2.2763 -2.2524 -2.2272 -2.17
## X29_mean  -0.4161  0.0359 -0.4838 -0.4380 -0.4168 -0.3937 -0.34
## X21_mean  -0.2108  0.0413 -0.2837 -0.2378 -0.2117 -0.1838 -0.13

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.7863  0.0240 -2.8332 -2.8018 -2.7851 -2.7710 -2.74
## X24_mean  -1.5214  0.0519 -1.6271 -1.5590 -1.5217 -1.4850 -1.43

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.1182  0.0391 -2.1835 -2.1447 -2.1195 -2.0954 -2.03
## X24_mean  -0.5063  0.0475 -0.5938 -0.5384 -0.5068 -0.4750 -0.42

summary(df_hb_prov[, c("Estimasi","y_hb_pct","RSE","RSE_hb")])

##     Estimasi        y_hb_pct           RSE            RSE_hb      
##  Min.   : 0.08   Min.   : 0.367   Min.   : 22.9   Min.   : 0.721  
##  1st Qu.: 6.18   1st Qu.: 6.565   1st Qu.: 38.3   1st Qu.:10.281  
##  Median : 9.32   Median : 9.673   Median : 44.4   Median :18.301  
##  Mean   :10.08   Mean   :10.109   Mean   : 47.6   Mean   :17.275  
##  3rd Qu.:13.93   3rd Qu.:13.559   3rd Qu.: 53.5   3rd Qu.:23.691  
##  Max.   :24.05   Max.   :23.953   Max.   :101.7   Max.   :39.792

# Pastikan RSE_direct tersedia untuk scatter plot komparatif
if (!"RSE_direct" %in% names(df_hb_prov)) {
  df_hb_prov <- df_hb_prov %>%
    mutate(vardir_tmp  = (RSE / 100 * Estimasi)^2,
           RSE_direct  = sqrt(vardir_tmp) / pmax(Estimasi, 1e-6) * 100) %>%
    dplyr::select(-vardir_tmp)
}

plot_rse(df_hb_prov, "RSE", "RSE_hb",
         "Direct", "HB Beta Provinsi",
         "Skenario 4: RSE Direct vs HB Beta per Provinsi")

Skenario 4 — HB Beta Provinsi vs Direct

Skenario 7–9 — Segmentasi Natural Break (RSE) Baru

Jenks Natural Breaks: Domain dibagi ke dalam 2 kelompok berdasarkan RSE direct estimator. Kelompok 1 = RSE rendah (data lebih kuat), Kelompok 2 = RSE tinggi (data lebih lemah). Tiga model dijalankan pada masing-masing kelompok.

Pembentukan Grup RSE

library(classInt)

# Hitung RSE direct
df_seg_rse <- df_gabungan %>%
  mutate(vardir    = (RSE / 100 * Estimasi)^2,
         RSE_direct = sqrt(vardir) / Estimasi * 100) %>%
  as.data.frame()

# Jenks Natural Breaks (k = 2)
breaks_jenks <- classIntervals(df_seg_rse$RSE_direct,
                               n = 2, style = "jenks")
df_seg_rse$grup_rse <- cut(df_seg_rse$RSE_direct,
                           breaks  = breaks_jenks$brks,
                           labels  = c("Grup 1 (RSE Rendah)", "Grup 2 (RSE Tinggi)"),
                           include.lowest = TRUE)

# Ringkasan grup
table_grup <- df_seg_rse %>%
  group_by(grup_rse) %>%
  summarise(
    N          = n(),
    RSE_min    = round(min(RSE_direct), 2),
    RSE_max    = round(max(RSE_direct), 2),
    RSE_median = round(median(RSE_direct), 2),
    .groups    = "drop"
  )

kable(table_grup, caption = "Distribusi Domain per Grup RSE (Jenks Natural Breaks)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"))

Distribusi Domain per Grup RSE (Jenks Natural Breaks)
grup_rse	N	RSE_min	RSE_max	RSE_median
Grup 1 (RSE Rendah)	62	22.94	53.58	41.78
Grup 2 (RSE Tinggi)	19	55.28	101.66	63.27

ggplot(df_seg_rse, aes(x = RSE_direct, fill = grup_rse)) +
  geom_histogram(bins = 30, color = "white", alpha = 0.85) +
  geom_vline(xintercept = breaks_jenks$brks[2], linetype = "dashed",
             color = "#c0392b", linewidth = 1) +
  scale_fill_manual(values = c("#2980b9","#e67e22")) +
  labs(title    = "Distribusi RSE Direct — Segmentasi Jenks Natural Breaks",
       subtitle = paste0("Break point: ", round(breaks_jenks$brks[2], 2), "%"),
       x = "RSE Direct (%)", y = "Jumlah Domain", fill = NULL) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "top")

Distribusi RSE Direct per Grup Jenks

Skenario 7: EBLUP per Grup RSE

list_grup     <- levels(df_seg_rse$grup_rse)
hasil_s7      <- list()

for (g in list_grup) {
  cat("\n===", g, "===\n")
  df_g <- filter(df_seg_rse, grup_rse == g)
  if (nrow(df_g) < 4) { cat("Skip: domain terlalu sedikit\n"); next }

  vars_g <- seleksi_vars(df_g, y_col = "Estimasi",
                         excl_cols = c("Kako","RSE","vardir","Estimasi",
                                       "RSE_direct","grup_rse"),
                         max_top = 5)
  if (length(vars_g) == 0) next

  cat("Variabel:", vars_g, "\n")
  hasil_s7[[g]] <- run_eblup(df_g, vars_g)
}

## 
## === Grup 1 (RSE Rendah) ===
## Variabel: X19 X27_mean 
## 
## === Grup 2 (RSE Tinggi) ===
## Variabel: X45

df_eblup_rse <- bind_rows(hasil_s7)
summary(df_eblup_rse[, c("Estimasi","y_eblup","RSE_direct","RSE_eblup")])

##     Estimasi        y_eblup         RSE_direct      RSE_eblup    
##  Min.   : 0.08   Min.   : 0.087   Min.   : 22.9   Min.   : 12.1  
##  1st Qu.: 6.18   1st Qu.: 4.702   1st Qu.: 38.3   1st Qu.: 13.1  
##  Median : 9.32   Median : 9.354   Median : 44.4   Median : 15.7  
##  Mean   :10.08   Mean   : 8.017   Mean   : 47.6   Mean   : 26.3  
##  3rd Qu.:13.93   3rd Qu.:10.854   3rd Qu.: 53.5   3rd Qu.: 39.1  
##  Max.   :24.05   Max.   :12.622   Max.   :101.7   Max.   :108.7

plot_rse(df_eblup_rse, "RSE_direct", "RSE_eblup",
         "Direct", "EBLUP (RSE group)",
         "Skenario 7: RSE Direct vs EBLUP per Grup RSE")

Skenario 7 — EBLUP per Grup RSE

Skenario 8: EBLUP Logit per Grup RSE

df_seg_rse_logit <- df_seg_rse %>%
  mutate(y_prop  = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
         y_logit = log(y_prop / (1 - y_prop)))

hasil_s8 <- list()

for (g in list_grup) {
  cat("\n===", g, "===\n")
  df_g <- filter(df_seg_rse_logit, grup_rse == g)
  if (nrow(df_g) < 4) next

  vars_g <- seleksi_vars(df_g, y_col = "y_logit",
                         excl_cols = c("Kako","RSE","vardir","Estimasi",
                                       "RSE_direct","grup_rse",
                                       "y_prop","y_logit"),
                         max_top = 5)
  if (length(vars_g) == 0) next

  cat("Variabel:", vars_g, "\n")
  hasil_s8[[g]] <- run_eblup_logit(df_g, vars_g)
}

## 
## === Grup 1 (RSE Rendah) ===
## Variabel: X8 X29_mean 
## 
## === Grup 2 (RSE Tinggi) ===
## Variabel: X22_jumlah X28

df_eblup_logit_rse <- bind_rows(hasil_s8)
summary(df_eblup_logit_rse[, c("Estimasi","y_logit_bt","RSE_direct","RSE_logit")])

##     Estimasi       y_logit_bt      RSE_direct      RSE_logit    
##  Min.   : 0.08   Min.   : 1.22   Min.   : 22.9   Min.   : 9.24  
##  1st Qu.: 6.18   1st Qu.: 6.65   1st Qu.: 38.3   1st Qu.:10.81  
##  Median : 9.32   Median :11.36   Median : 44.4   Median :12.36  
##  Mean   :10.08   Mean   : 9.83   Mean   : 47.6   Mean   :18.88  
##  3rd Qu.:13.93   3rd Qu.:12.95   3rd Qu.: 53.5   3rd Qu.:25.78  
##  Max.   :24.05   Max.   :14.25   Max.   :101.7   Max.   :45.23

plot_rse(df_eblup_logit_rse, "RSE_direct", "RSE_logit",
         "Direct", "EBLUP Logit (RSE group)",
         "Skenario 8: RSE Direct vs EBLUP Logit per Grup RSE")

Skenario 8 — EBLUP Logit per Grup RSE

Skenario 9: HB Beta per Grup RSE

if (!has_jags) {
  message("⚠ JAGS tidak tersedia — Skenario 9 dilewati.")
  df_hb_rse <- df_seg_rse_logit %>%
    mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
} else {
  hasil_s9 <- list()
  for (g in list_grup) {
    cat("\n===", g, "===\n")
    df_g <- filter(df_seg_rse_logit, grup_rse == g)
    if (nrow(df_g) < 5) next
    vars_g <- seleksi_vars(df_g, y_col = "y_logit",
                           excl_cols = c("Kako","RSE","vardir","Estimasi",
                                         "RSE_direct","grup_rse","y_prop","y_logit"),
                           max_top = 5)
    if (length(vars_g) == 0) next
    cat("Variabel:", vars_g, "\n")
    hasil_s9[[g]] <- run_hbbeta(df_g, vars_g)
  }
  df_hb_rse <- bind_rows(hasil_s9)
}

## 
## === Grup 1 (RSE Rendah) ===
## Variabel: X8 X29_mean

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.1044  0.0181 -2.1394 -2.1165 -2.1043 -2.0913 -2.07
## X8         0.1669  0.0197  0.1270  0.1534  0.1665  0.1800  0.21
## X29_mean  -0.1376  0.0204 -0.1794 -0.1495 -0.1368 -0.1245 -0.10
## 
## === Grup 2 (RSE Tinggi) ===
## Variabel: X22_jumlah X28

##               Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept  -3.2395  0.0546 -3.3488 -3.2770 -3.2385 -3.2051 -3.13
## X22_jumlah  0.3645  0.0553  0.2565  0.3253  0.3709  0.3987  0.47
## X28         0.3481  0.0555  0.2420  0.3108  0.3453  0.3874  0.45

summary(df_hb_rse[, c("Estimasi","y_hb_pct","RSE","RSE_hb")])

##     Estimasi        y_hb_pct          RSE            RSE_hb     
##  Min.   : 0.08   Min.   : 1.14   Min.   : 22.9   Min.   : 5.49  
##  1st Qu.: 6.18   1st Qu.: 6.57   1st Qu.: 38.3   1st Qu.: 7.14  
##  Median : 9.32   Median : 9.41   Median : 44.4   Median : 8.90  
##  Mean   :10.08   Mean   :10.09   Mean   : 47.6   Mean   :14.23  
##  3rd Qu.:13.93   3rd Qu.:13.82   3rd Qu.: 53.5   3rd Qu.:14.10  
##  Max.   :24.05   Max.   :23.63   Max.   :101.7   Max.   :42.66

plot_rse(df_hb_rse, "RSE", "RSE_hb",
         "Direct", "HB Beta (RSE group)",
         "Skenario 9: RSE Direct vs HB Beta per Grup RSE")

Skenario 9 — HB Beta per Grup RSE

Skenario 10–12 — Segmentasi Clustering Baru

K-Means Clustering (k = 2): Domain dikelompokkan berdasarkan karakteristik wilayah menggunakan variabel auxiliary yang terpilih secara global. Data distandardisasi sebelum clustering. Jumlah cluster optimal dikonfirmasi dengan elbow method.

Pembentukan Cluster

# Gunakan variabel yang terpilih di seleksi global
df_seg_clust <- df_gabungan %>%
  mutate(vardir = (RSE / 100 * Estimasi)^2) %>%
  as.data.frame()

# Standardisasi variabel auxiliary
vars_clust <- vars_global  # variabel hasil seleksi global
mat_clust  <- scale(df_seg_clust[, vars_clust])
mat_clust[is.nan(mat_clust)] <- 0

# ── Elbow method (opsional, matikan jika lambat) ─────────────────────
set.seed(42)
wss <- sapply(1:6, function(k)
  kmeans(mat_clust, centers = k, nstart = 25, iter.max = 100)$tot.withinss)

ggplot(data.frame(k = 1:6, wss = wss), aes(x = k, y = wss)) +
  geom_line(color = "#2980b9", linewidth = 1) +
  geom_point(color = "#2980b9", size = 3) +
  scale_x_continuous(breaks = 1:6) +
  labs(title = "Elbow Method — Penentuan Jumlah Cluster",
       x = "Jumlah Cluster (k)", y = "Total Within-Cluster SS") +
  theme_minimal(base_size = 12)

# ── K-Means k = 2 ───────────────────────────────────────────────────
set.seed(42)
km_result              <- kmeans(mat_clust, centers = 2,
                                 nstart = 50, iter.max = 200)
df_seg_clust$cluster   <- paste0("Cluster ", km_result$cluster)

# Ringkasan cluster
table_clust <- df_seg_clust %>%
  mutate(RSE_direct = sqrt(vardir) / Estimasi * 100) %>%
  group_by(cluster) %>%
  summarise(
    N              = n(),
    Est_mean       = round(mean(Estimasi), 2),
    Est_sd         = round(sd(Estimasi), 2),
    RSE_direct_med = round(median(RSE_direct), 2),
    .groups        = "drop"
  )

kable(table_clust,
      caption = "Karakteristik Domain per Cluster (K-Means, k=2)") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"))

Karakteristik Domain per Cluster (K-Means, k=2)
cluster	N	Est_mean	Est_sd	RSE_direct_med
Cluster 1	71	10.95	5.32	43.59
Cluster 2	10	3.85	3.14	57.45

df_seg_clust %>%
  ggplot(aes(x = cluster, y = Estimasi, fill = cluster)) +
  geom_boxplot(alpha = 0.8, outlier.color = "#c0392b") +
  scale_fill_manual(values = c("#2980b9","#27ae60")) +
  labs(title = "Distribusi Direct Estimasi per Cluster",
       x = NULL, y = "Estimasi (%)", fill = NULL) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

Distribusi Estimasi per Cluster

Skenario 10: EBLUP per Cluster

list_clust  <- unique(df_seg_clust$cluster)
hasil_s10   <- list()

for (cl in list_clust) {
  cat("\n===", cl, "===\n")
  df_cl <- filter(df_seg_clust, cluster == cl)
  if (nrow(df_cl) < 4) next

  vars_cl <- seleksi_vars(df_cl, y_col = "Estimasi",
                          excl_cols = c("Kako","RSE","vardir",
                                        "Estimasi","cluster"),
                          max_top = 5)
  if (length(vars_cl) == 0) next

  cat("Variabel:", vars_cl, "\n")
  hasil_s10[[cl]] <- run_eblup(df_cl, vars_cl)
}

## 
## === Cluster 1 ===
## Variabel: X28 X45 
## 
## === Cluster 2 ===
## Variabel: X34_jumlah X25_mean

df_eblup_clust <- bind_rows(hasil_s10)
summary(df_eblup_clust[, c("Estimasi","y_eblup","RSE_direct","RSE_eblup")])

##     Estimasi        y_eblup        RSE_direct      RSE_eblup   
##  Min.   : 0.08   Min.   : 1.33   Min.   : 22.9   Min.   :22.9  
##  1st Qu.: 6.18   1st Qu.: 7.73   1st Qu.: 38.3   1st Qu.:26.7  
##  Median : 9.32   Median : 9.46   Median : 44.4   Median :28.7  
##  Mean   :10.08   Mean   : 9.12   Mean   : 47.6   Mean   :30.1  
##  3rd Qu.:13.93   3rd Qu.:10.79   3rd Qu.: 53.5   3rd Qu.:31.0  
##  Max.   :24.05   Max.   :13.94   Max.   :101.7   Max.   :54.6  
##                  NA's   :10                      NA's   :10

plot_rse(df_eblup_clust, "RSE_direct", "RSE_eblup",
         "Direct", "EBLUP (Cluster)",
         "Skenario 10: RSE Direct vs EBLUP per Cluster")

Skenario 10 — EBLUP per Cluster

Skenario 11: EBLUP Logit per Cluster

df_seg_clust_logit <- df_seg_clust %>%
  mutate(y_prop  = pmax(pmin(Estimasi / 100, 1 - 1e-4), 1e-4),
         y_logit = log(y_prop / (1 - y_prop)))

hasil_s11 <- list()

for (cl in list_clust) {
  cat("\n===", cl, "===\n")
  df_cl <- filter(df_seg_clust_logit, cluster == cl)
  if (nrow(df_cl) < 4) next

  vars_cl <- seleksi_vars(df_cl, y_col = "y_logit",
                          excl_cols = c("Kako","RSE","vardir","Estimasi",
                                        "cluster","y_prop","y_logit"),
                          max_top = 5)
  if (length(vars_cl) == 0) next

  cat("Variabel:", vars_cl, "\n")
  hasil_s11[[cl]] <- run_eblup_logit(df_cl, vars_cl)
}

## 
## === Cluster 1 ===
## Variabel: X45 X33_mean 
## 
## === Cluster 2 ===
## Variabel: X34_jumlah X11_mean

df_eblup_logit_clust <- bind_rows(hasil_s11)
summary(df_eblup_logit_clust[, c("Estimasi","y_logit_bt","RSE_direct","RSE_logit")])

##     Estimasi       y_logit_bt      RSE_direct      RSE_logit    
##  Min.   : 0.08   Min.   : 2.10   Min.   : 22.9   Min.   : 8.74  
##  1st Qu.: 6.18   1st Qu.: 9.09   1st Qu.: 38.3   1st Qu.:10.41  
##  Median : 9.32   Median :11.09   Median : 44.4   Median :11.76  
##  Mean   :10.08   Mean   :10.23   Mean   : 47.6   Mean   :16.41  
##  3rd Qu.:13.93   3rd Qu.:12.15   3rd Qu.: 53.5   3rd Qu.:13.95  
##  Max.   :24.05   Max.   :15.35   Max.   :101.7   Max.   :54.65

plot_rse(df_eblup_logit_clust, "RSE_direct", "RSE_logit",
         "Direct", "EBLUP Logit (Cluster)",
         "Skenario 11: RSE Direct vs EBLUP Logit per Cluster")

Skenario 11 — EBLUP Logit per Cluster

Skenario 12: HB Beta per Cluster

if (!has_jags) {
  message("⚠ JAGS tidak tersedia — Skenario 12 dilewati.")
  df_hb_clust <- df_seg_clust_logit %>%
    mutate(y_hb_pct = Estimasi, sd_hb = NA_real_, RSE_hb = RSE)
} else {
  hasil_s12 <- list()
  for (cl in list_clust) {
    cat("\n===", cl, "===\n")
    df_cl <- filter(df_seg_clust_logit, cluster == cl)
    if (nrow(df_cl) < 5) next
    vars_cl <- seleksi_vars(df_cl, y_col = "y_logit",
                            excl_cols = c("Kako","RSE","vardir","Estimasi",
                                          "cluster","y_prop","y_logit"),
                            max_top = 5)
    if (length(vars_cl) == 0) next
    cat("Variabel:", vars_cl, "\n")
    hasil_s12[[cl]] <- run_hbbeta(df_cl, vars_cl)
  }
  df_hb_clust <- bind_rows(hasil_s12)
}

## 
## === Cluster 1 ===
## Variabel: X45 X33_mean

##              Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept -2.1794  0.0194 -2.2213 -2.1921 -2.1788 -2.1649 -2.15
## X45       -0.1869  0.0221 -0.2302 -0.2008 -0.1865 -0.1740 -0.14
## X33_mean  -0.1660  0.0209 -0.2059 -0.1792 -0.1665 -0.1522 -0.13
## 
## === Cluster 2 ===
## Variabel: X34_jumlah X11_mean

##               Mean      SD    2.5%     25%     50%     75% 97.5%
## intercept  -3.4338  0.0881 -3.5978 -3.4954 -3.4326 -3.3770 -3.26
## X34_jumlah  0.4964  0.0951  0.3161  0.4353  0.4979  0.5562  0.69
## X11_mean    0.1400  0.0950 -0.0352  0.0731  0.1470  0.2076  0.30

summary(df_hb_clust[, c("Estimasi","y_hb_pct","RSE","RSE_hb")])

##     Estimasi        y_hb_pct          RSE            RSE_hb    
##  Min.   : 0.08   Min.   : 1.13   Min.   : 22.9   Min.   :16.5  
##  1st Qu.: 6.18   1st Qu.: 7.22   1st Qu.: 38.3   1st Qu.:20.4  
##  Median : 9.32   Median : 9.67   Median : 44.4   Median :22.4  
##  Mean   :10.08   Mean   :10.09   Mean   : 47.6   Mean   :24.7  
##  3rd Qu.:13.93   3rd Qu.:12.63   3rd Qu.: 53.5   3rd Qu.:25.9  
##  Max.   :24.05   Max.   :20.71   Max.   :101.7   Max.   :49.5

plot_rse(df_hb_clust, "RSE", "RSE_hb",
         "Direct", "HB Beta (Cluster)",
         "Skenario 12: RSE Direct vs HB Beta per Cluster")

Skenario 12 — HB Beta per Cluster

Evaluasi Komparatif — 12 Skenario

Tabel Ringkasan Distribusi RSE

# ── Kumpulkan distribusi RSE semua skenario ──────────────────────────
eval_rows <- list(
  tabel_rse(df_gabungan %>% mutate(RSE_direct = RSE), "RSE_direct", "0. Direct"),
  tabel_rse(df_eblup_global,        "RSE_eblup",  "1. EBLUP Global"),
  tabel_rse(df_eblup_prov,          "RSE_eblup",  "2. EBLUP Provinsi"),
  tabel_rse(df_hb_global,           "RSE_hb",     "3. HB Beta Global"),
  tabel_rse(df_hb_prov,             "RSE_hb",     "4. HB Beta Provinsi"),
  tabel_rse(df_eblup_logit_global,  "RSE_logit",  "5. EBLUP Logit Global"),
  tabel_rse(df_eblup_logit_prov,    "RSE_logit",  "6. EBLUP Logit Provinsi"),
  tabel_rse(df_eblup_rse,           "RSE_eblup",  "7. EBLUP RSE-group"),
  tabel_rse(df_eblup_logit_rse,     "RSE_logit",  "8. EBLUP Logit RSE-group"),
  tabel_rse(df_hb_rse,              "RSE_hb",     "9. HB Beta RSE-group"),
  tabel_rse(df_eblup_clust,         "RSE_eblup",  "10. EBLUP Cluster"),
  tabel_rse(df_eblup_logit_clust,   "RSE_logit",  "11. EBLUP Logit Cluster"),
  tabel_rse(df_hb_clust,            "RSE_hb",     "12. HB Beta Cluster")
)

tabel_eval <- bind_rows(eval_rows)

kable(tabel_eval,
      caption = "Perbandingan Distribusi RSE — 12 Skenario Model SAE",
      col.names = c("Skenario","RSE ≥ 25","RSE < 25","% Domain OK")) %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed","bordered"),
                full_width = FALSE) %>%
  row_spec(which(tabel_eval$`% OK` == max(tabel_eval$`% OK`)),
           bold = TRUE, background = "#d4edda") %>%
  row_spec(1, background = "#fff3cd")  # highlight direct

Perbandingan Distribusi RSE — 12 Skenario Model SAE
Skenario	RSE ≥ 25	RSE < 25	% Domain OK
Direct	80	1	1.2
EBLUP Global	66	15	18.5
EBLUP Provinsi	38	30	44.1
HB Beta Global	35	46	56.8
HB Beta Provinsi	15	66	81.5
EBLUP Logit Global	3	78	96.3
EBLUP Logit Provinsi	49	32	39.5
EBLUP RSE-group	24	57	70.4
EBLUP Logit RSE-group	21	60	74.1
HB Beta RSE-group	18	63	77.8
EBLUP Cluster	62	9	12.7
EBLUP Logit Cluster	11	70	86.4
HB Beta Cluster	23	58	71.6

Metrik Akurasi (Relative Bias & RMSE)

# Helper: hitung metrik akurasi terhadap direct
metrik <- function(est_model, est_direct, label) {
  rb   <- mean((est_model - est_direct) / est_direct, na.rm = TRUE) * 100
  rmse <- sqrt(mean((est_model - est_direct)^2, na.rm = TRUE))
  data.frame(
    Skenario        = label,
    `RB (%)`        = round(rb, 3),
    `RMSE`          = round(rmse, 4),
    check.names     = FALSE
  )
}

metrik_rows <- list(
  metrik(df_eblup_global$y_eblup,        df_gabungan$Estimasi, "1. EBLUP Global"),
  metrik(df_eblup_prov$y_eblup,          df_eblup_prov$Estimasi,        "2. EBLUP Provinsi"),
  metrik(df_hb_global$y_hb_pct,          df_hb_global$Estimasi,         "3. HB Beta Global"),
  metrik(df_hb_prov$y_hb_pct,            df_hb_prov$Estimasi,           "4. HB Beta Provinsi"),
  metrik(df_eblup_logit_global$y_logit_bt, df_eblup_logit_global$Estimasi, "5. EBLUP Logit Global"),
  metrik(df_eblup_logit_prov$y_logit_bt,   df_eblup_logit_prov$Estimasi,   "6. EBLUP Logit Provinsi"),
  metrik(df_eblup_rse$y_eblup,           df_eblup_rse$Estimasi,         "7. EBLUP RSE-group"),
  metrik(df_eblup_logit_rse$y_logit_bt,  df_eblup_logit_rse$Estimasi,   "8. EBLUP Logit RSE-group"),
  metrik(df_hb_rse$y_hb_pct,             df_hb_rse$Estimasi,            "9. HB Beta RSE-group"),
  metrik(df_eblup_clust$y_eblup,         df_eblup_clust$Estimasi,       "10. EBLUP Cluster"),
  metrik(df_eblup_logit_clust$y_logit_bt, df_eblup_logit_clust$Estimasi, "11. EBLUP Logit Cluster"),
  metrik(df_hb_clust$y_hb_pct,           df_hb_clust$Estimasi,          "12. HB Beta Cluster")
)

tabel_metrik <- bind_rows(metrik_rows)

kable(tabel_metrik,
      caption = "Metrik Akurasi — Relative Bias & RMSE Terhadap Direct Estimator") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed","bordered"),
                full_width = FALSE) %>%
  row_spec(which(abs(tabel_metrik$`RB (%)`) == min(abs(tabel_metrik$`RB (%)`))),
           bold = TRUE, background = "#d4edda")

Metrik Akurasi — Relative Bias & RMSE Terhadap Direct Estimator
Skenario	RB (%)	RMSE
EBLUP Global	-6.336	3.8559
EBLUP Provinsi	-4.530	3.4774
HB Beta Global	66.831	2.3347
HB Beta Provinsi	28.979	1.0322
EBLUP Logit Global	98.898	4.7280
EBLUP Logit Provinsi	55.022	3.9476
EBLUP RSE-group	-10.971	4.3171
EBLUP Logit RSE-group	36.878	4.1330
HB Beta RSE-group	22.162	0.5763
EBLUP Cluster	-5.383	3.8960
EBLUP Logit Cluster	71.278	4.7474
HB Beta Cluster	31.295	1.5462

Visualisasi Perbandingan % Domain RSE < 25%

tabel_eval_plot <- tabel_eval[-1, ] %>%   # buang baris Direct
  mutate(
    Skenario  = factor(Metode, levels = rev(Metode)),
    Segmentasi = case_when(
      grepl("Global|Provinsi", Metode) & !grepl("RSE|Cluster", Metode) ~ "Pulau / Provinsi",
      grepl("RSE",     Metode) ~ "RSE Natural Break",
      grepl("Cluster", Metode) ~ "Clustering",
      TRUE ~ "Lainnya"
    )
  )

ggplot(tabel_eval_plot, aes(x = Skenario, y = `% OK`, fill = Segmentasi)) +
  geom_col(alpha = 0.88, width = 0.7) +
  geom_text(aes(label = paste0(`% OK`, "%")),
            hjust = -0.15, size = 3.5, fontface = "bold") +
  geom_vline(xintercept = tabel_eval[1, "% OK"],
             linetype = "dashed", color = "#c0392b", linewidth = 0.9) +
  scale_fill_manual(values = c(
    "Pulau / Provinsi"  = "#2980b9",
    "RSE Natural Break" = "#e67e22",
    "Clustering"        = "#27ae60"
  )) +
  scale_y_continuous(limits = c(0, 115), expand = c(0, 0)) +
  coord_flip() +
  labs(title    = "Persentase Domain dengan RSE < 25% — 12 Skenario SAE",
       subtitle = "Garis merah = baseline Direct Estimator",
       x = NULL, y = "% Domain RSE < 25%", fill = "Segmentasi") +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    plot.title      = element_text(face = "bold"),
    panel.grid.major.y = element_blank()
  )

Persentase Domain dengan RSE < 25% per Skenario

Scatter Plot: RSE Direct vs RSE Model (Semua Skenario)

# Gabung semua skenario dalam format panjang
df_scatter <- bind_rows(
  df_eblup_global       %>% mutate(Skenario = "1. EBLUP Global",         RSE_model = RSE_eblup),
  df_eblup_prov         %>% mutate(Skenario = "2. EBLUP Provinsi",       RSE_model = RSE_eblup),
  df_hb_global          %>% mutate(Skenario = "3. HB Beta Global",       RSE_model = RSE_hb,    RSE_direct = RSE),
  df_hb_prov            %>% mutate(Skenario = "4. HB Beta Provinsi",     RSE_model = RSE_hb,    RSE_direct = RSE),
  df_eblup_logit_global %>% mutate(Skenario = "5. EBLUP Logit Global",   RSE_model = RSE_logit),
  df_eblup_logit_prov   %>% mutate(Skenario = "6. EBLUP Logit Provinsi", RSE_model = RSE_logit),
  df_eblup_rse          %>% mutate(Skenario = "7. EBLUP RSE-group",      RSE_model = RSE_eblup),
  df_eblup_logit_rse    %>% mutate(Skenario = "8. EBLUP Logit RSE-group",RSE_model = RSE_logit),
  df_hb_rse             %>% mutate(Skenario = "9. HB Beta RSE-group",    RSE_model = RSE_hb,    RSE_direct = RSE),
  df_eblup_clust        %>% mutate(Skenario = "10. EBLUP Cluster",       RSE_model = RSE_eblup),
  df_eblup_logit_clust  %>% mutate(Skenario = "11. EBLUP Logit Cluster", RSE_model = RSE_logit),
  df_hb_clust           %>% mutate(Skenario = "12. HB Beta Cluster",     RSE_model = RSE_hb,    RSE_direct = RSE)
) %>%
  dplyr::select(Skenario, RSE_direct, RSE_model) %>%
  filter(!is.na(RSE_direct) & !is.na(RSE_model))

lim_max <- quantile(c(df_scatter$RSE_direct, df_scatter$RSE_model), 0.99, na.rm = TRUE)

ggplot(df_scatter, aes(x = RSE_direct, y = RSE_model)) +
  geom_point(alpha = 0.55, size = 1.5, color = "#2980b9") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "#c0392b") +
  geom_hline(yintercept = 25, linetype = "dotted", color = "#e67e22") +
  geom_vline(xintercept = 25, linetype = "dotted", color = "#e67e22") +
  facet_wrap(~ Skenario, ncol = 4, scales = "free") +
  coord_cartesian(xlim = c(0, lim_max), ylim = c(0, lim_max)) +
  labs(title    = "RSE Direct vs RSE Model — 12 Skenario SAE",
       subtitle = "Garis merah = RSE sama dengan direct | Titik di bawah diagonal = estimasi lebih presisi",
       x = "RSE Direct (%)", y = "RSE Model (%)") +
  theme_minimal(base_size = 11) +
  theme(
    strip.text = element_text(face = "bold", size = 9),
    plot.title = element_text(face = "bold")
  )

RSE Direct vs RSE Model — 12 Skenario

Ringkasan & Kesimpulan

best_pct <- tabel_eval %>%
  filter(Metode != "0. Direct") %>%
  slice_max(`% OK`, n = 1)

cat("Model terbaik (% domain RSE < 25%):\n")

## Model terbaik (% domain RSE < 25%):

cat(" →", best_pct$Metode, ":", best_pct$`% OK`, "%\n\n")

##  → 5. EBLUP Logit Global : 96.3 %

best_rb <- tabel_metrik %>%
  slice_min(abs(`RB (%)`), n = 1)

cat("Model dengan Relative Bias terkecil:\n")

## Model dengan Relative Bias terkecil:

cat(" →", best_rb$Skenario, ": RB =", best_rb$`RB (%)`, "%\n")

##  → 2. EBLUP Provinsi : RB = -4.53 %

Interpretasi kunci:

EBLUP vs Direct — EBLUP secara konsisten mereduksi RSE dibanding direct estimator di semua level segmentasi.
Logit link — EBLUP Logit cocok untuk proporsi yang mendekati 0 atau 1, mencegah estimasi negatif/di atas 100.
HB Beta — Menawarkan estimasi yang lebih stabil di domain kecil karena memanfaatkan full posterior distribution.
Segmentasi RSE — Memisahkan domain berdasarkan kualitas data memungkinkan model lebih tepat sasaran per kelompok.
Segmentasi Cluster — Menangkap heterogenitas karakteristik wilayah yang tidak tertangkap oleh batas administratif (provinsi).

Small Area Estimation — Analisis Komparatif 12 Skenario

EBLUP · EBLUP Logit · HB Beta | Segmentasi: Pulau · Provinsi · RSE · Cluster

Fajar Fithra Ramadhan

29 April 2026