library(MASS)

—- 1. Parameter Sebenarnya —-

set.seed(2024)
beta_true <- c(b0 = -1.5, b1 = 2.0, b2 = -0.8)

—- 2. Fungsi Pembangkit Data —-

bangkit_data <- function(n, rho = 0, pct_outlier = 0, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  Sigma <- matrix(c(1, rho, rho, 1), nrow = 2)
  X <- mvrnorm(n, mu = c(0, 0), Sigma = Sigma)
  if (pct_outlier > 0) {
    idx <- sample(n, floor(n * pct_outlier))
    X[idx, ] <- X[idx, ] * 5   # outlier: kalikan 5x
  }
  logit_p <- beta_true["b0"] + beta_true["b1"] * X[, 1] + beta_true["b2"] * X[, 2]
  y <- rbinom(n, 1, plogis(logit_p))
  data.frame(y = y, x1 = X[, 1], x2 = X[, 2])
}

—- 3. Fungsi Replikasi —-

replikasi <- function(n, rho = 0, pct_outlier = 0, R = 100) {
  hasil <- matrix(NA, nrow = R, ncol = 3,
                  dimnames = list(NULL, c("b0", "b1", "b2")))
  for (r in 1:R) {
    df  <- bangkit_data(n, rho, pct_outlier, seed = 1000 + r)
    fit <- tryCatch(
      glm(y ~ x1 + x2, data = df, family = binomial(link = "logit")),
      error = function(e) NULL
    )
    if (!is.null(fit)) hasil[r, ] <- coef(fit)
  }
  hasil[complete.cases(hasil), ]
}

—- 4. Jalankan 4 Skenario —-

cat("Menjalankan skenario simulasi...\n")
## Menjalankan skenario simulasi...
# Skenario 1: Sampel Kecil
sc1 <- replikasi(n = 30)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat("Skenario 1 selesai: Sampel Kecil (n=30)\n")
## Skenario 1 selesai: Sampel Kecil (n=30)
# Skenario 2: Sampel Besar
sc2 <- replikasi(n = 500)
cat("Skenario 2 selesai: Sampel Besar (n=500)\n")
## Skenario 2 selesai: Sampel Besar (n=500)
# Skenario 3: Outlier 15%
sc3 <- replikasi(n = 200, pct_outlier = 0.15)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cat("Skenario 3 selesai: Outlier 15% (n=200)\n")
## Skenario 3 selesai: Outlier 15% (n=200)
# Skenario 4: Multikolinearitas (rho=0.80)
sc4 <- replikasi(n = 200, rho = 0.80)
cat("Skenario 4 selesai: Multikolinearitas rho=0.80 (n=200)\n")
## Skenario 4 selesai: Multikolinearitas rho=0.80 (n=200)

—- 5. Fungsi Ringkasan: Bias & MSE —-

ringkasan_skenario <- function(mat, label, n_sampel) {
  rata  <- colMeans(mat)
  bias  <- rata - beta_true
  varian <- apply(mat, 2, var)
  mse   <- bias^2 + varian   # MSE = Bias^2 + Varians
  data.frame(
    Skenario    = label,
    n           = n_sampel,
    b0_hat      = round(rata["b0"], 4),
    bias_b0     = round(bias["b0"], 4),
    b1_hat      = round(rata["b1"], 4),
    bias_b1     = round(bias["b1"], 4),
    b2_hat      = round(rata["b2"], 4),
    bias_b2     = round(bias["b2"], 4),
    MSE_rerata  = round(mean(mse), 5),
    row.names   = NULL
  )
}

tabel_ringkasan <- rbind(
  ringkasan_skenario(sc1, "Sampel Kecil (n=30)",          30),
  ringkasan_skenario(sc2, "Sampel Besar (n=500)",         500),
  ringkasan_skenario(sc3, "Outlier 15% (n=200)",          200),
  ringkasan_skenario(sc4, "Multikolinearitas rho=0.80",   200)
)

cat("\n========== TABEL RINGKASAN HASIL SIMULASI ==========\n")
## 
## ========== TABEL RINGKASAN HASIL SIMULASI ==========
print(tabel_ringkasan, row.names = FALSE)
##                    Skenario   n  b0_hat bias_b0 b1_hat bias_b1  b2_hat bias_b2
##         Sampel Kecil (n=30)  30 -2.7298 -1.2298 4.0482  2.0482 -1.5359 -0.7359
##        Sampel Besar (n=500) 500 -1.5055 -0.0055 2.0081  0.0081 -0.8094 -0.0094
##         Outlier 15% (n=200) 200 -1.5611 -0.0611 2.0893  0.0893 -0.8370 -0.0370
##  Multikolinearitas rho=0.80 200 -1.5637 -0.0637 2.0780  0.0780 -0.8204 -0.0204
##  MSE_rerata
##   132.41841
##     0.02590
##     0.08081
##     0.13320
cat("\nParameter sebenarnya: beta0=-1.5 | beta1=2.0 | beta2=-0.8\n")
## 
## Parameter sebenarnya: beta0=-1.5 | beta1=2.0 | beta2=-0.8
cat("MSE terkecil (estimasi terbaik):", tabel_ringkasan$Skenario[which.min(tabel_ringkasan$MSE_rerata)], "\n")
## MSE terkecil (estimasi terbaik): Sampel Besar (n=500)
cat("MSE terbesar (bias terbesar)   :", tabel_ringkasan$Skenario[which.max(tabel_ringkasan$MSE_rerata)], "\n")
## MSE terbesar (bias terbesar)   : Sampel Kecil (n=30)

—- 6. Visualisasi —-

# 6a. Distribusi estimasi beta1
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
daftar <- list(sc1[, "b1"], sc2[, "b1"], sc3[, "b1"], sc4[, "b1"])
judul  <- c("Sampel Kecil (n=30)", "Sampel Besar (n=500)",
            "Outlier 15% (n=200)", "Multikolinearitas rho=0.80 (n=200)")
warna  <- c("#EF9F27", "#378ADD", "#E24B4A", "#639922")

for (i in 1:4) {
  hist(daftar[[i]], main = judul[i], xlab = expression(hat(beta)[1]),
       col = paste0(warna[i], "66"), border = warna[i], breaks = 20,
       xlim = c(-1, 6))
  abline(v = beta_true["b1"], lty = 2, lwd = 2, col = "black")
  abline(v = mean(daftar[[i]]), lty = 1, lwd = 2, col = warna[i])
  legend("topright", legend = c("Sebenarnya", "Rata-rata"), bty = "n",
         lty = c(2, 1), lwd = 2, col = c("black", warna[i]), cex = 0.75)
}

# 6b. Barplot bias
par(mfrow = c(1, 1), mar = c(6, 4, 3, 1))
bias_mat <- rbind(
  tabel_ringkasan$bias_b0,
  tabel_ringkasan$bias_b1,
  tabel_ringkasan$bias_b2
)
colnames(bias_mat) <- c("Kecil", "Besar", "Outlier", "Multikol")
rownames(bias_mat) <- c("Bias beta0", "Bias beta1", "Bias beta2")
barplot(bias_mat, beside = TRUE, col = c("#534AB7", "#1D9E75", "#D85A30"),
        legend.text = TRUE, args.legend = list(x = "topright", bty = "n", cex = 0.85),
        main = "Perbandingan Bias Estimasi per Skenario",
        ylab = "Bias", las = 2, ylim = c(min(bias_mat) - 0.3, max(bias_mat) + 0.3))
abline(h = 0, lty = 2, col = "gray40")

# 6c. Barplot MSE
par(mar = c(6, 4, 3, 1))
barplot(tabel_ringkasan$MSE_rerata,
        names.arg = c("Kecil\n(n=30)", "Besar\n(n=500)", "Outlier\n15%", "Multikol\nrho=0.8"),
        col = c("#FAEEDA", "#E6F1FB", "#FCEBEB", "#EAF3DE"),
        border = c("#BA7517", "#185FA5", "#A32D2D", "#3B6D11"),
        main = "MSE Rata-rata per Skenario", ylab = "MSE", las = 1)

Berdasarkan hasil simulasi estimasi parameter model regresi logistik menggunakan pendekatan Monte Carlo, dapat diambil beberapa kesimpulan sebagai berikut:

  1. Ukuran sampel memiliki pengaruh paling besar terhadap kualitas estimasi. Pada sampel kecil (n = 30), hasil estimasi menunjukkan penyimpangan yang sangat besar dan tidak stabil. Sebaliknya, pada sampel besar (n = 500), estimasi menjadi sangat akurat dan mendekati nilai parameter sebenarnya.

  2. Estimator Maximum Likelihood Estimation (MLE) menunjukkan sifat konsisten. Hal ini terlihat dari peningkatan akurasi estimasi seiring bertambahnya ukuran sampel, di mana bias menjadi semakin kecil dan nilai MSE menurun secara signifikan.

  3. Keberadaan outlier memengaruhi hasil estimasi, namun tidak secara ekstrem. Outlier menyebabkan peningkatan bias dan MSE, tetapi estimasi masih tergolong cukup baik dibandingkan kondisi sampel kecil.

  4. Multikolinearitas meningkatkan ketidakstabilan estimasi. Meskipun bias yang dihasilkan relatif kecil, nilai MSE pada skenario multikolinearitas lebih tinggi dibandingkan skenario outlier. Hal ini menunjukkan bahwa hubungan yang kuat antar variabel prediktor dapat meningkatkan variabilitas hasil estimasi.

  5. Secara keseluruhan, hasil simulasi sesuai dengan teori statistika. Estimator MLE bekerja dengan baik pada kondisi data yang ideal, tetapi kinerjanya dapat menurun pada kondisi tertentu seperti ukuran sampel kecil, keberadaan outlier, dan multikolinearitas.