Pendahuluan

Simulasi ini bertujuan untuk mengevaluasi kinerja estimasi parameter regresi logistik dalam berbagai kondisi: ukuran sampel, outlier, missing value, dan multikolinearitas.


1. Atur Seed untuk Reproductibility

set.seed(42)

2. Parameter Sebenarnya

beta_true <- c(beta0 = -1.5, beta1 = 2.0, beta2 = -1.0, beta3 = 0.5)
beta_true
## beta0 beta1 beta2 beta3 
##  -1.5   2.0  -1.0   0.5

3. Fungsi Pembangkitan Data & Estimasi

generate_data <- function(n, seed_val,
                          apply_outlier = FALSE,
                          apply_missing = FALSE,
                          outlier_pct = 0,
                          missing_pct = 0,
                          multicol = FALSE) {
  set.seed(seed_val)

  # Skenario Multikolinearitas: Membuat x2 sangat bergantung/berkorelasi kuat dengan x1
  if (multicol) {
    x1 <- rnorm(n)
    x2 <- 0.95 * x1 + rnorm(n, 0, 0.1)
    x3 <- rnorm(n)
  } else {
    x1 <- rnorm(n)
    x2 <- rnorm(n)
    x3 <- rnorm(n)
  }

  # Transformasi kombinasi linear ke probabilitas menggunakan fungsi inverse-logit
  log_odds <- beta_true["beta0"] + beta_true["beta1"]*x1 +
              beta_true["beta2"]*x2 + beta_true["beta3"]*x3
  prob <- 1 / (1 + exp(-log_odds))
  y <- rbinom(n, 1, prob)

  df <- data.frame(y, x1, x2, x3)

  # Skenario Outlier: Menggeser sebagian data x1 dan x2 menjadi nilai ekstrem (±10)
  if (apply_outlier) {
    n_out <- max(1, round(n * outlier_pct))
    idx <- sample(1:n, n_out)
    df$x1[idx] <- df$x1[idx] + sample(c(-10, 10), n_out, replace = TRUE)
    df$x2[idx] <- df$x2[idx] + sample(c(-10, 10), n_out, replace = TRUE)
  }

  # Skenario Missing Value: Mengosongkan data secara acak (MCAR - Missing Completely at Random)
  if (apply_missing) {
    n_miss <- max(1, round(n * missing_pct))
    df$x1[sample(1:n, n_miss)] <- NA
    df$x2[sample(1:n, n_miss)] <- NA
  }

  return(df)
}

estimate_params <- function(df) {
  # tryCatch: cegah loop berhenti jika model gagal konvergen (misal sampel terlalu sedikit)
  tryCatch({
    df_clean <- na.omit(df)
    if (nrow(df_clean) < 10) return(rep(NA, 4))
    model <- glm(y ~ x1 + x2 + x3, data = df_clean, family = binomial())
    coef(model)
  }, error = function(e) rep(NA, 4))
}

4. Desain Simulasi

n_rep <- 50
scenarios <- list(
  list(name="Sampel Kecil (n=50)",    n=50),
  list(name="Sampel Sedang (n=200)",  n=200),
  list(name="Sampel Besar (n=1000)",  n=1000),

  list(name="Outlier 5%",  n=200, apply_outlier=TRUE, outlier_pct=0.05),
  list(name="Outlier 10%", n=200, apply_outlier=TRUE, outlier_pct=0.10),

  list(name="Missing 10%", n=200, apply_missing=TRUE, missing_pct=0.10),
  list(name="Missing 20%", n=200, apply_missing=TRUE, missing_pct=0.20),

  list(name="Multikol Tinggi", n=200, multicol=TRUE)
)

5. Proses Simulasi Monte Carlo

results_all <- list()

for (s in seq_along(scenarios)) {
  sc <- scenarios[[s]]

  mat <- matrix(NA, nrow = n_rep, ncol = 4)
  colnames(mat) <- c("beta0", "beta1", "beta2", "beta3")

  for (r in 1:n_rep) {

    df <- generate_data(
      n = sc$n,
      seed_val = r*100 + s,
      apply_outlier = ifelse(is.null(sc$apply_outlier), FALSE, sc$apply_outlier),
      apply_missing = ifelse(is.null(sc$apply_missing), FALSE, sc$apply_missing),
      outlier_pct   = ifelse(is.null(sc$outlier_pct), 0, sc$outlier_pct),
      missing_pct   = ifelse(is.null(sc$missing_pct), 0, sc$missing_pct),
      multicol      = ifelse(is.null(sc$multicol), FALSE, sc$multicol)
    )

    est <- estimate_params(df)
    if (length(est) == 4) mat[r, ] <- est
  }

  mean_est <- colMeans(mat, na.rm = TRUE)
  bias <- mean_est - beta_true
  rmse <- sqrt(colMeans((sweep(mat, 2, beta_true))^2, na.rm = TRUE))

  results_all[[s]] <- list(
    name = sc$name,
    mat = mat,
    mean_est = mean_est,
    bias = bias,
    rmse = rmse
  )
}

6. Ringkasan Hasil

summary_df <- do.call(rbind, lapply(results_all, function(r) {
  data.frame(
    Skenario    = r$name,
    # Mean Estimates
    M_B0 = round(r$mean_est[1], 4),
    M_B1 = round(r$mean_est[2], 4),
    M_B2 = round(r$mean_est[3], 4),
    M_B3 = round(r$mean_est[4], 4),
    # Bias
    B_B0 = round(r$bias[1], 4),
    B_B1 = round(r$bias[2], 4),
    B_B2 = round(r$bias[3], 4),
    B_B3 = round(r$bias[4], 4),
    # RMSE
    R_B0 = round(r$rmse[1], 4),
    R_B1 = round(r$rmse[2], 4),
    R_B2 = round(r$rmse[3], 4),
    R_B3 = round(r$rmse[4], 4)
  )
}))

kable(summary_df, 
      col.names = c("Skenario", 
                    "Mean $\\beta_0$", "Mean $\\beta_1$", "Mean $\\beta_2$", "Mean $\\beta_3$",
                    "Bias $\\beta_0$", "Bias $\\beta_1$", "Bias $\\beta_2$", "Bias $\\beta_3$",
                    "RMSE $\\beta_0$", "RMSE $\\beta_1$", "RMSE $\\beta_2$", "RMSE $\\beta_3$"),
      escape = FALSE, 
      row.names = FALSE,
      caption = "Tabel Ringkasan Evaluasi Seluruh Parameter Model",
      format = "html",
      table.attr = "class='table table-condensed'")
Tabel Ringkasan Evaluasi Seluruh Parameter Model
Skenario Mean \(\beta_0\) Mean \(\beta_1\) Mean \(\beta_2\) Mean \(\beta_3\) Bias \(\beta_0\) Bias \(\beta_1\) Bias \(\beta_2\) Bias \(\beta_3\) RMSE \(\beta_0\) RMSE \(\beta_1\) RMSE \(\beta_2\) RMSE \(\beta_3\)
Sampel Kecil (n=50) -1.8389 2.2622 -1.2980 0.5847 -0.3389 0.2622 -0.2980 0.0847 0.7573 0.9113 0.8087 0.5897
Sampel Sedang (n=200) -1.5932 2.1301 -1.0481 0.5699 -0.0932 0.1301 -0.0481 0.0699 0.2645 0.3181 0.2555 0.2244
Sampel Besar (n=1000) -1.4721 1.9987 -0.9962 0.4924 0.0279 -0.0013 0.0038 -0.0076 0.1177 0.1392 0.1108 0.0666
Outlier 5% -1.0271 0.5767 -0.4115 0.3767 0.4729 -1.4233 0.5885 -0.1233 0.5127 1.4706 0.6917 0.2295
Outlier 10% -0.9060 0.1386 -0.0733 0.2812 0.5940 -1.8614 0.9267 -0.2188 0.6160 1.8654 0.9352 0.2718
Missing 10% -1.5880 2.0957 -1.0394 0.6160 -0.0880 0.0957 -0.0394 0.1160 0.2843 0.3575 0.2606 0.2902
Missing 20% -1.4753 2.0410 -0.9893 0.5592 0.0247 0.0410 0.0107 0.0592 0.2884 0.3478 0.3246 0.2772
Multikol Tinggi -1.5324 1.9120 -0.8869 0.5370 -0.0324 -0.0880 0.1131 0.0370 0.2068 1.7243 1.7774 0.2664

7. Visualisasi

7.1 Bias beta1

par(mar = c(11, 4, 4, 2))

bias_vals <- sapply(results_all, function(r) r$bias["beta1"])
sc_names  <- sapply(results_all, function(r) r$name)

barplot(bias_vals, names.arg = sc_names,
        main = "Bias Estimasi beta1",
        ylab = "Bias", col = "steelblue", las = 2)
abline(h = 0, lty = 2)

7.2 RMSE beta1

par(mar = c(11, 4, 4, 2))

rmse_vals <- sapply(results_all, function(r) r$rmse["beta1"])

barplot(rmse_vals, names.arg = sc_names,
        main = "RMSE Estimasi beta1",
        ylab = "RMSE", col = "coral", las = 2)

7.3 Boxplot Estimasi

par(mar = c(11, 4, 4, 2))

all_mats <- lapply(results_all, function(r) r$mat[, "beta1"])
names(all_mats) <- sapply(results_all, function(r) substr(r$name, 1, 20))

boxplot(all_mats,
        main = "Distribusi Estimasi beta1",
        ylab = "Estimasi beta1",
        col = "lightblue", las = 2)

abline(h = beta_true["beta1"], col = "red", lty = 2)