Simulasi ini bertujuan untuk mengevaluasi kinerja estimasi parameter regresi logistik dalam berbagai kondisi: ukuran sampel, outlier, missing value, dan multikolinearitas.
set.seed(42)
beta_true <- c(beta0 = -1.5, beta1 = 2.0, beta2 = -1.0, beta3 = 0.5)
beta_true
## beta0 beta1 beta2 beta3
## -1.5 2.0 -1.0 0.5
generate_data <- function(n, seed_val,
apply_outlier = FALSE,
apply_missing = FALSE,
outlier_pct = 0,
missing_pct = 0,
multicol = FALSE) {
set.seed(seed_val)
# Skenario Multikolinearitas: Membuat x2 sangat bergantung/berkorelasi kuat dengan x1
if (multicol) {
x1 <- rnorm(n)
x2 <- 0.95 * x1 + rnorm(n, 0, 0.1)
x3 <- rnorm(n)
} else {
x1 <- rnorm(n)
x2 <- rnorm(n)
x3 <- rnorm(n)
}
# Transformasi kombinasi linear ke probabilitas menggunakan fungsi inverse-logit
log_odds <- beta_true["beta0"] + beta_true["beta1"]*x1 +
beta_true["beta2"]*x2 + beta_true["beta3"]*x3
prob <- 1 / (1 + exp(-log_odds))
y <- rbinom(n, 1, prob)
df <- data.frame(y, x1, x2, x3)
# Skenario Outlier: Menggeser sebagian data x1 dan x2 menjadi nilai ekstrem (±10)
if (apply_outlier) {
n_out <- max(1, round(n * outlier_pct))
idx <- sample(1:n, n_out)
df$x1[idx] <- df$x1[idx] + sample(c(-10, 10), n_out, replace = TRUE)
df$x2[idx] <- df$x2[idx] + sample(c(-10, 10), n_out, replace = TRUE)
}
# Skenario Missing Value: Mengosongkan data secara acak (MCAR - Missing Completely at Random)
if (apply_missing) {
n_miss <- max(1, round(n * missing_pct))
df$x1[sample(1:n, n_miss)] <- NA
df$x2[sample(1:n, n_miss)] <- NA
}
return(df)
}
estimate_params <- function(df) {
# tryCatch: cegah loop berhenti jika model gagal konvergen (misal sampel terlalu sedikit)
tryCatch({
df_clean <- na.omit(df)
if (nrow(df_clean) < 10) return(rep(NA, 4))
model <- glm(y ~ x1 + x2 + x3, data = df_clean, family = binomial())
coef(model)
}, error = function(e) rep(NA, 4))
}
n_rep <- 50
scenarios <- list(
list(name="Sampel Kecil (n=50)", n=50),
list(name="Sampel Sedang (n=200)", n=200),
list(name="Sampel Besar (n=1000)", n=1000),
list(name="Outlier 5%", n=200, apply_outlier=TRUE, outlier_pct=0.05),
list(name="Outlier 10%", n=200, apply_outlier=TRUE, outlier_pct=0.10),
list(name="Missing 10%", n=200, apply_missing=TRUE, missing_pct=0.10),
list(name="Missing 20%", n=200, apply_missing=TRUE, missing_pct=0.20),
list(name="Multikol Tinggi", n=200, multicol=TRUE)
)
results_all <- list()
for (s in seq_along(scenarios)) {
sc <- scenarios[[s]]
mat <- matrix(NA, nrow = n_rep, ncol = 4)
colnames(mat) <- c("beta0", "beta1", "beta2", "beta3")
for (r in 1:n_rep) {
df <- generate_data(
n = sc$n,
seed_val = r*100 + s,
apply_outlier = ifelse(is.null(sc$apply_outlier), FALSE, sc$apply_outlier),
apply_missing = ifelse(is.null(sc$apply_missing), FALSE, sc$apply_missing),
outlier_pct = ifelse(is.null(sc$outlier_pct), 0, sc$outlier_pct),
missing_pct = ifelse(is.null(sc$missing_pct), 0, sc$missing_pct),
multicol = ifelse(is.null(sc$multicol), FALSE, sc$multicol)
)
est <- estimate_params(df)
if (length(est) == 4) mat[r, ] <- est
}
mean_est <- colMeans(mat, na.rm = TRUE)
bias <- mean_est - beta_true
rmse <- sqrt(colMeans((sweep(mat, 2, beta_true))^2, na.rm = TRUE))
results_all[[s]] <- list(
name = sc$name,
mat = mat,
mean_est = mean_est,
bias = bias,
rmse = rmse
)
}
summary_df <- do.call(rbind, lapply(results_all, function(r) {
data.frame(
Skenario = r$name,
# Mean Estimates
M_B0 = round(r$mean_est[1], 4),
M_B1 = round(r$mean_est[2], 4),
M_B2 = round(r$mean_est[3], 4),
M_B3 = round(r$mean_est[4], 4),
# Bias
B_B0 = round(r$bias[1], 4),
B_B1 = round(r$bias[2], 4),
B_B2 = round(r$bias[3], 4),
B_B3 = round(r$bias[4], 4),
# RMSE
R_B0 = round(r$rmse[1], 4),
R_B1 = round(r$rmse[2], 4),
R_B2 = round(r$rmse[3], 4),
R_B3 = round(r$rmse[4], 4)
)
}))
kable(summary_df,
col.names = c("Skenario",
"Mean $\\beta_0$", "Mean $\\beta_1$", "Mean $\\beta_2$", "Mean $\\beta_3$",
"Bias $\\beta_0$", "Bias $\\beta_1$", "Bias $\\beta_2$", "Bias $\\beta_3$",
"RMSE $\\beta_0$", "RMSE $\\beta_1$", "RMSE $\\beta_2$", "RMSE $\\beta_3$"),
escape = FALSE,
row.names = FALSE,
caption = "Tabel Ringkasan Evaluasi Seluruh Parameter Model",
format = "html",
table.attr = "class='table table-condensed'")
| Skenario | Mean \(\beta_0\) | Mean \(\beta_1\) | Mean \(\beta_2\) | Mean \(\beta_3\) | Bias \(\beta_0\) | Bias \(\beta_1\) | Bias \(\beta_2\) | Bias \(\beta_3\) | RMSE \(\beta_0\) | RMSE \(\beta_1\) | RMSE \(\beta_2\) | RMSE \(\beta_3\) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Sampel Kecil (n=50) | -1.8389 | 2.2622 | -1.2980 | 0.5847 | -0.3389 | 0.2622 | -0.2980 | 0.0847 | 0.7573 | 0.9113 | 0.8087 | 0.5897 |
| Sampel Sedang (n=200) | -1.5932 | 2.1301 | -1.0481 | 0.5699 | -0.0932 | 0.1301 | -0.0481 | 0.0699 | 0.2645 | 0.3181 | 0.2555 | 0.2244 |
| Sampel Besar (n=1000) | -1.4721 | 1.9987 | -0.9962 | 0.4924 | 0.0279 | -0.0013 | 0.0038 | -0.0076 | 0.1177 | 0.1392 | 0.1108 | 0.0666 |
| Outlier 5% | -1.0271 | 0.5767 | -0.4115 | 0.3767 | 0.4729 | -1.4233 | 0.5885 | -0.1233 | 0.5127 | 1.4706 | 0.6917 | 0.2295 |
| Outlier 10% | -0.9060 | 0.1386 | -0.0733 | 0.2812 | 0.5940 | -1.8614 | 0.9267 | -0.2188 | 0.6160 | 1.8654 | 0.9352 | 0.2718 |
| Missing 10% | -1.5880 | 2.0957 | -1.0394 | 0.6160 | -0.0880 | 0.0957 | -0.0394 | 0.1160 | 0.2843 | 0.3575 | 0.2606 | 0.2902 |
| Missing 20% | -1.4753 | 2.0410 | -0.9893 | 0.5592 | 0.0247 | 0.0410 | 0.0107 | 0.0592 | 0.2884 | 0.3478 | 0.3246 | 0.2772 |
| Multikol Tinggi | -1.5324 | 1.9120 | -0.8869 | 0.5370 | -0.0324 | -0.0880 | 0.1131 | 0.0370 | 0.2068 | 1.7243 | 1.7774 | 0.2664 |
par(mar = c(11, 4, 4, 2))
bias_vals <- sapply(results_all, function(r) r$bias["beta1"])
sc_names <- sapply(results_all, function(r) r$name)
barplot(bias_vals, names.arg = sc_names,
main = "Bias Estimasi beta1",
ylab = "Bias", col = "steelblue", las = 2)
abline(h = 0, lty = 2)
par(mar = c(11, 4, 4, 2))
rmse_vals <- sapply(results_all, function(r) r$rmse["beta1"])
barplot(rmse_vals, names.arg = sc_names,
main = "RMSE Estimasi beta1",
ylab = "RMSE", col = "coral", las = 2)
par(mar = c(11, 4, 4, 2))
all_mats <- lapply(results_all, function(r) r$mat[, "beta1"])
names(all_mats) <- sapply(results_all, function(r) substr(r$name, 1, 20))
boxplot(all_mats,
main = "Distribusi Estimasi beta1",
ylab = "Estimasi beta1",
col = "lightblue", las = 2)
abline(h = beta_true["beta1"], col = "red", lty = 2)