# ==================================================
# SIMULASI REGRESI LOGISTIK
# UTS Pemodelan Statistika dan Simulasi
# ==================================================

# 1. SETUP ----
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)

# Set seed untuk reproduksibilitas
set.seed(2025)
# 2. PARAMETER TRUE ----
beta0_true <- 0.5
beta1_true <- 1.0
beta2_true <- -0.8
# 3. FUNGSI GENERATE DATA ----
generate_data <- function(n, outlier_pct = 0) {
  # Bangkitkan prediktor
  X1 <- rnorm(n, 0, 1)
  X2 <- rnorm(n, 0, 1)
  
  # Tambahkan outlier pada X1 jika diperlukan
  if (outlier_pct > 0) {
    n_out <- floor(n * outlier_pct / 100)
    if (n_out > 0) {
      idx <- sample(1:n, n_out)
      X1[idx] <- rnorm(n_out, mean = 5, sd = 1)  # outlier ekstrim
    }
  }
  
  # Hitung probabilitas dan bangkitkan Y
  lin_pred <- beta0_true + beta1_true * X1 + beta2_true * X2
  prob <- 1 / (1 + exp(-lin_pred))
  Y <- rbinom(n, 1, prob)
  
  return(data.frame(Y = Y, X1 = X1, X2 = X2))
}
# 4. SKENARIO SIMULASI ----
n_vals <- c(50, 100, 200)      # ukuran sampel
outlier_vals <- c(0, 5, 10)    # persentase outlier
R <- 50                         # jumlah replikasi
# 5. LOOP SIMULASI ----
hasil <- data.frame()

for (n in n_vals) {
  for (out_pct in outlier_vals) {
    for (r in 1:R) {
      # Bangkitkan data
      data <- generate_data(n, out_pct)
      
      # Estimasi parameter dengan MLE
      fit <- glm(Y ~ X1 + X2, data = data, family = binomial)
      coefs <- coef(fit)
      
      # Simpan hasil
      hasil <- rbind(hasil, data.frame(
        n = n,
        outlier_pct = out_pct,
        replikasi = r,
        beta0_est = coefs[1],
        beta1_est = coefs[2],
        beta2_est = coefs[3]
      ))
    }
  }
}
# 6. HITUNG BIAS ----
hasil_bias <- hasil %>%
  mutate(
    bias0 = beta0_est - beta0_true,
    bias1 = beta1_est - beta1_true,
    bias2 = beta2_est - beta2_true
  )
# 7. RINGKASAN PER SKENARIO ----
ringkasan <- hasil_bias %>%
  group_by(n, outlier_pct) %>%
  summarise(
    Bias_beta0 = mean(bias0, na.rm = TRUE),
    Bias_beta1 = mean(bias1, na.rm = TRUE),
    Bias_beta2 = mean(bias2, na.rm = TRUE),
    .groups = "drop"
  )

# Tampilkan hasil
print("=== RINGKASAN BIAS PER SKENARIO ===")
## [1] "=== RINGKASAN BIAS PER SKENARIO ==="
print(ringkasan)
## # A tibble: 9 × 5
##       n outlier_pct Bias_beta0 Bias_beta1 Bias_beta2
##   <dbl>       <dbl>      <dbl>      <dbl>      <dbl>
## 1    50           0     0.0495     0.204    -0.119  
## 2    50           5    -0.0430     0.124     0.0280 
## 3    50          10     0.0634     0.240    -0.0954 
## 4   100           0    -0.0332     0.0237   -0.0797 
## 5   100           5    -0.0187     0.0600   -0.0301 
## 6   100          10     0.0228     0.0307    0.0160 
## 7   200           0     0.0400     0.0332   -0.00849
## 8   200           5     0.0511     0.0445   -0.0687 
## 9   200          10     0.0324     0.0455   -0.0554
# 8. VISUALISASI ----
# Grafik bias beta1
ggplot(ringkasan, aes(x = factor(n), y = Bias_beta1, fill = factor(outlier_pct))) +
  geom_col(position = "dodge", width = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Bias Estimasi Beta1 berdasarkan Ukuran Sampel dan Outlier",
    x = "Ukuran Sampel (n)",
    y = "Rata-rata Bias",
    fill = "Outlier (%)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")