Keg 8 - Bootstrap pada Regresi dan Missing Value

# LOAD LIBRARY

library(boot)
library(mice)

## Warning: package 'mice' was built under R version 4.4.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(broom)

## Warning: package 'broom' was built under R version 4.4.3

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

# SIMULASI DATA

set.seed(456)

# jumlah observasi
n <- 120

# variabel x
x <- rnorm(n, mean = 12, sd = 3)

# variabel y
y <- 5 + 1.8*x + rnorm(n, mean = 0, sd = 3)

# gabungkan data
data <- data.frame(x,y)

# tambahkan missing value pada x
data[sample(1:n,15), "x"] <- NA

# lihat data
head(data)

##           x        y
## 1  7.969436 18.51386
## 2 13.865327 28.03350
## 3        NA 33.49535
## 4  7.833323 20.04151
## 5  9.856929 26.40986
## 6        NA 21.47109

summary(data)

##        x                y        
##  Min.   : 5.236   Min.   :10.26  
##  1st Qu.:10.018   1st Qu.:22.73  
##  Median :12.207   Median :27.36  
##  Mean   :12.153   Mean   :27.02  
##  3rd Qu.:14.431   3rd Qu.:31.76  
##  Max.   :18.840   Max.   :39.87  
##  NA's   :15

# PRAKTIKUM 1: BOOTSTRAP REGRESI (DATA LENGKAP)

# hapus missing value
clean_data <- na.omit(data)

# fungsi bootstrap
boot_regression <- function(data, indices){
  
  d <- data[indices,]
  
  model <- lm(y ~ x, data = d)
  
  return(coef(model))
}

# bootstrap 1000 kali
boot_result <- boot(
  data = clean_data,
  statistic = boot_regression,
  R = 1000
)

# hasil bootstrap
boot_result

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = clean_data, statistic = boot_regression, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 5.255779 -0.027817742  1.24016720
## t2* 1.767315  0.003654091  0.09642658

# plot bootstrap
plot(boot_result)

# confidence interval slope
boot.ci(
  boot_result,
  type = "perc",
  index = 2
)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.585,  1.965 )  
## Calculations and Intervals on Original Scale

# INTERPRETASI:
# Bootstrap digunakan untuk mengetahui distribusi
# koefisien regresi dan confidence interval

# PRAKTIKUM 2: MEAN IMPUTATION + BOOTSTRAP

# hitung mean x
mean_x <- mean(data$x, na.rm = TRUE)

# imputasi mean
data$ximp <- ifelse(
  is.na(data$x),
  mean_x,
  data$x
)

# model regresi
model_imp <- lm(y ~ ximp, data = data)

summary(model_imp)

## 
## Call:
## lm(formula = y ~ ximp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2270 -2.2998 -0.2632  2.3218 12.8551 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.5404     1.4296   3.875 0.000175 ***
## ximp          1.7673     0.1147  15.409  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.481 on 118 degrees of freedom
## Multiple R-squared:  0.668,  Adjusted R-squared:  0.6652 
## F-statistic: 237.5 on 1 and 118 DF,  p-value: < 2.2e-16

# fungsi bootstrap imputasi
boot_imp <- function(data, indices){
  
  d <- data[indices,]
  
  model <- lm(y ~ ximp, data = d)
  
  return(coef(model))
}

# bootstrap
boot_result_imp <- boot(
  data = data,
  statistic = boot_imp,
  R = 1000
)

# hasil
boot_result_imp

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data, statistic = boot_imp, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 5.540414  0.035853077   1.3115391
## t2* 1.767315 -0.002295576   0.1023997

# CI
boot_ci <- boot.ci(
  boot_result_imp,
  type = "perc",
  index = 2
)

boot_ci

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result_imp, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.571,  1.981 )  
## Calculations and Intervals on Original Scale

# INTERPRETASI:
# Missing value diganti dengan rata-rata x
# lalu dilakukan bootstrap kembali

# PRAKTIKUM 3: MULTIPLE IMPUTATION (MICE)

imp <- mice(
  data[,c("x","y")],
  m = 5,
  method = "pmm",
  seed = 456
)

## 
##  iter imp variable
##   1   1  x
##   1   2  x
##   1   3  x
##   1   4  x
##   1   5  x
##   2   1  x
##   2   2  x
##   2   3  x
##   2   4  x
##   2   5  x
##   3   1  x
##   3   2  x
##   3   3  x
##   3   4  x
##   3   5  x
##   4   1  x
##   4   2  x
##   4   3  x
##   4   4  x
##   4   5  x
##   5   1  x
##   5   2  x
##   5   3  x
##   5   4  x
##   5   5  x

# model tiap imputasi
model_mice <- with(
  imp,
  lm(y ~ x)
)

# gabungkan hasil
mice_summary <- summary(
  pool(model_mice),
  conf.int = TRUE
)

mice_summary

##          term estimate  std.error statistic       df      p.value    2.5 %
## 1 (Intercept) 5.489190 1.17403860  4.675476 89.67545 1.029672e-05 3.156642
## 2           x 1.754618 0.09293205 18.880651 89.91011 4.864461e-33 1.569989
##     97.5 % conf.low conf.high
## 1 7.821737 3.156642  7.821737
## 2 1.939246 1.569989  1.939246

# INTERPRETASI:
# MICE lebih baik dibanding mean imputation
# karena mempertahankan variasi data

# PERBANDINGAN HASIL

# model data lengkap
model_clean <- lm(y ~ x, data = clean_data)

clean_summary <- tidy(
  model_clean,
  conf.int = TRUE
)

# model mean imputasi
boot_summary <- tidy(
  model_imp,
  conf.int = TRUE
)

# tabel perbandingan
results_table <- data.frame(
  
  Metode = c(
    "Data Lengkap",
    "Mean Imputation + Bootstrap",
    "MICE"
  ),
  
  Intercept = c(
    clean_summary$estimate[1],
    boot_summary$estimate[1],
    mice_summary$estimate[1]
  ),
  
  Slope = c(
    clean_summary$estimate[2],
    boot_summary$estimate[2],
    mice_summary$estimate[2]
  ),
  
  SE_Slope = c(
    clean_summary$std.error[2],
    boot_summary$std.error[2],
    mice_summary$std.error[2]
  ),
  
  CI_Slope = c(
    paste0(
      "(",
      round(clean_summary$conf.low[2],3),
      ", ",
      round(clean_summary$conf.high[2],3),
      ")"
    ),
    
    paste0(
      "(",
      round(boot_ci$percent[4],3),
      ", ",
      round(boot_ci$percent[5],3),
      ")"
    ),
    
    paste0(
      "(",
      round(mice_summary$`2.5 %`[2],3),
      ", ",
      round(mice_summary$`97.5 %`[2],3),
      ")"
    )
  )
)

results_table

##                        Metode Intercept    Slope   SE_Slope       CI_Slope
## 1                Data Lengkap  5.255779 1.767315 0.09638160 (1.576, 1.958)
## 2 Mean Imputation + Bootstrap  5.540414 1.767315 0.11469062 (1.571, 1.981)
## 3                        MICE  5.489190 1.754618 0.09293205  (1.57, 1.939)

# INTERPRETASI:
# Dibandingkan hasil ketiga metode:
# 1. Data lengkap
# 2. Mean imputation + bootstrap
# 3. MICE

# VISUALISASI

results <- data.frame(
  Method = c(
    "Data Lengkap",
    "Mean Imputation",
    "MICE"
  ),
  
  Slope = c(
    clean_summary$estimate[2],
    boot_summary$estimate[2],
    mice_summary$estimate[2]
  ),
  
  SE = c(
    clean_summary$std.error[2],
    boot_summary$std.error[2],
    mice_summary$std.error[2]
  ),
  
  CI_lower = c(
    clean_summary$conf.low[2],
    boot_ci$percent[4],
    mice_summary$`2.5 %`[2]
  ),
  
  CI_upper = c(
    clean_summary$conf.high[2],
    boot_ci$percent[5],
    mice_summary$`97.5 %`[2]
  )
)

ggplot(results,
       aes(x=Method,
           y=Slope,
           color=Method))+
  geom_point(size=3)+
  geom_errorbar(
    aes(
      ymin=CI_lower,
      ymax=CI_upper
    ),
    width=0.2
  )+
  labs(
    title="Perbandingan Estimasi Slope",
    y="Nilai Slope"
  )+
  theme_minimal()

# KESIMPULAN

cat("1. Ketiga metode menghasilkan slope yang relatif mirip\n")

## 1. Ketiga metode menghasilkan slope yang relatif mirip

cat("2. Mean imputation cenderung lebih sederhana namun bisa bias\n")

## 2. Mean imputation cenderung lebih sederhana namun bisa bias

cat("3. MICE lebih baik karena mempertahankan variasi data\n")

## 3. MICE lebih baik karena mempertahankan variasi data

cat("4. Bootstrap membantu mengestimasi ketidakpastian parameter\n")

## 4. Bootstrap membantu mengestimasi ketidakpastian parameter

Keg 8 - Bootstrap pada Regresi dan Missing Value

Muharima Sahara (2404220039)

2026-05-02