Week 12 - Bootstrap pada Regresi dan Missing Value

# Load package
library(boot)
library(mice)

## Warning: package 'mice' was built under R version 4.4.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(broom)
library(ggplot2)

Bootstrap dalam Regresi & Estimasi Missing Value

# Dataset Simulasi
set.seed(123)
n <- 100
x <- rnorm(n, mean = 10, sd = 2)
y <- 3 + 1.5 * x + rnorm(n, mean = 0, sd = 2)
data <- data.frame(x, y)
data[sample(1:n, 10), "x"] <- NA
head(data)

##           x        y
## 1  8.879049 14.89776
## 2  9.539645 17.82323
## 3 13.117417 22.18274
## 4 10.141017 17.51644
## 5 10.258575 16.48463
## 6 13.430130 23.05514

Penjelasan: - set.seed(123): menjamin hasil random yang konsisten. - rnorm(): menghasilkan data dari distribusi normal. - Hubungan antara y dan x: \(y = 3 + 1.5x + error\) - sample(): memilih 10 baris secara acak untuk dijadikan NA.

Praktikum 1: Bootstrap untuk Regresi (tanpa missing)

# Hapus baris yang mengandung NA
clean_data <- na.omit(data)

# Fungsi bootstrap regresi
boot_regression <- function(data, indices) {
  d <- data[indices, ]
  model <- lm(y ~ x, data = d)
  return(coef(model))
}

# Jalankan bootstrap
boot_result <- boot(
  data = clean_data,
  statistic = boot_regression,
  R = 1000
)

# Output hasil
boot_result

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = clean_data, statistic = boot_regression, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 3.581084  0.06067069   1.1482885
## t2* 1.412127 -0.00547455   0.1074228

plot(boot_result)

# Confidence Interval
boot.ci(boot_result, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.176,  1.596 )  
## Calculations and Intervals on Original Scale

Penjelasan: - na.omit(): hapus data yang mengandung NA. - boot(): bootstrap 1000 kali. - coef(): ambil hasil estimasi koefisien model. - boot.ci(): hitung confidence interval.

Praktikum 2: Estimasi Missing Value + Bootstrap

mean_x <- mean(data$x, na.rm = TRUE)
data$ximp <- ifelse(is.na(data$x), mean_x, data$x)
model_imp <- lm(y ~ ximp, data = data)
summary(model_imp)

## 
## Call:
## lm(formula = y ~ ximp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1153 -1.4394 -0.0902  1.2053  6.5280 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6538     1.2332   2.963  0.00383 ** 
## ximp          1.4121     0.1191  11.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.109 on 98 degrees of freedom
## Multiple R-squared:  0.5891, Adjusted R-squared:  0.5849 
## F-statistic: 140.5 on 1 and 98 DF,  p-value: < 2.2e-16

boot_imp <- function(data, indices) {
  d <- data[indices, ]
  model <- lm(y ~ ximp, data = d)
  return(coef(model))
}

boot_result_imp <- boot(data = data, statistic = boot_imp, R = 1000)
boot_result_imp

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data, statistic = boot_imp, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 3.653794  0.053055397   1.1350004
## t2* 1.412127 -0.005093136   0.1064137

plot(boot_result_imp)

boot.ci(boot_result_imp, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result_imp, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.188,  1.603 )  
## Calculations and Intervals on Original Scale

Catatan: - mean(na.rm = TRUE): menghitung mean tanpa NA. - ifelse(): mengganti NA dengan mean. - Bootstrap tetap dilakukan seperti sebelumnya, tetapi data sudah diimputasi.

Praktikum 3: Multiple Imputation (MICE)

imp <- mice(data[ , c("x", "y")], m = 5, method = 'pmm', seed = 123)

## 
##  iter imp variable
##   1   1  x
##   1   2  x
##   1   3  x
##   1   4  x
##   1   5  x
##   2   1  x
##   2   2  x
##   2   3  x
##   2   4  x
##   2   5  x
##   3   1  x
##   3   2  x
##   3   3  x
##   3   4  x
##   3   5  x
##   4   1  x
##   4   2  x
##   4   3  x
##   4   4  x
##   4   5  x
##   5   1  x
##   5   2  x
##   5   3  x
##   5   4  x
##   5   5  x

imp_data <- complete(imp, "long")
model_mi <- with(imp, lm(y ~ x))
summary(pool(model_mi))

##          term estimate std.error statistic       df      p.value
## 1 (Intercept) 3.619991 1.1112706  3.257524 78.99385 1.657655e-03
## 2           x 1.408248 0.1068028 13.185496 78.10532 1.472407e-21

Penjelasan: - mice(): melakukan multiple imputation (m = 5) dengan metode PMM. - with() + pool(): estimasi model dan gabungkan hasil imputasi.

Gabungan Hasil

# Model Lengkap
model_clean <- lm(y ~ x, data = clean_data)
clean_summary <- tidy(model_clean, conf.int = TRUE)

# Model Mean Imputation
boot_ci <- boot.ci(boot_result_imp, type = "perc", index = 2)
boot_summary <- tidy(model_imp, conf.int = TRUE)

# Model MICE
model_mice <- with(imp, lm(y ~ x))
mice_summary <- summary(pool(model_mice), conf.int = TRUE)

# Gabung tabel hasil
results_table <- data.frame(
  Metode = c("Data Lengkap", "Mean Imputation + Bootstrap", "MICE"),
  Intercept = c(
    clean_summary$estimate[1], 
    boot_summary$estimate[1], 
    mice_summary$estimate[1]
  ),
  Slope = c(
    clean_summary$estimate[2], 
    boot_summary$estimate[2], 
    mice_summary$estimate[2]
  ),
  SE_Slope = c(
    clean_summary$std.error[2], 
    boot_summary$std.error[2], 
    mice_summary$std.error[2]
  ),
  CI_Slope = c(
    sprintf("(%.3f, %.3f)", clean_summary$conf.low[2], clean_summary$conf.high[2]),
    sprintf("(%.3f, %.3f)", boot_ci$percent[4], boot_ci$percent[5]),
    sprintf("(%.3f, %.3f)", mice_summary$`2.5 %`[2], mice_summary$`97.5 %`[2])
  ),
  stringsAsFactors = FALSE
)

print(results_table)

##                        Metode Intercept    Slope  SE_Slope       CI_Slope
## 1                Data Lengkap  3.581084 1.412127 0.1079083 (1.198, 1.627)
## 2 Mean Imputation + Bootstrap  3.653794 1.412127 0.1191314 (1.188, 1.603)
## 3                        MICE  3.619991 1.408248 0.1068028 (1.196, 1.621)

Visualisasi Perbandingan Estimasi

results <- data.frame(
  Method = c("Data Lengkap", "Mean Imp + Bootstrap", "MICE"),
  Slope = c(1.412127, 1.412127, 1.408248),
  SE = c(0.1079083, 0.1191314, 0.1068028),
  CI_lower = c(1.198, 1.188, 1.196),
  CI_upper = c(1.627, 1.603, 1.621)
)

ggplot(results, aes(x = Method, y = Slope, color = Method)) + 
  geom_point(size = 3) + 
  geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) + 
  labs(title = "Perbandingan Estimasi Slope dengan Berbagai Metode", 
       y = "Estimasi Slope (y ~ x)") + 
  theme_minimal()

Analisis Perbandingan Estimasi

Estimasi Slope dari ketiga metode sangat konsisten (~1.41), menunjukkan missing value tidak terlalu memengaruhi hasil.
Intercept berbeda lebih nyata, menunjukkan bahwa imputasi (terutama mean) bisa memengaruhi lokasi regresi.
Standard Error (SE) untuk mean imputation lebih besar, mencerminkan ketidakpastian dari teknik ini.
Confidence Interval (CI) paling sempit justru pada mean imputation, bisa jadi karena jumlah missing sedikit atau efek bootstrap.
MICE dianggap pendekatan paling robust karena memperhitungkan ketidakpastian dari imputasi dan menghasilkan estimasi paling stabil.