Bootstrap pada Regresi dan Missing Value

set.seed(123)
n <- 100 
x <- rnorm(n, mean = 10, sd = 2) 
y <- 3 + 1.5 * x + rnorm(n, mean = 0, sd = 2) 

data <- data.frame(x, y) 
data[sample(1:n, 10), "x"] <- NA 

head(data)

##           x        y
## 1  8.879049 14.89776
## 2  9.539645 17.82323
## 3 13.117417 22.18274
## 4 10.141017 17.51644
## 5 10.258575 16.48463
## 6 13.430130 23.05514

Praktikum 1: Bootstrap untuk Regresi (tanpa missing value)

# Data Cleaning
clean_data <- na.omit(data) 

# Bootstrap Regresi
boot_regression <- function(data, indices) {
 d <- data[indices, ] 
 model <- lm(y ~ x, data = d) 
 return(coef(model)) 
}

# Bootstrap 1000 replikasi
library(boot) 
boot_result <- boot(
 data = clean_data, 
 statistic = boot_regression, 
 R = 1000 
)
boot_result

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = clean_data, statistic = boot_regression, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 3.581084  0.06067069   1.1482885
## t2* 1.412127 -0.00547455   0.1074228

# Plot distribusi bootstrap
plot(boot_result)

# Hitung confidence interval 95% untuk koefisien x (index=2)
boot.ci(boot_result, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.176,  1.596 )  
## Calculations and Intervals on Original Scale

Praktikum 2: Estimasi pada Missing Value dengan Bootstrap

mean_x <- mean(data$x, na.rm = TRUE) 
data$ximp <- ifelse(is.na(data$x), mean_x, data$x) 

model_imp <- lm(y ~ ximp, data = data) 
summary(model_imp)

## 
## Call:
## lm(formula = y ~ ximp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1153 -1.4394 -0.0902  1.2053  6.5280 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6538     1.2332   2.963  0.00383 ** 
## ximp          1.4121     0.1191  11.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.109 on 98 degrees of freedom
## Multiple R-squared:  0.5891, Adjusted R-squared:  0.5849 
## F-statistic: 140.5 on 1 and 98 DF,  p-value: < 2.2e-16

# Fungsi bootstrap setelah imputasi
boot_imp <- function(data, indices) {
 d <- data[indices, ] 
 model <- lm(y ~ ximp, data = d) 
 return(coef(model)) 
}

boot_result_imp <- boot(data = data, statistic = boot_imp, R = 1000) 

boot_result_imp

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data, statistic = boot_imp, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 3.653794  0.053055397   1.1350004
## t2* 1.412127 -0.005093136   0.1064137

plot(boot_result_imp)

boot.ci(boot_result_imp, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result_imp, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.188,  1.603 )  
## Calculations and Intervals on Original Scale

Praktikum 3: Multiple Imputation + Bootstrap

library(mice)

## Warning: package 'mice' was built under R version 4.5.2

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

# Lakukan multiple imputation (m=5) dengan Predictive Mean Matching
imp <- mice(
 data[, c("x", "y")], 
 m = 5, 
 method = 'pmm', 
 seed = 123 
)

## 
##  iter imp variable
##   1   1  x
##   1   2  x
##   1   3  x
##   1   4  x
##   1   5  x
##   2   1  x
##   2   2  x
##   2   3  x
##   2   4  x
##   2   5  x
##   3   1  x
##   3   2  x
##   3   3  x
##   3   4  x
##   3   5  x
##   4   1  x
##   4   2  x
##   4   3  x
##   4   4  x
##   4   5  x
##   5   1  x
##   5   2  x
##   5   3  x
##   5   4  x
##   5   5  x

imp_data <- complete(imp, "long") 

model_mi <- with(imp, lm(y ~ x)) 
summary(pool(model_mi))

##          term estimate std.error statistic       df      p.value
## 1 (Intercept) 3.619991 1.1112706  3.257524 78.99385 1.657655e-03
## 2           x 1.408248 0.1068028 13.185496 78.10532 1.472407e-21

Gabungan Hasil

library(mice)
library(broom)

## Warning: package 'broom' was built under R version 4.5.1

model_clean <- lm(y ~ x, data = clean_data)
clean_summary <- tidy(model_clean, conf.int = TRUE)
# Model Mean Imputation + Bootstrap
boot_ci <- boot.ci(boot_result_imp, type = "perc", index = 2)
boot_summary <- tidy(model_imp, conf.int = TRUE)
# Model MICE
model_mice <- with(imp, lm(y ~ x))
mice_summary <- summary(pool(model_mice), conf.int = TRUE)
# Membuat data frame yang lebih robust
results_table <- data.frame(
  Metode = c("Data Lengkap", "Mean Imputation + Bootstrap", "MICE"),
  Intercept = c(
    clean_summary$estimate[1],
    boot_summary$estimate[1],
    mice_summary$estimate[1]
  ),
  Slope = c(
    clean_summary$estimate[2],
    boot_summary$estimate[2],
    mice_summary$estimate[2]
  ),
  SE_Slope = c(
    clean_summary$std.error[2],
    boot_summary$std.error[2],
    mice_summary$std.error[2]
  ),
  CI_Slope = c(
    sprintf("(%.3f, %.3f)", clean_summary$conf.low[2], clean_summary$conf.high[2]),
    sprintf("(%.3f, %.3f)", boot_ci$percent[4], boot_ci$percent[5]),
    sprintf("(%.3f, %.3f)", mice_summary$`2.5 %`[2], mice_summary$`97.5 %`[2])
  ),
  stringsAsFactors = FALSE
)

print(results_table)

##                        Metode Intercept    Slope  SE_Slope       CI_Slope
## 1                Data Lengkap  3.581084 1.412127 0.1079083 (1.198, 1.627)
## 2 Mean Imputation + Bootstrap  3.653794 1.412127 0.1191314 (1.188, 1.603)
## 3                        MICE  3.619991 1.408248 0.1068028 (1.196, 1.621)

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.2

# Data untuk plot
results <- data.frame(
 Method = c("Data Lengkap", "Mean Imp + Bootstrap", "MICE"),
 Slope = c(1.412127, 1.412127, 1.408248),
 SE = c(0.1079083, 0.1191314, 0.1068028 ),
 CI_lower = c(1.198, 1.188, 1.196),
 CI_upper = c(1.627, 1.603, 1.621)
)
ggplot(results, aes(x = Method, y = Slope, color = Method)) +
 geom_point(size = 3) +
 geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) +
 labs(title = "Perbandingan Estimasi Slope dengan Berbagai Metode",
 y = "Estimasi Slope (y ~ x)") +
 theme_minimal()

Analisis Perbedaan Estimasi

Estimasi Slope (Koefisien x)

Berdasarkan hasil analisis, ketiga metode menghasilkan slope yang sangat mirip (~ 1.41), dengan perbedaaan < 0.004 (hanya 0.3% variasi). Hal ini mengindikasikan bahwa pola missing value tidak terlalu memengaruhi hubungan x dan y. Selain itu, MICE memberikan slope paling rendaah dengan data yang lengkap dan mean imputation sama (1.412).

Estimasi Intercept

Dari hasil analisis didapat rentang nilai dari 3.581 (data lengkap) hingga 3.654 (mean imputation), dengan perbedaan ~ 0.073 (2% dari nilai intercept). Selain itu mean imputation menghasilkan intercept ke arah rata-rata dan sedikit bias karena pengisian nilai konstan.

Standard Error (SE) Slope

SE data lengkap (0.108) dengan MICE (0.107) menghasilkan hasil yang sangat mirip. Mean imputation memiliki SE lebih besar (0.119), mencerminkan ketidakpastian dari impputasi sederhana dan sesuai ekspektasi teori karena mean imputation meremehkan variabilitas.

Confidence Interval (CI)

Pada data lengkap lebar CI sebesar 0.429 (1.627 - 1.198), pada mean imputation sebesar 0.415 (1.603 - 1.188), sedangkan pada MICE lebar intervalnya sebesar 0.425 (1.621 - 1.196).

Hal ini menunjukkan bahwa mean imputation memiliki CI paling sempit (bertentangan dengan teori), kemungkinan penyebabnya sebagai berikut:

Jumlah bootstrap tidak cukup banyak (misal hanya 100 iterasi)
Mising values sedikit (< 10%) sehingga dampak imputasi minimal
Data missing completely at random (MCAR)