set.seed(123)
n <- 100

# Generate variabel x dari distribusi normal (mean=10, sd=2)
x <- rnorm(n, mean = 10, sd = 2)

# Generate variabel y dengan pola hubungan linear terhadap x plus error
y <- 3 + 1.5 * x + rnorm(n, mean = 0, sd = 2)

# Gabungkan menjadi data frame
data <- data.frame(x, y)

# Introduksi missing value secara acak pada 10 observasi x
data[sample(1:n, 10), "x"] <- NA
head(data)

##           x        y
## 1  8.879049 14.89776
## 2  9.539645 17.82323
## 3 13.117417 22.18274
## 4 10.141017 17.51644
## 5 10.258575 16.48463
## 6 13.430130 23.05514

1. Boostrap untuk regresi (tanpa missing)

clean_data <- na.omit(data)

# Fungsi untuk bootstrap regresi
boot_regression <- function(data, indices) {
 # Ambil sampel bootstrap sesuai indices
 d <- data[indices, ]
 # Fit model regresi linear
 model <- lm(y ~ x, data = d)
 # Return koefisien model
 return(coef(model))
}

# Load library boot
library(boot)

# Lakukan bootstrap dengan 1000 replikasi
boot_result <- boot(
 data = clean_data,
 statistic = boot_regression,
 R = 1000
)
boot_result

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = clean_data, statistic = boot_regression, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 3.581084  0.06067069   1.1482885
## t2* 1.412127 -0.00547455   0.1074228

plot distribusi bootstrap

plot(boot_result)

#Hitung confidence interval 95% untuk koefisien x (index=2)

boot.ci(boot_result, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.176,  1.596 )  
## Calculations and Intervals on Original Scale

Hasil bootstrap menunjukkan bahwa estimasi slope berada di sekitar 1.41 dengan interval kepercayaan yang relatif sempit, sehingga hubungan linear antara x dan y dapat dianggap stabil. Nilai standard error yang kecil mengindikasikan bahwa estimasi koefisien memiliki tingkat ketidakpastian yang rendah pada data tanpa missing value.

2. Estimasi pada Missing Value dengan Bootstrap

# Hitung mean x (abaikan NA)
mean_x <- mean(data$x, na.rm = TRUE)

# Buat variabel baru dengan imputasi mean
data$ximp <- ifelse(is.na(data$x), mean_x, data$x)

# Fit model setelah imputasi
model_imp <- lm(y ~ ximp, data = data)
summary(model_imp)

## 
## Call:
## lm(formula = y ~ ximp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1153 -1.4394 -0.0902  1.2053  6.5280 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6538     1.2332   2.963  0.00383 ** 
## ximp          1.4121     0.1191  11.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.109 on 98 degrees of freedom
## Multiple R-squared:  0.5891, Adjusted R-squared:  0.5849 
## F-statistic: 140.5 on 1 and 98 DF,  p-value: < 2.2e-16

# Fungsi bootstrap setelah imputasi
boot_imp <- function(data, indices) {
 d <- data[indices, ]
 model <- lm(y ~ ximp, data = d)
 return(coef(model))
}

# Jalankan bootstrap
boot_result_imp <- boot(data = data, statistic = boot_imp, R = 1000)
boot_result_imp

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data, statistic = boot_imp, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 3.653794  0.053055397   1.1350004
## t2* 1.412127 -0.005093136   0.1064137

plot(boot_result_imp)

boot.ci(boot_result_imp, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result_imp, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.188,  1.603 )  
## Calculations and Intervals on Original Scale

Setelah dilakukan mean imputation, nilai slope tetap berada di kisaran yang sama dengan data lengkap, namun intercept mengalami peningkatan. Hal ini menunjukkan bahwa imputasi dengan nilai rata-rata tidak banyak memengaruhi kemiringan hubungan, tetapi menggeser posisi garis regresi. Standard error yang lebih besar mengindikasikan adanya ketidakstabilan tambahan akibat metode imputasi sederhana.

3 Multiple Imputation + Bootstrap

library(mice)

## Warning: package 'mice' was built under R version 4.4.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

# Lakukan multiple imputation (m=5) dengan Predictive Mean Matching
imp <- mice(
 data[, c("x", "y")],
 m = 5,
 method = 'pmm',
 seed = 123
)

## 
##  iter imp variable
##   1   1  x
##   1   2  x
##   1   3  x
##   1   4  x
##   1   5  x
##   2   1  x
##   2   2  x
##   2   3  x
##   2   4  x
##   2   5  x
##   3   1  x
##   3   2  x
##   3   3  x
##   3   4  x
##   3   5  x
##   4   1  x
##   4   2  x
##   4   3  x
##   4   4  x
##   4   5  x
##   5   1  x
##   5   2  x
##   5   3  x
##   5   4  x
##   5   5  x

# Gabungkan dataset imputasi dalam long format
imp_data <- complete(imp, "long")

# Fit model di setiap dataset imputasi dan gabungkan hasilnya
model_mi <- with(imp, lm(y ~ x))
summary(pool(model_mi))

##          term estimate std.error statistic       df      p.value
## 1 (Intercept) 3.619991 1.1112706  3.257524 78.99385 1.657655e-03
## 2           x 1.408248 0.1068028 13.185496 78.10532 1.472407e-21

Metode multiple imputation (MICE) menghasilkan estimasi koefisien yang sangat mendekati data lengkap baik dari sisi slope maupun standard error. Hal ini menunjukkan bahwa MICE mampu mempertahankan struktur variabilitas data dan memberikan estimasi yang lebih konsisten dibandingkan mean imputation.

gabungkan hasil

library(mice)
library(broom)

## Warning: package 'broom' was built under R version 4.4.3

# 1. Model Data Lengkap
model_clean <- lm(y ~ x, data = clean_data)
clean_summary <- tidy(model_clean, conf.int = TRUE)

# 2. Model Mean Imputation + Bootstrap
# Asumsi boot_result_imp sudah dibuat sebelumnya
boot_ci <- boot.ci(boot_result_imp, type = "perc", index = 2)
boot_summary <- tidy(model_imp, conf.int = TRUE)

# 3. Model MICE
model_mice <- with(imp, lm(y ~ x))
mice_summary <- summary(pool(model_mice), conf.int = TRUE)

# Membuat data frame yang lebih robust
results_table <- data.frame(
  Metode = c("Data Lengkap", "Mean Imputation + Bootstrap", "MICE"),
  
  Intercept = c(
    clean_summary$estimate[1],
    boot_summary$estimate[1],
    mice_summary$estimate[1]
  ),
  
  Slope = c(
    clean_summary$estimate[2],
    boot_summary$estimate[2],
    mice_summary$estimate[2]
  ),
  
  SE_Slope = c(
    clean_summary$std.error[2],
    boot_summary$std.error[2],
    mice_summary$std.error[2]
  ),
  
  CI_Slope = c(
    sprintf("(%.3f, %.3f)", 
            clean_summary$conf.low[2],
            clean_summary$conf.high[2]),
    
    sprintf("(%.3f, %.3f)", 
            boot_ci$percent[4],
            boot_ci$percent[5]),
    
    sprintf("(%.3f, %.3f)", 
            mice_summary$`2.5 %`[2],
            mice_summary$`97.5 %`[2])   # <- FIX spasi kolom
  ),
  
  stringsAsFactors = FALSE
)

print(results_table)

##                        Metode Intercept    Slope  SE_Slope       CI_Slope
## 1                Data Lengkap  3.581084 1.412127 0.1079083 (1.198, 1.627)
## 2 Mean Imputation + Bootstrap  3.653794 1.412127 0.1191314 (1.188, 1.603)
## 3                        MICE  3.619991 1.408248 0.1068028 (1.196, 1.621)

Ketiga pendekatan menciptakan estimasi kemiringan yang sangat serupa menunjukkan bahwa keterkaitan antara variabel x dan y tampaknya tidak terpengaruh oleh adanya nilai hilang dalam jumlah kecil. Namun, perbedaan yang lebih mencolok terlihat pada estimasi intercept, di mana imputasi rata-rata memberikan nilai yang lebih tinggi karena mengganti nilai yang hilang dengan rata-rata yang mengubah distribusi data.

Dari sisi ketidakpastian, standard error pada data lengkap dan MICE relatif konsisten, sementara imputasi rata-rata menunjukkan angka yang lebih tinggi, menandakan estimasi yang lebih tidak stabil. Dalam interval kepercayaan, imputasi rata-rata justru menghasilkan rentang yang paling sempit, yang tidak sepenuhnya sejalan dengan nilai kesalahan standar yang lebih besar. Hal ini menunjukkan potensi underestimation variasi yang diakibatkan oleh imputasi sederhana serta pengaruh distribusi bootstrap yang tidak stabil.

Secara keseluruhan, metode MICE memberikan hasil yang paling homogen karena mampu mempertahankan variasi data, sedangkan imputasi rata-rata cenderung menyebabkan bias dan ketidakstabilan dalam estimasi, terutama pada intercept dan ukuran ketidakpastian.

library(ggplot2)
# Data untuk plot
results <- data.frame(
 Method = c("Data Lengkap", "Mean Imp + Bootstrap", "MICE"),
 Slope = c(1.412127, 1.412127, 1.408248),
 SE = c(0.1079083, 0.1191314, 0.1068028 ),
 CI_lower = c(1.198, 1.188, 1.196),
 CI_upper = c(1.627, 1.603, 1.621)
)
ggplot(results, aes(x = Method, y = Slope, color = Method)) +
 geom_point(size = 3) +
 geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) +
 labs(title = "Perbandingan Estimasi Slope dengan Berbagai Metode",
 y = "Estimasi Slope (y ~ x)") +
 theme_minimal()

Bootstrap pada Regresi dan Missing Value

Febriana Dwi Rahayu

2026-04-27

1. Boostrap untuk regresi (tanpa missing)

plot distribusi bootstrap

2. Estimasi pada Missing Value dengan Bootstrap

3 Multiple Imputation + Bootstrap

gabungkan hasil