Praktikum

Dataset Simulasi

set.seed(123)
# Jumlah observasi
n <- 100
n

## [1] 100

# Generate variabel x dari distribusi normal (mean=10, sd=2)
x <- rnorm(n, mean = 10, sd = 2)
x

##   [1]  8.879049  9.539645 13.117417 10.141017 10.258575 13.430130 10.921832
##   [8]  7.469878  8.626294  9.108676 12.448164 10.719628 10.801543 10.221365
##  [15]  8.888318 13.573826 10.995701  6.066766 11.402712  9.054417  7.864353
##  [22]  9.564050  7.947991  8.542218  8.749921  6.626613 11.675574 10.306746
##  [29]  7.723726 12.507630 10.852928  9.409857 11.790251 11.756267 11.643162
##  [36] 11.377281 11.107835  9.876177  9.388075  9.239058  8.610586  9.584165
##  [43]  7.469207 14.337912 12.415924  7.753783  9.194230  9.066689 11.559930
##  [50]  9.833262 10.506637  9.942906  9.914259 12.737205  9.548458 13.032941
##  [57]  6.902494 11.169227 10.247708 10.431883 10.759279  8.995353  9.333585
##  [64]  7.962849  7.856418 10.607057 10.896420 10.106008 11.844535 14.100169
##  [71]  9.017938  5.381662 12.011477  8.581598  8.623983 12.051143  9.430454
##  [78]  7.558565 10.362607  9.722217 10.011528 10.770561  9.258680 11.288753
##  [85]  9.559027 10.663564 12.193678 10.870363  9.348137 12.297615 11.987008
##  [92] 11.096794 10.477463  8.744188 12.721305  8.799481 14.374666 13.065221
##  [99]  9.528599  7.947158

# Generate variabel y dengan pola hubungan linear terhadap x plus error
y <- 3 + 1.5 * x + rnorm(n, mean = 0, sd = 2)
y

##   [1] 14.89776 17.82323 22.18274 17.51644 16.48463 23.05514 17.81294 10.86893
##   [9] 15.17899 18.50101 20.52155 20.29537 15.96655 18.22092 17.37129 23.96305
##  [17] 19.70490 10.81874 18.40466 14.53337 15.03182 15.45113 13.94087 15.30114
##  [25] 19.81261 11.63602 20.98413 18.61604 12.66188 21.61883 22.16849 18.01779
##  [33] 20.76784 19.78941 16.35825 22.32860 16.74047 19.29416 20.90032 13.97080
##  [41] 17.31945 16.85185 11.05952 21.47753 18.42081 13.56886 13.86783 17.97587
##  [49] 24.54011 15.17583 20.33543 19.45244 18.53579 20.08905 17.08378 21.98862
##  [57] 14.47972 19.00896 20.32551 17.89866 21.24434 14.39468 14.48007 21.42635
##  [65] 13.95091 19.50704 20.61777 17.19145 21.80053 24.88818 16.09615 11.20308
##  [73] 20.94908 20.12930 14.45330 18.88472 17.22126 14.95881 19.41696 16.66660
##  [81] 15.89064 21.68221 16.18872 18.20210 16.86598 18.60099 23.51036 19.47502
##  [89] 18.53031 20.44784 21.40940 18.99582 18.90536 14.32556 19.46035 20.19365
##  [97] 25.76342 20.09529 16.07057 12.54978

# Gabungkan menjadi data frame
data <- data.frame(x, y)
data

##             x        y
## 1    8.879049 14.89776
## 2    9.539645 17.82323
## 3   13.117417 22.18274
## 4   10.141017 17.51644
## 5   10.258575 16.48463
## 6   13.430130 23.05514
## 7   10.921832 17.81294
## 8    7.469878 10.86893
## 9    8.626294 15.17899
## 10   9.108676 18.50101
## 11  12.448164 20.52155
## 12  10.719628 20.29537
## 13  10.801543 15.96655
## 14  10.221365 18.22092
## 15   8.888318 17.37129
## 16  13.573826 23.96305
## 17  10.995701 19.70490
## 18   6.066766 10.81874
## 19  11.402712 18.40466
## 20   9.054417 14.53337
## 21   7.864353 15.03182
## 22   9.564050 15.45113
## 23   7.947991 13.94087
## 24   8.542218 15.30114
## 25   8.749921 19.81261
## 26   6.626613 11.63602
## 27  11.675574 20.98413
## 28  10.306746 18.61604
## 29   7.723726 12.66188
## 30  12.507630 21.61883
## 31  10.852928 22.16849
## 32   9.409857 18.01779
## 33  11.790251 20.76784
## 34  11.756267 19.78941
## 35  11.643162 16.35825
## 36  11.377281 22.32860
## 37  11.107835 16.74047
## 38   9.876177 19.29416
## 39   9.388075 20.90032
## 40   9.239058 13.97080
## 41   8.610586 17.31945
## 42   9.584165 16.85185
## 43   7.469207 11.05952
## 44  14.337912 21.47753
## 45  12.415924 18.42081
## 46   7.753783 13.56886
## 47   9.194230 13.86783
## 48   9.066689 17.97587
## 49  11.559930 24.54011
## 50   9.833262 15.17583
## 51  10.506637 20.33543
## 52   9.942906 19.45244
## 53   9.914259 18.53579
## 54  12.737205 20.08905
## 55   9.548458 17.08378
## 56  13.032941 21.98862
## 57   6.902494 14.47972
## 58  11.169227 19.00896
## 59  10.247708 20.32551
## 60  10.431883 17.89866
## 61  10.759279 21.24434
## 62   8.995353 14.39468
## 63   9.333585 14.48007
## 64   7.962849 21.42635
## 65   7.856418 13.95091
## 66  10.607057 19.50704
## 67  10.896420 20.61777
## 68  10.106008 17.19145
## 69  11.844535 21.80053
## 70  14.100169 24.88818
## 71   9.017938 16.09615
## 72   5.381662 11.20308
## 73  12.011477 20.94908
## 74   8.581598 20.12930
## 75   8.623983 14.45330
## 76  12.051143 18.88472
## 77   9.430454 17.22126
## 78   7.558565 14.95881
## 79  10.362607 19.41696
## 80   9.722217 16.66660
## 81  10.011528 15.89064
## 82  10.770561 21.68221
## 83   9.258680 16.18872
## 84  11.288753 18.20210
## 85   9.559027 16.86598
## 86  10.663564 18.60099
## 87  12.193678 23.51036
## 88  10.870363 19.47502
## 89   9.348137 18.53031
## 90  12.297615 20.44784
## 91  11.987008 21.40940
## 92  11.096794 18.99582
## 93  10.477463 18.90536
## 94   8.744188 14.32556
## 95  12.721305 19.46035
## 96   8.799481 20.19365
## 97  14.374666 25.76342
## 98  13.065221 20.09529
## 99   9.528599 16.07057
## 100  7.947158 12.54978

# Introduksi missing value secara acak pada 10 observasi x
data[sample(1:n, 10), "x"] <- NA
# Lihat 6 baris pertama
head(data)

##           x        y
## 1  8.879049 14.89776
## 2  9.539645 17.82323
## 3 13.117417 22.18274
## 4 10.141017 17.51644
## 5 10.258575 16.48463
## 6 13.430130 23.05514

Penjelasan: – set.seed(123) menjamin hasil random yang konsisten – rnorm() menghasilkan data dari distribusi normal – Hubungan antara y dan x sengaja dibuat linear (y = 3 + 1.5x + error) – sample() memilih 10 baris secara acak untuk dijadikan NA

Praktikum 1: Bootstrap untuk Regresi (tanpamissing)

# Hapus baris yang mengandung NA
clean_data <- na.omit(data)

# Fungsi untuk bootstrap regresi
boot_regression <- function(data, indices) {
# Ambil sampel bootstrap sesuai indices
 d <- data[indices, ]
# Fit model regresi linear
 model <- lm(y ~ x, data = d)
# Return koefisien model
return(coef(model))
}

# Load library boot
library(boot)

## Warning: package 'boot' was built under R version 4.4.3

# Lakukan bootstrap dengan 1000 replikasi
boot_result <- boot(
 data = clean_data,
 statistic = boot_regression,
 R = 1000
)
# Tampilkan hasil
boot_result

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = clean_data, statistic = boot_regression, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original      bias    std. error
## t1* 3.581084  0.06067069   1.1482885
## t2* 1.412127 -0.00547455   0.1074228

# Plot distribusi bootstrap
plot(boot_result)

# Hitung confidence interval 95% untuk koefisien x (index=2)
boot.ci(boot_result, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.176,  1.596 )  
## Calculations and Intervals on Original Scale

Penjelasan: – na.omit() menghapus baris dengan missing values – Fungsi boot_regression: — indices menentukan sampel yang diambil — lm() melakukan regresi linear — coef() mengambil koefisien model – boot() menjalankan bootstrap dengan: — data: dataset bersih — statistic: fungsi yang di-bootstrap — R: jumlah replikasi – boot.ci() menghitung interval kepercayaan percentile

Praktikum 2: Estimasi pada Missing Value dengan Bootstrap

Kita akan menggunakan mean imputation + bootstrap:

# Hitung mean x (abaikan NA)
mean_x <- mean(data$x, na.rm = TRUE)

# Buat variabel baru dengan imputasi mean
data$ximp <- ifelse(is.na(data$x), mean_x, data$x)

# Fit model setelah imputasi
model_imp <- lm(y ~ ximp, data = data)
summary(model_imp)

## 
## Call:
## lm(formula = y ~ ximp, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1153 -1.4394 -0.0902  1.2053  6.5280 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6538     1.2332   2.963  0.00383 ** 
## ximp          1.4121     0.1191  11.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.109 on 98 degrees of freedom
## Multiple R-squared:  0.5891, Adjusted R-squared:  0.5849 
## F-statistic: 140.5 on 1 and 98 DF,  p-value: < 2.2e-16

# Fungsi bootstrap setelah imputasi
boot_imp <- function(data, indices) {
 d <- data[indices, ]
 model <- lm(y ~ ximp, data = d)
return(coef(model))
}

# Jalankan bootstrap
boot_result_imp <- boot(data = data, statistic = boot_imp, R = 1000)
# Hasil
boot_result_imp

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = data, statistic = boot_imp, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 3.653794  0.053055397   1.1350004
## t2* 1.412127 -0.005093136   0.1064137

plot(boot_result_imp)

boot.ci(boot_result_imp, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_result_imp, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.188,  1.603 )  
## Calculations and Intervals on Original Scale

Catatan Penting: – mean(na.rm=TRUE) menghitung mean tanpa NA – ifelse() mengganti NA dengan mean – Model setelah imputasi cenderung underestimate variance – Proses bootstrap sama seperti sebelumnya tetapi menggunakan data yang sudah diimputasi – Mean imputation bisa mengurangi variabilitas → bias underestimation. – Lebih canggih: gunakan Multiple Imputation + Bootstrap (dengan mice package)

Praktikum 3: Multiple Imputation + Bootstrap

library(mice)

## Warning: package 'mice' was built under R version 4.4.3

## 
## Attaching package: 'mice'

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

# Lakukan multiple imputation (m=5) dengan Predictive Mean Matching
imp <- mice(
 data[, c("x", "y")],
 m = 5,
 method = 'pmm',
 seed = 123
)

## 
##  iter imp variable
##   1   1  x
##   1   2  x
##   1   3  x
##   1   4  x
##   1   5  x
##   2   1  x
##   2   2  x
##   2   3  x
##   2   4  x
##   2   5  x
##   3   1  x
##   3   2  x
##   3   3  x
##   3   4  x
##   3   5  x
##   4   1  x
##   4   2  x
##   4   3  x
##   4   4  x
##   4   5  x
##   5   1  x
##   5   2  x
##   5   3  x
##   5   4  x
##   5   5  x

# Gabungkan dataset imputasi dalam long format
imp_data <- complete(imp, "long")
# Fit model di setiap dataset imputasi dan gabungkan hasilnya
model_mi <- with(imp, lm(y ~ x))
summary(pool(model_mi))

##          term estimate std.error statistic       df      p.value
## 1 (Intercept) 3.619991 1.1112706  3.257524 78.99385 1.657655e-03
## 2           x 1.408248 0.1068028 13.185496 78.10532 1.472407e-21

Gabungan Hasil

# Pastikan semua package sudah terinstall
library(boot)
library(broom)

## Warning: package 'broom' was built under R version 4.4.3

# 1. Model Data Lengkap
model_clean <- lm(y ~ x, data = clean_data)
clean_summary <- tidy(model_clean, conf.int = TRUE)
model_clean

## 
## Call:
## lm(formula = y ~ x, data = clean_data)
## 
## Coefficients:
## (Intercept)            x  
##       3.581        1.412

clean_summary

## # A tibble: 2 × 7
##   term        estimate std.error statistic  p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)     3.58     1.12       3.20 1.91e- 3     1.36      5.80
## 2 x               1.41     0.108     13.1  2.34e-22     1.20      1.63

# 2. Model Mean Imputation + Bootstrap
# Asumsi boot_result_imp sudah dibuat sebelumnya
boot_ci <- boot.ci(boot_result_imp, type = "perc", index = 2)
boot_summary <- tidy(model_imp, conf.int = TRUE)

# 3. Model MICE
model_mice <- with(imp, lm(y ~ x))
mice_summary <- summary(pool(model_mice), conf.int = TRUE)
mice_summary

##          term estimate std.error statistic       df      p.value    2.5 %
## 1 (Intercept) 3.619991 1.1112706  3.257524 78.99385 1.657655e-03 1.408059
## 2           x 1.408248 0.1068028 13.185496 78.10532 1.472407e-21 1.195625
##     97.5 % conf.low conf.high
## 1 5.831922 1.408059  5.831922
## 2 1.620872 1.195625  1.620872

# Membuat data frame yang lebih robust
results_table <- data.frame(
 Metode = c("Data Lengkap", "Mean Imputation + Bootstrap", "MICE"),
 Intercept = c(
 clean_summary$estimate[1],
 boot_summary$estimate[1],
 mice_summary$estimate[1]
 ),
 Slope = c(
 clean_summary$estimate[2],
 boot_summary$estimate[2],
 mice_summary$estimate[2]
 ),
 SE_Slope = c(
 clean_summary$std.error[2],
 boot_summary$std.error[2],
 mice_summary$std.error[2]
 ),
 CI_Slope = c(
 sprintf("(%.3f, %.3f)", clean_summary$conf.low[2], clean_summary$conf.high[2]), sprintf("(%.3f, %.3f)", boot_ci$percent[4], boot_ci$percent[5]),
 sprintf("(%.3f, %.3f)", mice_summary$`2.5 %`[2], mice_summary$`97.5 %`[2]) ),
 stringsAsFactors = FALSE
)

# Tampilkan hasil
print(results_table)

##                        Metode Intercept    Slope  SE_Slope       CI_Slope
## 1                Data Lengkap  3.581084 1.412127 0.1079083 (1.198, 1.627)
## 2 Mean Imputation + Bootstrap  3.653794 1.412127 0.1191314 (1.188, 1.603)
## 3                        MICE  3.619991 1.408248 0.1068028 (1.196, 1.621)

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.2

# Data untuk plot
results <- data.frame(
 Method = c("Data Lengkap", "Mean Imp + Bootstrap", "MICE"),
 Slope = c(1.412127, 1.412127, 1.408248),
 SE = c(0.1079083, 0.1191314, 0.1068028 ),
 CI_lower = c(1.198, 1.188, 1.196),
 CI_upper = c(1.627, 1.603, 1.621)
)
ggplot(results, aes(x = Method, y = Slope, color = Method)) +
 geom_point(size = 3) +
 geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) +
 labs(title = "Perbandingan Estimasi Slope dengan Berbagai Metode",
 y = "Estimasi Slope (y ~ x)") +
 theme_minimal()

Analisis Perbedaan Estimasi

Estimasi Slope (Koefisien x) Konsistensi Nilai: – Ketiga metode menghasilkan slope yang sangat mirip (~1.41) – Perbedaan <0.004 (hanya 0.3% variasi) – Indikasi bahwa pola missing tidak terlalu memengaruhi hubungan x-y Perbedaan Kecil: – MICE memberikan slope paling rendah (1.408) – Data lengkap dan mean imputation sama (1.412)
Estimasi Intercept Variasi Lebih Nyata: – Rentang nilai: 3.581 (data lengkap) hingga 3.654 (mean imputation) – Perbedaan ~0.073 (2% dari nilai intercept) – Mean imputation menghasilkan intercept tertinggi, mungkin karena imputasi mean cenderung menarikintercept ke arah rata-rata dan sedikit bias karena pengisian nilai konstan.
Standard Error (SE) Slope Konsistensi: – SE data lengkap (0.108) vs MICE (0.107) sangat mirip – Mean imputation memiliki SE lebih besar (0.119), mencerminkan ketidakpastian dari imputasi sederhana dan sesuai ekspektasi teori karena mean imputation meremehkan variabilitas
Confidence Interval (CI) Lebar CI: – Data lengkap: 0.429 (1.627-1.198) – Mean imputation: 0.415 (1.603-1.188) – MICE: 0.425 (1.621-1.196) Pola Unik: Mean imputation memiliki CI paling sempit (bertentangan dengan teori), kemungkinan penyebab:– Jumlah bootstrap tidak cukup (misal hanya 100 iterasi) – Missing values sedikit (<10%) sehingga dampak imputasi minimal – Data missing completely at random (MCAR)

Week 12 - Bootstrap pada Regresi danMissing Value

Farid

2025-07-01

Bootstrap dalam Regresi & Estimasi MissingValue

Pendahuluan

Konsep Dasar Bootstrap