# LOAD LIBRARY
library(boot)
library(mice)
## Warning: package 'mice' was built under R version 4.4.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# SIMULASI DATA
set.seed(456)
# jumlah observasi
n <- 120
# variabel x
x <- rnorm(n, mean = 12, sd = 3)
# variabel y
y <- 5 + 1.8*x + rnorm(n, mean = 0, sd = 3)
# gabungkan data
data <- data.frame(x,y)
# tambahkan missing value pada x
data[sample(1:n,15), "x"] <- NA
# lihat data
head(data)
## x y
## 1 7.969436 18.51386
## 2 13.865327 28.03350
## 3 NA 33.49535
## 4 7.833323 20.04151
## 5 9.856929 26.40986
## 6 NA 21.47109
summary(data)
## x y
## Min. : 5.236 Min. :10.26
## 1st Qu.:10.018 1st Qu.:22.73
## Median :12.207 Median :27.36
## Mean :12.153 Mean :27.02
## 3rd Qu.:14.431 3rd Qu.:31.76
## Max. :18.840 Max. :39.87
## NA's :15
# PRAKTIKUM 1: BOOTSTRAP REGRESI (DATA LENGKAP)
# hapus missing value
clean_data <- na.omit(data)
# fungsi bootstrap
boot_regression <- function(data, indices){
d <- data[indices,]
model <- lm(y ~ x, data = d)
return(coef(model))
}
# bootstrap 1000 kali
boot_result <- boot(
data = clean_data,
statistic = boot_regression,
R = 1000
)
# hasil bootstrap
boot_result
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = clean_data, statistic = boot_regression, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 5.255779 -0.027817742 1.24016720
## t2* 1.767315 0.003654091 0.09642658
# plot bootstrap
plot(boot_result)

# confidence interval slope
boot.ci(
boot_result,
type = "perc",
index = 2
)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_result, type = "perc", index = 2)
##
## Intervals :
## Level Percentile
## 95% ( 1.585, 1.965 )
## Calculations and Intervals on Original Scale
# INTERPRETASI:
# Bootstrap digunakan untuk mengetahui distribusi
# koefisien regresi dan confidence interval
# PRAKTIKUM 2: MEAN IMPUTATION + BOOTSTRAP
# hitung mean x
mean_x <- mean(data$x, na.rm = TRUE)
# imputasi mean
data$ximp <- ifelse(
is.na(data$x),
mean_x,
data$x
)
# model regresi
model_imp <- lm(y ~ ximp, data = data)
summary(model_imp)
##
## Call:
## lm(formula = y ~ ximp, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2270 -2.2998 -0.2632 2.3218 12.8551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.5404 1.4296 3.875 0.000175 ***
## ximp 1.7673 0.1147 15.409 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.481 on 118 degrees of freedom
## Multiple R-squared: 0.668, Adjusted R-squared: 0.6652
## F-statistic: 237.5 on 1 and 118 DF, p-value: < 2.2e-16
# fungsi bootstrap imputasi
boot_imp <- function(data, indices){
d <- data[indices,]
model <- lm(y ~ ximp, data = d)
return(coef(model))
}
# bootstrap
boot_result_imp <- boot(
data = data,
statistic = boot_imp,
R = 1000
)
# hasil
boot_result_imp
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = data, statistic = boot_imp, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 5.540414 0.035853077 1.3115391
## t2* 1.767315 -0.002295576 0.1023997
# CI
boot_ci <- boot.ci(
boot_result_imp,
type = "perc",
index = 2
)
boot_ci
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_result_imp, type = "perc", index = 2)
##
## Intervals :
## Level Percentile
## 95% ( 1.571, 1.981 )
## Calculations and Intervals on Original Scale
# INTERPRETASI:
# Missing value diganti dengan rata-rata x
# lalu dilakukan bootstrap kembali
# PRAKTIKUM 3: MULTIPLE IMPUTATION (MICE)
imp <- mice(
data[,c("x","y")],
m = 5,
method = "pmm",
seed = 456
)
##
## iter imp variable
## 1 1 x
## 1 2 x
## 1 3 x
## 1 4 x
## 1 5 x
## 2 1 x
## 2 2 x
## 2 3 x
## 2 4 x
## 2 5 x
## 3 1 x
## 3 2 x
## 3 3 x
## 3 4 x
## 3 5 x
## 4 1 x
## 4 2 x
## 4 3 x
## 4 4 x
## 4 5 x
## 5 1 x
## 5 2 x
## 5 3 x
## 5 4 x
## 5 5 x
# model tiap imputasi
model_mice <- with(
imp,
lm(y ~ x)
)
# gabungkan hasil
mice_summary <- summary(
pool(model_mice),
conf.int = TRUE
)
mice_summary
## term estimate std.error statistic df p.value 2.5 %
## 1 (Intercept) 5.489190 1.17403860 4.675476 89.67545 1.029672e-05 3.156642
## 2 x 1.754618 0.09293205 18.880651 89.91011 4.864461e-33 1.569989
## 97.5 % conf.low conf.high
## 1 7.821737 3.156642 7.821737
## 2 1.939246 1.569989 1.939246
# INTERPRETASI:
# MICE lebih baik dibanding mean imputation
# karena mempertahankan variasi data
# PERBANDINGAN HASIL
# model data lengkap
model_clean <- lm(y ~ x, data = clean_data)
clean_summary <- tidy(
model_clean,
conf.int = TRUE
)
# model mean imputasi
boot_summary <- tidy(
model_imp,
conf.int = TRUE
)
# tabel perbandingan
results_table <- data.frame(
Metode = c(
"Data Lengkap",
"Mean Imputation + Bootstrap",
"MICE"
),
Intercept = c(
clean_summary$estimate[1],
boot_summary$estimate[1],
mice_summary$estimate[1]
),
Slope = c(
clean_summary$estimate[2],
boot_summary$estimate[2],
mice_summary$estimate[2]
),
SE_Slope = c(
clean_summary$std.error[2],
boot_summary$std.error[2],
mice_summary$std.error[2]
),
CI_Slope = c(
paste0(
"(",
round(clean_summary$conf.low[2],3),
", ",
round(clean_summary$conf.high[2],3),
")"
),
paste0(
"(",
round(boot_ci$percent[4],3),
", ",
round(boot_ci$percent[5],3),
")"
),
paste0(
"(",
round(mice_summary$`2.5 %`[2],3),
", ",
round(mice_summary$`97.5 %`[2],3),
")"
)
)
)
results_table
## Metode Intercept Slope SE_Slope CI_Slope
## 1 Data Lengkap 5.255779 1.767315 0.09638160 (1.576, 1.958)
## 2 Mean Imputation + Bootstrap 5.540414 1.767315 0.11469062 (1.571, 1.981)
## 3 MICE 5.489190 1.754618 0.09293205 (1.57, 1.939)
# INTERPRETASI:
# Dibandingkan hasil ketiga metode:
# 1. Data lengkap
# 2. Mean imputation + bootstrap
# 3. MICE
# VISUALISASI
results <- data.frame(
Method = c(
"Data Lengkap",
"Mean Imputation",
"MICE"
),
Slope = c(
clean_summary$estimate[2],
boot_summary$estimate[2],
mice_summary$estimate[2]
),
SE = c(
clean_summary$std.error[2],
boot_summary$std.error[2],
mice_summary$std.error[2]
),
CI_lower = c(
clean_summary$conf.low[2],
boot_ci$percent[4],
mice_summary$`2.5 %`[2]
),
CI_upper = c(
clean_summary$conf.high[2],
boot_ci$percent[5],
mice_summary$`97.5 %`[2]
)
)
ggplot(results,
aes(x=Method,
y=Slope,
color=Method))+
geom_point(size=3)+
geom_errorbar(
aes(
ymin=CI_lower,
ymax=CI_upper
),
width=0.2
)+
labs(
title="Perbandingan Estimasi Slope",
y="Nilai Slope"
)+
theme_minimal()

# KESIMPULAN
cat("1. Ketiga metode menghasilkan slope yang relatif mirip\n")
## 1. Ketiga metode menghasilkan slope yang relatif mirip
cat("2. Mean imputation cenderung lebih sederhana namun bisa bias\n")
## 2. Mean imputation cenderung lebih sederhana namun bisa bias
cat("3. MICE lebih baik karena mempertahankan variasi data\n")
## 3. MICE lebih baik karena mempertahankan variasi data
cat("4. Bootstrap membantu mengestimasi ketidakpastian parameter\n")
## 4. Bootstrap membantu mengestimasi ketidakpastian parameter