set.seed(123)
n <- 100
x <- rnorm(n, mean = 10, sd = 2)
y <- 3 + 1.5 * x + rnorm(n, mean = 0, sd = 2)
data <- data.frame(x, y)
data[sample(1:n, 10), "x"] <- NA
head(data)
## x y
## 1 8.879049 14.89776
## 2 9.539645 17.82323
## 3 13.117417 22.18274
## 4 10.141017 17.51644
## 5 10.258575 16.48463
## 6 13.430130 23.05514
# Data Cleaning
clean_data <- na.omit(data)
# Bootstrap Regresi
boot_regression <- function(data, indices) {
d <- data[indices, ]
model <- lm(y ~ x, data = d)
return(coef(model))
}
# Bootstrap 1000 replikasi
library(boot)
boot_result <- boot(
data = clean_data,
statistic = boot_regression,
R = 1000
)
boot_result
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = clean_data, statistic = boot_regression, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 3.581084 0.06067069 1.1482885
## t2* 1.412127 -0.00547455 0.1074228
# Plot distribusi bootstrap
plot(boot_result)
# Hitung confidence interval 95% untuk koefisien x (index=2)
boot.ci(boot_result, type = "perc", index = 2)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_result, type = "perc", index = 2)
##
## Intervals :
## Level Percentile
## 95% ( 1.176, 1.596 )
## Calculations and Intervals on Original Scale
mean_x <- mean(data$x, na.rm = TRUE)
data$ximp <- ifelse(is.na(data$x), mean_x, data$x)
model_imp <- lm(y ~ ximp, data = data)
summary(model_imp)
##
## Call:
## lm(formula = y ~ ximp, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1153 -1.4394 -0.0902 1.2053 6.5280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6538 1.2332 2.963 0.00383 **
## ximp 1.4121 0.1191 11.854 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.109 on 98 degrees of freedom
## Multiple R-squared: 0.5891, Adjusted R-squared: 0.5849
## F-statistic: 140.5 on 1 and 98 DF, p-value: < 2.2e-16
# Fungsi bootstrap setelah imputasi
boot_imp <- function(data, indices) {
d <- data[indices, ]
model <- lm(y ~ ximp, data = d)
return(coef(model))
}
boot_result_imp <- boot(data = data, statistic = boot_imp, R = 1000)
boot_result_imp
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = data, statistic = boot_imp, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 3.653794 0.053055397 1.1350004
## t2* 1.412127 -0.005093136 0.1064137
plot(boot_result_imp)
boot.ci(boot_result_imp, type = "perc", index = 2)
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_result_imp, type = "perc", index = 2)
##
## Intervals :
## Level Percentile
## 95% ( 1.188, 1.603 )
## Calculations and Intervals on Original Scale
library(mice)
## Warning: package 'mice' was built under R version 4.5.2
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Lakukan multiple imputation (m=5) dengan Predictive Mean Matching
imp <- mice(
data[, c("x", "y")],
m = 5,
method = 'pmm',
seed = 123
)
##
## iter imp variable
## 1 1 x
## 1 2 x
## 1 3 x
## 1 4 x
## 1 5 x
## 2 1 x
## 2 2 x
## 2 3 x
## 2 4 x
## 2 5 x
## 3 1 x
## 3 2 x
## 3 3 x
## 3 4 x
## 3 5 x
## 4 1 x
## 4 2 x
## 4 3 x
## 4 4 x
## 4 5 x
## 5 1 x
## 5 2 x
## 5 3 x
## 5 4 x
## 5 5 x
imp_data <- complete(imp, "long")
model_mi <- with(imp, lm(y ~ x))
summary(pool(model_mi))
## term estimate std.error statistic df p.value
## 1 (Intercept) 3.619991 1.1112706 3.257524 78.99385 1.657655e-03
## 2 x 1.408248 0.1068028 13.185496 78.10532 1.472407e-21
library(mice)
library(broom)
## Warning: package 'broom' was built under R version 4.5.1
model_clean <- lm(y ~ x, data = clean_data)
clean_summary <- tidy(model_clean, conf.int = TRUE)
# Model Mean Imputation + Bootstrap
boot_ci <- boot.ci(boot_result_imp, type = "perc", index = 2)
boot_summary <- tidy(model_imp, conf.int = TRUE)
# Model MICE
model_mice <- with(imp, lm(y ~ x))
mice_summary <- summary(pool(model_mice), conf.int = TRUE)
# Membuat data frame yang lebih robust
results_table <- data.frame(
Metode = c("Data Lengkap", "Mean Imputation + Bootstrap", "MICE"),
Intercept = c(
clean_summary$estimate[1],
boot_summary$estimate[1],
mice_summary$estimate[1]
),
Slope = c(
clean_summary$estimate[2],
boot_summary$estimate[2],
mice_summary$estimate[2]
),
SE_Slope = c(
clean_summary$std.error[2],
boot_summary$std.error[2],
mice_summary$std.error[2]
),
CI_Slope = c(
sprintf("(%.3f, %.3f)", clean_summary$conf.low[2], clean_summary$conf.high[2]),
sprintf("(%.3f, %.3f)", boot_ci$percent[4], boot_ci$percent[5]),
sprintf("(%.3f, %.3f)", mice_summary$`2.5 %`[2], mice_summary$`97.5 %`[2])
),
stringsAsFactors = FALSE
)
print(results_table)
## Metode Intercept Slope SE_Slope CI_Slope
## 1 Data Lengkap 3.581084 1.412127 0.1079083 (1.198, 1.627)
## 2 Mean Imputation + Bootstrap 3.653794 1.412127 0.1191314 (1.188, 1.603)
## 3 MICE 3.619991 1.408248 0.1068028 (1.196, 1.621)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
# Data untuk plot
results <- data.frame(
Method = c("Data Lengkap", "Mean Imp + Bootstrap", "MICE"),
Slope = c(1.412127, 1.412127, 1.408248),
SE = c(0.1079083, 0.1191314, 0.1068028 ),
CI_lower = c(1.198, 1.188, 1.196),
CI_upper = c(1.627, 1.603, 1.621)
)
ggplot(results, aes(x = Method, y = Slope, color = Method)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper), width = 0.2) +
labs(title = "Perbandingan Estimasi Slope dengan Berbagai Metode",
y = "Estimasi Slope (y ~ x)") +
theme_minimal()
Berdasarkan hasil analisis, ketiga metode menghasilkan slope yang sangat mirip (~ 1.41), dengan perbedaaan < 0.004 (hanya 0.3% variasi). Hal ini mengindikasikan bahwa pola missing value tidak terlalu memengaruhi hubungan x dan y. Selain itu, MICE memberikan slope paling rendaah dengan data yang lengkap dan mean imputation sama (1.412).
Dari hasil analisis didapat rentang nilai dari 3.581 (data lengkap) hingga 3.654 (mean imputation), dengan perbedaan ~ 0.073 (2% dari nilai intercept). Selain itu mean imputation menghasilkan intercept ke arah rata-rata dan sedikit bias karena pengisian nilai konstan.
SE data lengkap (0.108) dengan MICE (0.107) menghasilkan hasil yang sangat mirip. Mean imputation memiliki SE lebih besar (0.119), mencerminkan ketidakpastian dari impputasi sederhana dan sesuai ekspektasi teori karena mean imputation meremehkan variabilitas.
Pada data lengkap lebar CI sebesar 0.429 (1.627 - 1.198), pada mean imputation sebesar 0.415 (1.603 - 1.188), sedangkan pada MICE lebar intervalnya sebesar 0.425 (1.621 - 1.196).
Hal ini menunjukkan bahwa mean imputation memiliki CI paling sempit (bertentangan dengan teori), kemungkinan penyebabnya sebagai berikut:
Jumlah bootstrap tidak cukup banyak (misal hanya 100 iterasi)
Mising values sedikit (< 10%) sehingga dampak imputasi minimal
Data missing completely at random (MCAR)