Simulation Dataset
set.seed(1)
n <- 50
x <- rpois(n, lambda = 0.5)
y <- 2 + 4 * x + rpois(n, lambda = 0.1)
data <- data.frame(x,y)
data[sample(1:n, 10), "x"] <- NA
clean <- na.omit(data)
Original Model
model_valid <- lm(y ~ x, data = clean)
clean_coef <- coef(model_valid)
clean_se <- summary(model_valid)$coefficients[,2]
clean_ci <- confint(model_valid)
Bootstrap for Regression (Without Missing Value)
library(boot)
boot_fn <- function(data, index){
d <- data[index, ]
coef(lm(y ~ x, data = d))
}
set.seed(1)
boot_clean <- boot(clean, boot_fn, R = 1000)
boot_clean_slope <- mean(boot_clean$t[,2])
boot_clean_se <- sd(boot_clean$t[,2])
boot_clean_ci <- boot.ci(boot_clean, type = "perc", index = 2)
Missing Value Estimation with Bootstrap
data$ximpute <- ifelse(is.na(data$x), mean(data$x, na.rm = TRUE), data$x)
boot_imp_fn <- function(data, index){
d <- data[index, ]
coef(lm(y ~ ximpute, data = d))
}
set.seed(1)
boot_imp <- boot(data, boot_imp_fn, R = 1000)
boot_imp_slope <- mean(boot_imp$t[,2])
boot_imp_se <- sd(boot_imp$t[,2])
boot_imp_ci <- boot.ci(boot_imp, type = "perc", index = 2)
Multiple Imputation & Booststrap
library(mice)
## Warning: package 'mice' was built under R version 4.5.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
impute <- mice(data, m = 5, printFlag = FALSE)
## Warning: Number of logged events: 1
model_mice <- with(impute, lm(y ~ x))
mice_sum <- summary(pool(model_mice), conf.int = TRUE)
mice_slope <- mice_sum$estimate[mice_sum$term == "x"]
mice_se <- mice_sum$std.error[mice_sum$term == "x"]
mice_low <- mice_sum$`2.5 %`[mice_sum$term == "x"]
mice_high <- mice_sum$`97.5 %`[mice_sum$term == "x"]
Comparison
result_table <- data.frame(
Method = c("Original", "Mean Imp + Bootstrap", "MICE"),
Slope = c(
clean_coef[2],
boot_imp_slope,
mice_slope
),
SE = c(
clean_se[2],
boot_imp_se,
mice_se
),
CI_Lower = c(
clean_ci[2,1],
boot_imp_ci$percent[4],
mice_low
),
CI_Upper = c(
clean_ci[2,2],
boot_imp_ci$percent[5],
mice_high
)
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
ggplot(result_table, aes(x = Method, y = Slope, color = Method)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = CI_Lower, ymax = CI_Upper), width = 0.2) +
labs(title = "Slope Estimation Comparison Between Methods",
y = "Slope Estimation (y ~ x)") +
theme_minimal()