# Core
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(broom)
# Imputation
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Vectors for each column
x1 <- c(2140,2016,1905,1675,1474,2239,2120,1990,1702,1487,
2107,1973,1864,1674,1440,2165,2048,1916,1658,1489,
2062,1929,1815,1595,1400,2047,1935,1807,1591,1388,
2071,1944,1831,1612,1410,2066,1954,1835,1616,1407)
x2 <- c(20640,20280,19860,18980,18100,20740,20305,19961,18916,18012,
20520,20130,19780,19020,18030,20680,20340,19860,18950,18700,
20500,20050,19680,18890,17870,20540,20160,19750,18890,17870,
20460,20010,19640,18710,17780,20520,20150,19750,18850,17910)
x3 <- c(30250,30010,29780,29330,28960,30083,29831,29604,29088,28675,
30120,29920,29720,29370,28940,30160,29960,29710,29250,28890,
30190,29960,29770,29360,28960,30160,29940,29760,29350,28910,
30180,29940,29750,29360,28900,30170,29950,29740,29320,28910)
x4 <- c(205,195,184,164,144,216,206,196,171,149,
195,190,180,161,139,208,199,187,164,145,
193,183,173,153,134,193,184,173,153,133,
198,186,178,156,136,197,188,178,156,137)
x5 <- c(1732,1697,1662,1598,1541,1709,1669,1640,1572,1522,
1740,1711,1682,1630,1572,1704,1679,1642,1576,1528,
1748,1713,1684,1624,1569,1746,1714,1679,1621,1561,
1729,1692,1667,1609,1552,1758,1729,1690,1616,1569)
x6 <- c(99,100,97,97,97,87,87,87,85,85,
101,100,100,100,101,98,96,94,94,94,
101,100,100,99,100,99,99,99,99,99,
102,101,101,101,101,100,99,99,99,100)
y <- c(4540,4315,4095,3650,3200,4833,4617,4340,3820,3368,
4445,4188,3981,3622,3125,4560,4340,4115,3630,3210,
4330,4119,3891,3467,3045,4411,4203,3968,3531,3074,
4350,4128,3940,3480,3064,4402,4180,3973,3530,3080)
# Combine into a single data frame
jet <- data.frame(y, x1, x2, x3, x4, x5, x6)
#Fit the baseline regression
full_model <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6, data = jet)
true_coef <- tidy(full_model) %>%
select(term, true = estimate)
summary(full_model)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6, data = jet)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.949 -19.028 -1.572 17.139 49.606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.738e+03 2.445e+03 -1.938 0.061213 .
## x1 1.119e+00 2.865e-01 3.904 0.000441 ***
## x2 -3.018e-02 3.823e-02 -0.789 0.435478
## x3 2.306e-01 1.180e-01 1.954 0.059231 .
## x4 3.850e+00 2.686e+00 1.433 0.161246
## x5 8.219e-01 3.508e-01 2.343 0.025298 *
## x6 -1.695e+01 2.620e+00 -6.468 2.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.51 on 33 degrees of freedom
## Multiple R-squared: 0.9977, Adjusted R-squared: 0.9972
## F-statistic: 2350 on 6 and 33 DF, p-value: < 2.2e-16
# Introduce Missingness in x3
make_missing <- function(data, prop, seed = 123) {
set.seed(seed)
n <- nrow(data)
miss_id <- sample(1:n, size = floor(prop * n))
data$x3[miss_id] <- NA
data
}
# Single Imputation Methods
impute_methods <- function(data_miss) {
# Complete case
cc <- na.omit(data_miss)
# Mean imputation
mean_imp <- data_miss
mean_imp$x3[is.na(mean_imp$x3)] <- mean(mean_imp$x3, na.rm = TRUE)
# Median imputation
median_imp <- data_miss
median_imp$x3[is.na(median_imp$x3)] <- median(median_imp$x3, na.rm = TRUE)
# Deterministic regression imputation
reg_model <- lm(x3 ~ x1 + x2 + x4 + x5 + x6, data = data_miss)
reg_imp <- data_miss
reg_imp$x3[is.na(reg_imp$x3)] <- predict(reg_model, reg_imp)[is.na(reg_imp$x3)]
# Stochastic regression imputation
resid_sd <- sd(residuals(reg_model), na.rm = TRUE)
stoch_imp <- reg_imp
stoch_imp$x3[is.na(data_miss$x3)] <-
stoch_imp$x3[is.na(data_miss$x3)] +
rnorm(sum(is.na(data_miss$x3)), 0, resid_sd)
# Hot deck imputation
hot_imp <- hotdeck(data_miss, variable = "x3")
list(
CompleteCase = cc,
Mean = mean_imp,
Median = median_imp,
RegDet = reg_imp,
RegStoch = stoch_imp,
HotDeck = hot_imp
)
}
# 2) Method-level bias summaries (attach to model_summary)
# Uses Full Data coefficients as "true"
model_metrics <- function(model) {
s <- summary(model)
data.frame(
R2 = s$r.squared,
Adj_R2 = s$adj.r.squared,
Sigma = s$sigma,
AIC = AIC(model),
BIC = BIC(model)
)
}
bias_metrics <- function(model, true_coef) {
est <- tidy(model) %>%
select(term, estimate) %>%
filter(term != "(Intercept)") %>%
left_join(true_coef, by = "term") %>%
mutate(bias = estimate - true)
data.frame(
MeanBias = mean(est$bias),
MeanAbsBias = mean(abs(est$bias)),
RMSE_Bias = sqrt(mean(est$bias^2)),
MaxAbsBias = max(abs(est$bias))
)
}
run_scenario <- function(prop_missing) {
jet_miss <- make_missing(jet, prop_missing)
imputations <- impute_methods(jet_miss)
bind_rows(lapply(names(imputations), function(m) {
model <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6,
data = imputations[[m]])
cbind(
Scenario = paste0(prop_missing * 100, "% Missing"),
Method = m,
model_metrics(model),
bias_metrics(model, true_coef)
)
}))
}
results_10 <- run_scenario(0.10)
results_30 <- run_scenario(0.30)
final_results <- bind_rows(results_10, results_30)
final_results
## Scenario Method R2 Adj_R2 Sigma AIC BIC
## 1 10% Missing CompleteCase 0.9975891 0.9970902 27.36170 348.6379 361.3061
## 2 10% Missing Mean 0.9974559 0.9969933 27.67124 387.4517 400.9627
## 3 10% Missing Median 0.9974421 0.9969771 27.74606 387.6677 401.1787
## 4 10% Missing RegDet 0.9977243 0.9973106 26.17082 382.9918 396.5029
## 5 10% Missing RegStoch 0.9976851 0.9972642 26.39565 383.6762 397.1872
## 6 10% Missing HotDeck 0.9973970 0.9969238 27.98962 388.3669 401.8779
## 7 30% Missing CompleteCase 0.9977489 0.9971057 27.88517 273.7788 284.4364
## 8 30% Missing Mean 0.9974537 0.9969907 27.68322 387.4863 400.9973
## 9 30% Missing Median 0.9974282 0.9969606 27.82156 387.8851 401.3961
## 10 30% Missing RegDet 0.9977470 0.9973373 26.04034 382.5920 396.1030
## 11 30% Missing RegStoch 0.9974668 0.9970062 27.61195 387.2801 400.7911
## 12 30% Missing HotDeck 0.9974020 0.9969296 27.96287 388.2904 401.8014
## MeanBias MeanAbsBias RMSE_Bias MaxAbsBias
## 1 -0.11970581 0.17586049 0.3039931 0.7240872
## 2 0.75664457 0.82355980 1.4767407 3.5099969
## 3 0.74615102 0.81470084 1.4610668 3.4789293
## 4 -0.21583305 0.23389951 0.4423015 1.0682639
## 5 -0.08720154 0.09530873 0.1844768 0.4456591
## 6 0.80456714 0.88021830 1.5979857 3.8248132
## 7 -0.13003493 0.20136277 0.2575820 0.5036559
## 8 0.89420143 0.97750991 1.8320232 4.4195164
## 9 0.89309589 0.97457613 1.8059815 4.3438782
## 10 -0.32492862 0.35227829 0.5736626 1.2919123
## 11 0.41010814 0.44683694 0.7957868 1.8825324
## 12 0.88220705 0.96051824 1.7340268 4.1343117