# Core
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(broom)

# Imputation
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Vectors for each column
x1 <- c(2140,2016,1905,1675,1474,2239,2120,1990,1702,1487,
        2107,1973,1864,1674,1440,2165,2048,1916,1658,1489,
        2062,1929,1815,1595,1400,2047,1935,1807,1591,1388,
        2071,1944,1831,1612,1410,2066,1954,1835,1616,1407)

x2 <- c(20640,20280,19860,18980,18100,20740,20305,19961,18916,18012,
        20520,20130,19780,19020,18030,20680,20340,19860,18950,18700,
        20500,20050,19680,18890,17870,20540,20160,19750,18890,17870,
        20460,20010,19640,18710,17780,20520,20150,19750,18850,17910)

x3 <- c(30250,30010,29780,29330,28960,30083,29831,29604,29088,28675,
        30120,29920,29720,29370,28940,30160,29960,29710,29250,28890,
        30190,29960,29770,29360,28960,30160,29940,29760,29350,28910,
        30180,29940,29750,29360,28900,30170,29950,29740,29320,28910)

x4 <- c(205,195,184,164,144,216,206,196,171,149,
        195,190,180,161,139,208,199,187,164,145,
        193,183,173,153,134,193,184,173,153,133,
        198,186,178,156,136,197,188,178,156,137)

x5 <- c(1732,1697,1662,1598,1541,1709,1669,1640,1572,1522,
        1740,1711,1682,1630,1572,1704,1679,1642,1576,1528,
        1748,1713,1684,1624,1569,1746,1714,1679,1621,1561,
        1729,1692,1667,1609,1552,1758,1729,1690,1616,1569)

x6 <- c(99,100,97,97,97,87,87,87,85,85,
        101,100,100,100,101,98,96,94,94,94,
        101,100,100,99,100,99,99,99,99,99,
        102,101,101,101,101,100,99,99,99,100)

y <- c(4540,4315,4095,3650,3200,4833,4617,4340,3820,3368,
       4445,4188,3981,3622,3125,4560,4340,4115,3630,3210,
       4330,4119,3891,3467,3045,4411,4203,3968,3531,3074,
       4350,4128,3940,3480,3064,4402,4180,3973,3530,3080)

# Combine into a single data frame
jet <- data.frame(y, x1, x2, x3, x4, x5, x6)

#Fit the baseline regression
full_model <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6, data = jet)

true_coef <- tidy(full_model) %>%
  select(term, true = estimate)

summary(full_model)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6, data = jet)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.949 -19.028  -1.572  17.139  49.606 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.738e+03  2.445e+03  -1.938 0.061213 .  
## x1           1.119e+00  2.865e-01   3.904 0.000441 ***
## x2          -3.018e-02  3.823e-02  -0.789 0.435478    
## x3           2.306e-01  1.180e-01   1.954 0.059231 .  
## x4           3.850e+00  2.686e+00   1.433 0.161246    
## x5           8.219e-01  3.508e-01   2.343 0.025298 *  
## x6          -1.695e+01  2.620e+00  -6.468 2.45e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.51 on 33 degrees of freedom
## Multiple R-squared:  0.9977, Adjusted R-squared:  0.9972 
## F-statistic:  2350 on 6 and 33 DF,  p-value: < 2.2e-16
#  Introduce Missingness in x3
make_missing <- function(data, prop, seed = 123) {
  set.seed(seed)
  n <- nrow(data)
  miss_id <- sample(1:n, size = floor(prop * n))
  data$x3[miss_id] <- NA
  data
}


# Single Imputation Methods
impute_methods <- function(data_miss) {
  
  # Complete case
  cc <- na.omit(data_miss)
  
  # Mean imputation
  mean_imp <- data_miss
  mean_imp$x3[is.na(mean_imp$x3)] <- mean(mean_imp$x3, na.rm = TRUE)
  
  # Median imputation
  median_imp <- data_miss
  median_imp$x3[is.na(median_imp$x3)] <- median(median_imp$x3, na.rm = TRUE)
  
  # Deterministic regression imputation
  reg_model <- lm(x3 ~ x1 + x2 + x4 + x5 + x6, data = data_miss)
  reg_imp <- data_miss
  reg_imp$x3[is.na(reg_imp$x3)] <- predict(reg_model, reg_imp)[is.na(reg_imp$x3)]
  
  # Stochastic regression imputation
  resid_sd <- sd(residuals(reg_model), na.rm = TRUE)
  stoch_imp <- reg_imp
  stoch_imp$x3[is.na(data_miss$x3)] <-
    stoch_imp$x3[is.na(data_miss$x3)] +
    rnorm(sum(is.na(data_miss$x3)), 0, resid_sd)
  
  # Hot deck imputation
  hot_imp <- hotdeck(data_miss, variable = "x3")
  
  list(
    CompleteCase = cc,
    Mean = mean_imp,
    Median = median_imp,
    RegDet = reg_imp,
    RegStoch = stoch_imp,
    HotDeck = hot_imp
  )
}
# 2) Method-level bias summaries (attach to model_summary)
#    Uses Full Data coefficients as "true"
model_metrics <- function(model) {
  s <- summary(model)
  data.frame(
    R2 = s$r.squared,
    Adj_R2 = s$adj.r.squared,
    Sigma = s$sigma,
    AIC = AIC(model),
    BIC = BIC(model)
  )
}

bias_metrics <- function(model, true_coef) {
  
  est <- tidy(model) %>%
    select(term, estimate) %>%
    filter(term != "(Intercept)") %>%
    left_join(true_coef, by = "term") %>%
    mutate(bias = estimate - true)
  
  data.frame(
    MeanBias = mean(est$bias),
    MeanAbsBias = mean(abs(est$bias)),
    RMSE_Bias = sqrt(mean(est$bias^2)),
    MaxAbsBias = max(abs(est$bias))
  )
}

run_scenario <- function(prop_missing) {
  
  jet_miss <- make_missing(jet, prop_missing)
  imputations <- impute_methods(jet_miss)
  
  bind_rows(lapply(names(imputations), function(m) {
    
    model <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6,
                data = imputations[[m]])
    
    cbind(
      Scenario = paste0(prop_missing * 100, "% Missing"),
      Method = m,
      model_metrics(model),
      bias_metrics(model, true_coef)
    )
  }))
}

results_10 <- run_scenario(0.10)
results_30 <- run_scenario(0.30)
final_results <- bind_rows(results_10, results_30)
final_results
##       Scenario       Method        R2    Adj_R2    Sigma      AIC      BIC
## 1  10% Missing CompleteCase 0.9975891 0.9970902 27.36170 348.6379 361.3061
## 2  10% Missing         Mean 0.9974559 0.9969933 27.67124 387.4517 400.9627
## 3  10% Missing       Median 0.9974421 0.9969771 27.74606 387.6677 401.1787
## 4  10% Missing       RegDet 0.9977243 0.9973106 26.17082 382.9918 396.5029
## 5  10% Missing     RegStoch 0.9976851 0.9972642 26.39565 383.6762 397.1872
## 6  10% Missing      HotDeck 0.9973970 0.9969238 27.98962 388.3669 401.8779
## 7  30% Missing CompleteCase 0.9977489 0.9971057 27.88517 273.7788 284.4364
## 8  30% Missing         Mean 0.9974537 0.9969907 27.68322 387.4863 400.9973
## 9  30% Missing       Median 0.9974282 0.9969606 27.82156 387.8851 401.3961
## 10 30% Missing       RegDet 0.9977470 0.9973373 26.04034 382.5920 396.1030
## 11 30% Missing     RegStoch 0.9974668 0.9970062 27.61195 387.2801 400.7911
## 12 30% Missing      HotDeck 0.9974020 0.9969296 27.96287 388.2904 401.8014
##       MeanBias MeanAbsBias RMSE_Bias MaxAbsBias
## 1  -0.11970581  0.17586049 0.3039931  0.7240872
## 2   0.75664457  0.82355980 1.4767407  3.5099969
## 3   0.74615102  0.81470084 1.4610668  3.4789293
## 4  -0.21583305  0.23389951 0.4423015  1.0682639
## 5  -0.08720154  0.09530873 0.1844768  0.4456591
## 6   0.80456714  0.88021830 1.5979857  3.8248132
## 7  -0.13003493  0.20136277 0.2575820  0.5036559
## 8   0.89420143  0.97750991 1.8320232  4.4195164
## 9   0.89309589  0.97457613 1.8059815  4.3438782
## 10 -0.32492862  0.35227829 0.5736626  1.2919123
## 11  0.41010814  0.44683694 0.7957868  1.8825324
## 12  0.88220705  0.96051824 1.7340268  4.1343117