## For data manipulation
library(magrittr)
## For data pooling
library(mitools)
## Joint MVN MI
library(Amelia)
library(norm)
## Conditional MI
library(mice)
library(mi)
library(smcfcs)
## Sample size
N <- 1000
## Number of variables fully observed
n_full <- 5
## Number of variables partially observed
n_partial <- 5
## Imputation count
M <- 20
rdf <- mi::rdata.frame(N = N, restriction = "MARish",
n_full = n_full, n_partial = n_partial)
dataTrue <- rdf$true[,Filter(f = function(elt) {grepl("x_|y_", elt)}, x = names(rdf$true))]
dataMiss <- rdf$obs
## Formula for outcome model
form <- sprintf("%s ~ %s",
tail(names(dataMiss), 1),
paste0(head(names(dataMiss), -1), collapse = " + "))
form
## [1] "y_5 ~ x_1 + x_2 + x_3 + x_4 + x_5 + y_1 + y_2 + y_3 + y_4"
mice::md.pattern(dataMiss)
## x_1 x_2 x_3 x_4 x_5 y_1 y_2 y_3 y_4 y_5
## 232 1 1 1 1 1 1 1 1 1 1 0
## 73 1 1 1 1 1 0 1 1 1 1 1
## 135 1 1 1 1 1 1 0 1 1 1 1
## 55 1 1 1 1 1 1 1 0 1 1 1
## 15 1 1 1 1 1 1 1 1 0 1 1
## 107 1 1 1 1 1 1 1 1 1 0 1
## 5 1 1 1 1 1 0 0 1 1 1 2
## 43 1 1 1 1 1 1 0 0 1 1 2
## 86 1 1 1 1 1 0 1 1 0 1 2
## 4 1 1 1 1 1 1 0 1 0 1 2
## 58 1 1 1 1 1 1 1 0 0 1 2
## 33 1 1 1 1 1 0 1 1 1 0 2
## 34 1 1 1 1 1 1 0 1 1 0 2
## 25 1 1 1 1 1 1 1 0 1 0 2
## 4 1 1 1 1 1 1 1 1 0 0 2
## 2 1 1 1 1 1 0 0 1 0 1 3
## 26 1 1 1 1 1 0 1 0 0 1 3
## 16 1 1 1 1 1 1 0 0 0 1 3
## 8 1 1 1 1 1 1 0 0 1 0 3
## 20 1 1 1 1 1 0 1 1 0 0 3
## 11 1 1 1 1 1 1 1 0 0 0 3
## 5 1 1 1 1 1 0 1 0 0 0 4
## 3 1 1 1 1 1 1 0 0 0 0 4
## 0 0 0 0 0 250 250 250 250 250 1250
resTrue <- lm(as.formula(form), data = dataTrue)
summary(resTrue)
##
## Call:
## lm(formula = as.formula(form), data = dataTrue)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.216306 -0.040276 0.000597 0.039962 0.179276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.001836 0.001926 -0.953 0.341
## x_1 -0.157158 0.002709 -58.023 <2e-16 ***
## x_2 0.683393 0.002463 277.481 <2e-16 ***
## x_3 1.493350 0.003584 416.646 <2e-16 ***
## x_4 -1.330457 0.003444 -386.356 <2e-16 ***
## x_5 -0.091419 0.002975 -30.733 <2e-16 ***
## y_1 0.385594 0.002618 147.271 <2e-16 ***
## y_2 -0.128578 0.002409 -53.380 <2e-16 ***
## y_3 -0.100157 0.002839 -35.274 <2e-16 ***
## y_4 0.636762 0.002538 250.845 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06066 on 990 degrees of freedom
## Multiple R-squared: 0.9963, Adjusted R-squared: 0.9963
## F-statistic: 2.971e+04 on 9 and 990 DF, p-value: < 2.2e-16
resCc <- lm(as.formula(form), data = dataMiss)
summary(resCc)
##
## Call:
## lm(formula = as.formula(form), data = dataMiss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.161620 -0.046663 -0.002747 0.039296 0.156178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.001060 0.005065 -0.209 0.834
## x_1 -0.166233 0.006602 -25.180 <2e-16 ***
## x_2 0.682594 0.005985 114.059 <2e-16 ***
## x_3 1.499849 0.008267 181.421 <2e-16 ***
## x_4 -1.339962 0.007543 -177.633 <2e-16 ***
## x_5 -0.096700 0.007035 -13.746 <2e-16 ***
## y_1 0.387735 0.005787 66.999 <2e-16 ***
## y_2 -0.126898 0.004911 -25.841 <2e-16 ***
## y_3 -0.110169 0.006668 -16.521 <2e-16 ***
## y_4 0.640946 0.005374 119.260 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06294 on 222 degrees of freedom
## (768 observations deleted due to missingness)
## Multiple R-squared: 0.9959, Adjusted R-squared: 0.9958
## F-statistic: 6043 on 9 and 222 DF, p-value: < 2.2e-16
timeAmelia <- system.time(dataAmelia <- amelia(dataMiss, m = M, p2s = 0))
dataAmelia
##
## Amelia output with 20 imputed datasets.
## Return code: 1
## Message: Normal EM convergence.
##
## Chain Lengths:
## --------------
## Imputation 1: 24
## Imputation 2: 18
## Imputation 3: 18
## Imputation 4: 21
## Imputation 5: 19
## Imputation 6: 19
## Imputation 7: 17
## Imputation 8: 13
## Imputation 9: 17
## Imputation 10: 16
## Imputation 11: 18
## Imputation 12: 18
## Imputation 13: 16
## Imputation 14: 17
## Imputation 15: 18
## Imputation 16: 17
## Imputation 17: 18
## Imputation 18: 17
## Imputation 19: 17
## Imputation 20: 16
## Using Zelig
library(Zelig)
resAmelia <- zelig(as.formula(form), model = "normal", data = dataAmelia)
## How to cite this model in Zelig:
## R Core Team. 2008.
## normal: Normal Regression for Continuous Dependent Variables
## in Christine Choirat, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
## "Zelig: Everyone's Statistical Software," http://zeligproject.org/
summary(resAmelia)
## Model: Combined Imputations
## Estimate Std.Error z value Pr(>|z|)
## (Intercept) -0.0001467 0.004719 -0.03108 0.9752
## x_1 -0.1613421 0.005723 -28.19392 0.0000 ***
## x_2 0.6803595 0.004323 157.36423 0.0000 ***
## x_3 1.4909804 0.008648 172.41187 0.0000 ***
## x_4 -1.3334185 0.006353 -209.87435 0.0000 ***
## x_5 -0.0902134 0.005857 -15.40349 0.0000 ***
## y_1 0.3865692 0.005802 66.62126 0.0000 ***
## y_2 -0.1297830 0.003602 -36.02745 0.0000 ***
## y_3 -0.1010767 0.006220 -16.24925 0.0000 ***
## y_4 0.6375901 0.004321 147.54152 0.0000 ***
## ---
## Signif. codes: '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## For results from individual imputed datasets, use summary(x, subset = i:j)
## Next step: Use 'setx' method
## Data extraction is easier with mitools
resAmelia <- with(mitools::imputationList(dataAmelia$imputations),
lm(as.formula(form)))
tabAmelia <- mitools::MIcombine(resAmelia)
s <- prelim.norm(as.matrix(dataMiss))
## Need this to run da.norm and imp.norm
rngseed(201512060)
## EM algorithm for initial parameters of MVN (deterministic)
resEm <- em.norm(s)
## Iterations of EM:
## 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...16...17...18...19...20...21...22...23...24...25...26...27...28...29...30...31...32...33...34...35...
## Can check MVN parameters by EM
## getparam.norm(s, resEm)
## Set IP algorithm step count
R <- 1000
## IP algorithm
timeNorm <- system.time(dataNorm <- lapply(seq_len(M), function(i) {
## IP algorithm (Impute data, estimate MVN Parameter, repeat; MCMC)
## Last P step value returns as MVN parameters (after R steps)
theta <- da.norm(s = s, start = resEm, steps = R)
## Using these MVN parameters, impute a dataset
## Here only one data draw from one MCMC chain (could draw multiple)
out <- imp.norm(s = s, theta = theta, x = as.matrix(dataMiss))
## Convert to data frame
as.data.frame(out)
}))
resNorm <- with(mitools::imputationList(dataNorm),
lm(as.formula(form)))
tabNorm <- mitools::MIcombine(resNorm)
tabNorm
## Multiple imputation results:
## with(mitools::imputationList(dataNorm), lm(as.formula(form)))
## MIcombine.default(resNorm)
## results se
## (Intercept) -0.00103876 0.004027416
## x_1 -0.16128723 0.005786933
## x_2 0.67968191 0.005181312
## x_3 1.49121783 0.007861090
## x_4 -1.33377092 0.007756583
## x_5 -0.09191798 0.005433026
## y_1 0.38492362 0.004812603
## y_2 -0.13127145 0.004940625
## y_3 -0.10159513 0.006093926
## y_4 0.63688309 0.005503139
timeMice <- system.time(dataMice <- mice::mice(dataMiss, m = M, printFlag = F))
dataMice
## Multiply imputed data set
## Call:
## mice::mice(data = dataMiss, m = M, printFlag = F)
## Number of multiple imputations: 20
## Missing cells per column:
## x_1 x_2 x_3 x_4 x_5 y_1 y_2 y_3 y_4 y_5
## 0 0 0 0 0 250 250 250 250 250
## Imputation methods:
## x_1 x_2 x_3 x_4 x_5 y_1 y_2 y_3 y_4 y_5
## "" "" "" "" "" "pmm" "pmm" "pmm" "pmm" "pmm"
## VisitSequence:
## y_1 y_2 y_3 y_4 y_5
## 6 7 8 9 10
## PredictorMatrix:
## x_1 x_2 x_3 x_4 x_5 y_1 y_2 y_3 y_4 y_5
## x_1 0 0 0 0 0 0 0 0 0 0
## x_2 0 0 0 0 0 0 0 0 0 0
## x_3 0 0 0 0 0 0 0 0 0 0
## x_4 0 0 0 0 0 0 0 0 0 0
## x_5 0 0 0 0 0 0 0 0 0 0
## y_1 1 1 1 1 1 0 1 1 1 1
## y_2 1 1 1 1 1 1 0 1 1 1
## y_3 1 1 1 1 1 1 1 0 1 1
## y_4 1 1 1 1 1 1 1 1 0 1
## y_5 1 1 1 1 1 1 1 1 1 0
## Random generator seed value: NA
resMice <- with(dataMice, lm(as.formula(form)))
tabMice <- summary(mice::pool(resMice))
tabMice
## est se t df Pr(>|t|) lo 95 hi 95 nmis fmi
## (Intercept) -0.002342495 0.005988786 -0.3911469 22.75653 6.993269e-01 -0.01473858 0.01005359 NA 0.8558587
## x_1 -0.158720505 0.006516930 -24.3551047 29.90492 0.000000e+00 -0.17203163 -0.14540938 0 0.7630718
## x_2 0.677321987 0.007026246 96.3988441 24.68727 0.000000e+00 0.66284186 0.69180211 0 0.8289777
## x_3 1.484550838 0.011408691 130.1245528 22.31368 0.000000e+00 1.46090993 1.50819175 0 0.8621363
## x_4 -1.329162460 0.010174346 -130.6386195 23.75164 0.000000e+00 -1.35017290 -1.30815202 0 0.8418816
## x_5 -0.089351739 0.007423622 -12.0361380 27.83217 1.510791e-12 -0.10456247 -0.07414101 0 0.7879284
## y_1 0.381373465 0.006012563 63.4294300 30.02586 0.000000e+00 0.36909462 0.39365231 250 0.7616760
## y_2 -0.126830572 0.005301079 -23.9254254 33.98035 0.000000e+00 -0.13760389 -0.11605725 250 0.7191721
## y_3 -0.097828575 0.007649303 -12.7892145 25.17170 1.636469e-12 -0.11357716 -0.08207999 250 0.8224085
## y_4 0.634109031 0.004255176 149.0206283 76.86407 0.000000e+00 0.62563565 0.64258241 250 0.4735005
## lambda
## (Intercept) 0.8437238
## x_1 0.7477391
## x_2 0.8156620
## x_3 0.8503094
## x_4 0.8291052
## x_5 0.7732176
## y_1 0.7463130
## y_2 0.7031158
## y_3 0.8088372
## y_4 0.4599770
timeMi <- system.time(dataMi <- mi::mi(y = mi::missing_data.frame(dataMiss), n.chains = M))
dataMi
## Object of class mi with 20 chains, each with 30 iterations.
## Each chain is the evolution of an object of missing_data.frame class with 1000 observations on 10 variables.
resMi <- mi::pool(as.formula(form), data = dataMi)
resMi
## bayesglm(formula = as.formula(form), data = dataMi)
## coef.est coef.se
## (Intercept) 0.00 0.00
## x_1 -0.16 0.01
## x_2 0.68 0.00
## x_3 1.49 0.01
## x_4 -1.33 0.01
## x_5 -0.09 0.01
## y_1 0.39 0.00
## y_2 -0.13 0.00
## y_3 -0.10 0.01
## y_4 0.64 0.00
## n = 990, k = 10
## residual deviance = 4.0, null deviance = 988.2 (difference = 984.2)
## overdispersion parameter = 0.0
## residual sd is sqrt(overdispersion) = 0.06
## Create method specification
smcfcsMethod <- names(dataMiss) %>%
gsub("x_.*", "", .) %>%
gsub("y_.*", "norm", .)
## Very last one is outcome. Need ""
smcfcsMethod[length(smcfcsMethod)] <- ""
timeSmcfcs <- system.time(dataSmcfcs <-
smcfcs::smcfcs(dataMiss, smtype = "lm",
smformula = as.formula(form),
method = smcfcsMethod, m = M))
## Warning in smcfcs::smcfcs(dataMiss, smtype = "lm", smformula = as.formula(form), : Rejection sampleing failed 231
## times (across all variables, iterations, and imputations). You may want to increase the rejection sampling limit.
resSmcfcs <- with(mitools::imputationList(dataSmcfcs$impDatasets),
lm(as.formula(form)))
tabSmcfcs <- mitools::MIcombine(resSmcfcs)
tabSmcfcs
## Multiple imputation results:
## with(mitools::imputationList(dataSmcfcs$impDatasets), lm(as.formula(form)))
## MIcombine.default(resSmcfcs)
## results se
## (Intercept) -0.005561043 0.006813337
## x_1 -0.156006021 0.007861880
## x_2 0.677715464 0.006764725
## x_3 1.483505584 0.011616464
## x_4 -1.329263343 0.010005087
## x_5 -0.090318758 0.005804712
## y_1 0.383751629 0.007379530
## y_2 -0.130400992 0.005141435
## y_3 -0.093636949 0.007256798
## y_4 0.636653622 0.007567952
nomiss is the results from the no missing data true dataset. The coefficients should be similar to these true values. The variances have to be larger than these because of uncertainty introduced by missingness.
## Point estimates
data.frame(nomiss = coef(resTrue),
cc = coef(resCc),
norm = coef(tabNorm),
amelia = coef(tabAmelia),
mice = tabMice[,"est"],
mi = coef(resMi),
smcfcs = coef(tabSmcfcs))
## nomiss cc norm amelia mice mi smcfcs
## (Intercept) -0.001835507 -0.001059939 -0.00103876 -0.0001466606 -0.002342495 -0.004612066 -0.005561043
## x_1 -0.157157733 -0.166233070 -0.16128723 -0.1613421065 -0.158720505 -0.160742326 -0.156006021
## x_2 0.683393025 0.682593597 0.67968191 0.6803595017 0.677321987 0.675871572 0.677715464
## x_3 1.493350458 1.499849412 1.49121783 1.4909804054 1.484550838 1.490109100 1.483505584
## x_4 -1.330457327 -1.339962195 -1.33377092 -1.3334185168 -1.329162460 -1.332507703 -1.329263343
## x_5 -0.091418760 -0.096700073 -0.09191798 -0.0902134099 -0.089351739 -0.094012980 -0.090318758
## y_1 0.385593717 0.387734643 0.38492362 0.3865692085 0.381373465 0.385896874 0.383751629
## y_2 -0.128577687 -0.126897720 -0.13127145 -0.1297829821 -0.126830572 -0.129714825 -0.130400992
## y_3 -0.100156860 -0.110169297 -0.10159513 -0.1010767298 -0.097828575 -0.103624698 -0.093636949
## y_4 0.636762259 0.640945579 0.63688309 0.6375900957 0.634109031 0.638588044 0.636653622
## SEs
data.frame(nomiss = coef(summary(resTrue))[,2],
cc = coef(summary(resCc))[,2],
amelia = sqrt(diag(tabAmelia$variance)),
norm = sqrt(diag(tabNorm$variance)),
mice = tabMice[,"se"],
mi = resMi@ses,
smcfcs = sqrt(diag(tabSmcfcs$variance)))
## nomiss cc amelia norm mice mi smcfcs
## (Intercept) 0.001925637 0.005064875 0.004719164 0.004027416 0.005988786 0.004693637 0.006813337
## x_1 0.002708522 0.006601763 0.005722585 0.005786933 0.006516930 0.005638183 0.007861880
## x_2 0.002462845 0.005984586 0.004323470 0.005181312 0.007026246 0.004480013 0.006764725
## x_3 0.003584221 0.008267254 0.008647783 0.007861090 0.011408691 0.007390177 0.011616464
## x_4 0.003443609 0.007543435 0.006353413 0.007756583 0.010174346 0.007229597 0.010005087
## x_5 0.002974575 0.007034794 0.005856685 0.005433026 0.007423622 0.006626966 0.005804712
## y_1 0.002618255 0.005787154 0.005802490 0.004812603 0.006012563 0.004863533 0.007379530
## y_2 0.002408740 0.004910731 0.003602336 0.004940625 0.005301079 0.004008586 0.005141435
## y_3 0.002839412 0.006668293 0.006220393 0.006093926 0.007649303 0.005927852 0.007256798
## y_4 0.002538469 0.005374366 0.004321428 0.005503139 0.004255176 0.004446172 0.007567952
## Running time
data.frame(amelia = timeAmelia[1:3],
norm = timeNorm[1:3],
mice = timeMice[1:3],
mi = timeMi[1:3],
smcfcs = timeSmcfcs[1:3])
## amelia norm mice mi smcfcs
## user.self 1.319 6.468 6.874 0.486 98.844
## sys.self 0.016 0.142 0.123 0.054 0.953
## elapsed 1.342 6.644 7.033 43.992 100.150