MI Examples

References

Load packages

## For data manipulation
library(magrittr)
## For data pooling
library(mitools)

## Joint MVN MI
library(Amelia)
library(norm)
## Conditional MI
library(mice)
library(mi)
library(smcfcs)

Configure

## Sample size
N         <- 1000
## Number of variables fully observed
n_full    <- 5
## Number of variables partially observed
n_partial <- 5
## Imputation count
M         <- 20

Generate MAR data

rdf <- mi::rdata.frame(N = N, restriction = "MARish",
                       n_full = n_full, n_partial = n_partial)
dataTrue <- rdf$true[,Filter(f = function(elt) {grepl("x_|y_", elt)}, x = names(rdf$true))]
dataMiss <- rdf$obs

Create outcome model formula

## Formula for outcome model
form <- sprintf("%s ~ %s",
                tail(names(dataMiss), 1),
                paste0(head(names(dataMiss), -1), collapse = " + "))
form

## [1] "y_5 ~ x_1 + x_2 + x_3 + x_4 + x_5 + y_1 + y_2 + y_3 + y_4"

Examine missing data patterns

mice::md.pattern(dataMiss)

##     x_1 x_2 x_3 x_4 x_5 y_1 y_2 y_3 y_4 y_5     
## 232   1   1   1   1   1   1   1   1   1   1    0
##  73   1   1   1   1   1   0   1   1   1   1    1
## 135   1   1   1   1   1   1   0   1   1   1    1
##  55   1   1   1   1   1   1   1   0   1   1    1
##  15   1   1   1   1   1   1   1   1   0   1    1
## 107   1   1   1   1   1   1   1   1   1   0    1
##   5   1   1   1   1   1   0   0   1   1   1    2
##  43   1   1   1   1   1   1   0   0   1   1    2
##  86   1   1   1   1   1   0   1   1   0   1    2
##   4   1   1   1   1   1   1   0   1   0   1    2
##  58   1   1   1   1   1   1   1   0   0   1    2
##  33   1   1   1   1   1   0   1   1   1   0    2
##  34   1   1   1   1   1   1   0   1   1   0    2
##  25   1   1   1   1   1   1   1   0   1   0    2
##   4   1   1   1   1   1   1   1   1   0   0    2
##   2   1   1   1   1   1   0   0   1   0   1    3
##  26   1   1   1   1   1   0   1   0   0   1    3
##  16   1   1   1   1   1   1   0   0   0   1    3
##   8   1   1   1   1   1   1   0   0   1   0    3
##  20   1   1   1   1   1   0   1   1   0   0    3
##  11   1   1   1   1   1   1   1   0   0   0    3
##   5   1   1   1   1   1   0   1   0   0   0    4
##   3   1   1   1   1   1   1   0   0   0   0    4
##       0   0   0   0   0 250 250 250 250 250 1250

True data analysis taking last variable as outcome

resTrue <- lm(as.formula(form), data = dataTrue)
summary(resTrue)

## 
## Call:
## lm(formula = as.formula(form), data = dataTrue)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.216306 -0.040276  0.000597  0.039962  0.179276 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -0.001836   0.001926   -0.953    0.341    
## x_1         -0.157158   0.002709  -58.023   <2e-16 ***
## x_2          0.683393   0.002463  277.481   <2e-16 ***
## x_3          1.493350   0.003584  416.646   <2e-16 ***
## x_4         -1.330457   0.003444 -386.356   <2e-16 ***
## x_5         -0.091419   0.002975  -30.733   <2e-16 ***
## y_1          0.385594   0.002618  147.271   <2e-16 ***
## y_2         -0.128578   0.002409  -53.380   <2e-16 ***
## y_3         -0.100157   0.002839  -35.274   <2e-16 ***
## y_4          0.636762   0.002538  250.845   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06066 on 990 degrees of freedom
## Multiple R-squared:  0.9963, Adjusted R-squared:  0.9963 
## F-statistic: 2.971e+04 on 9 and 990 DF,  p-value: < 2.2e-16

Complete case analysis dropping observations with missing

resCc <- lm(as.formula(form), data = dataMiss)
summary(resCc)

## 
## Call:
## lm(formula = as.formula(form), data = dataMiss)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.161620 -0.046663 -0.002747  0.039296  0.156178 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) -0.001060   0.005065   -0.209    0.834    
## x_1         -0.166233   0.006602  -25.180   <2e-16 ***
## x_2          0.682594   0.005985  114.059   <2e-16 ***
## x_3          1.499849   0.008267  181.421   <2e-16 ***
## x_4         -1.339962   0.007543 -177.633   <2e-16 ***
## x_5         -0.096700   0.007035  -13.746   <2e-16 ***
## y_1          0.387735   0.005787   66.999   <2e-16 ***
## y_2         -0.126898   0.004911  -25.841   <2e-16 ***
## y_3         -0.110169   0.006668  -16.521   <2e-16 ***
## y_4          0.640946   0.005374  119.260   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06294 on 222 degrees of freedom
##   (768 observations deleted due to missingness)
## Multiple R-squared:  0.9959, Adjusted R-squared:  0.9958 
## F-statistic:  6043 on 9 and 222 DF,  p-value: < 2.2e-16

Amelia

timeAmelia <- system.time(dataAmelia <- amelia(dataMiss, m = M, p2s = 0))
dataAmelia

## 
## Amelia output with 20 imputed datasets.
## Return code:  1 
## Message:  Normal EM convergence. 
## 
## Chain Lengths:
## --------------
## Imputation 1:  24
## Imputation 2:  18
## Imputation 3:  18
## Imputation 4:  21
## Imputation 5:  19
## Imputation 6:  19
## Imputation 7:  17
## Imputation 8:  13
## Imputation 9:  17
## Imputation 10:  16
## Imputation 11:  18
## Imputation 12:  18
## Imputation 13:  16
## Imputation 14:  17
## Imputation 15:  18
## Imputation 16:  17
## Imputation 17:  18
## Imputation 18:  17
## Imputation 19:  17
## Imputation 20:  16

## Using Zelig
library(Zelig)
resAmelia <- zelig(as.formula(form), model = "normal", data = dataAmelia)

## How to cite this model in Zelig:
##   R Core Team. 2008.
##   normal: Normal Regression for Continuous Dependent Variables
##   in Christine Choirat, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
##   "Zelig: Everyone's Statistical Software," http://zeligproject.org/

summary(resAmelia)

## Model: Combined Imputations 
##               Estimate Std.Error    z value Pr(>|z|)    
## (Intercept) -0.0001467  0.004719   -0.03108   0.9752    
## x_1         -0.1613421  0.005723  -28.19392   0.0000 ***
## x_2          0.6803595  0.004323  157.36423   0.0000 ***
## x_3          1.4909804  0.008648  172.41187   0.0000 ***
## x_4         -1.3334185  0.006353 -209.87435   0.0000 ***
## x_5         -0.0902134  0.005857  -15.40349   0.0000 ***
## y_1          0.3865692  0.005802   66.62126   0.0000 ***
## y_2         -0.1297830  0.003602  -36.02745   0.0000 ***
## y_3         -0.1010767  0.006220  -16.24925   0.0000 ***
## y_4          0.6375901  0.004321  147.54152   0.0000 ***
## ---
## Signif. codes:  '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## For results from individual imputed datasets, use summary(x, subset = i:j)
## Next step: Use 'setx' method

## Data extraction is easier with mitools
resAmelia <- with(mitools::imputationList(dataAmelia$imputations),
                  lm(as.formula(form)))
tabAmelia <- mitools::MIcombine(resAmelia)

norm

s <- prelim.norm(as.matrix(dataMiss))

## Need this to run da.norm and imp.norm
rngseed(201512060)
## EM algorithm for initial parameters of MVN (deterministic)
resEm <- em.norm(s)

## Iterations of EM: 
## 1...2...3...4...5...6...7...8...9...10...11...12...13...14...15...16...17...18...19...20...21...22...23...24...25...26...27...28...29...30...31...32...33...34...35...

## Can check MVN parameters by EM
## getparam.norm(s, resEm)

## Set IP algorithm step count
R <- 1000

## IP algorithm
timeNorm <- system.time(dataNorm <- lapply(seq_len(M), function(i) {
    ## IP algorithm (Impute data, estimate MVN Parameter, repeat; MCMC)
    ## Last P step value returns as MVN parameters (after R steps)
    theta <- da.norm(s = s, start = resEm, steps = R)
    ## Using these MVN parameters, impute a dataset
    ## Here only one data draw from one MCMC chain (could draw multiple)
    out <- imp.norm(s = s, theta = theta, x = as.matrix(dataMiss))
    ## Convert to data frame
    as.data.frame(out)
}))


resNorm <- with(mitools::imputationList(dataNorm),
                lm(as.formula(form)))
tabNorm <- mitools::MIcombine(resNorm)
tabNorm

## Multiple imputation results:
##       with(mitools::imputationList(dataNorm), lm(as.formula(form)))
##       MIcombine.default(resNorm)
##                 results          se
## (Intercept) -0.00103876 0.004027416
## x_1         -0.16128723 0.005786933
## x_2          0.67968191 0.005181312
## x_3          1.49121783 0.007861090
## x_4         -1.33377092 0.007756583
## x_5         -0.09191798 0.005433026
## y_1          0.38492362 0.004812603
## y_2         -0.13127145 0.004940625
## y_3         -0.10159513 0.006093926
## y_4          0.63688309 0.005503139

mice

timeMice <- system.time(dataMice <- mice::mice(dataMiss, m = M, printFlag = F))
dataMice

## Multiply imputed data set
## Call:
## mice::mice(data = dataMiss, m = M, printFlag = F)
## Number of multiple imputations:  20
## Missing cells per column:
## x_1 x_2 x_3 x_4 x_5 y_1 y_2 y_3 y_4 y_5 
##   0   0   0   0   0 250 250 250 250 250 
## Imputation methods:
##   x_1   x_2   x_3   x_4   x_5   y_1   y_2   y_3   y_4   y_5 
##    ""    ""    ""    ""    "" "pmm" "pmm" "pmm" "pmm" "pmm" 
## VisitSequence:
## y_1 y_2 y_3 y_4 y_5 
##   6   7   8   9  10 
## PredictorMatrix:
##     x_1 x_2 x_3 x_4 x_5 y_1 y_2 y_3 y_4 y_5
## x_1   0   0   0   0   0   0   0   0   0   0
## x_2   0   0   0   0   0   0   0   0   0   0
## x_3   0   0   0   0   0   0   0   0   0   0
## x_4   0   0   0   0   0   0   0   0   0   0
## x_5   0   0   0   0   0   0   0   0   0   0
## y_1   1   1   1   1   1   0   1   1   1   1
## y_2   1   1   1   1   1   1   0   1   1   1
## y_3   1   1   1   1   1   1   1   0   1   1
## y_4   1   1   1   1   1   1   1   1   0   1
## y_5   1   1   1   1   1   1   1   1   1   0
## Random generator seed value:  NA

resMice <- with(dataMice, lm(as.formula(form)))
tabMice <- summary(mice::pool(resMice))
tabMice

##                      est          se            t       df     Pr(>|t|)       lo 95       hi 95 nmis       fmi
## (Intercept) -0.002342495 0.005988786   -0.3911469 22.75653 6.993269e-01 -0.01473858  0.01005359   NA 0.8558587
## x_1         -0.158720505 0.006516930  -24.3551047 29.90492 0.000000e+00 -0.17203163 -0.14540938    0 0.7630718
## x_2          0.677321987 0.007026246   96.3988441 24.68727 0.000000e+00  0.66284186  0.69180211    0 0.8289777
## x_3          1.484550838 0.011408691  130.1245528 22.31368 0.000000e+00  1.46090993  1.50819175    0 0.8621363
## x_4         -1.329162460 0.010174346 -130.6386195 23.75164 0.000000e+00 -1.35017290 -1.30815202    0 0.8418816
## x_5         -0.089351739 0.007423622  -12.0361380 27.83217 1.510791e-12 -0.10456247 -0.07414101    0 0.7879284
## y_1          0.381373465 0.006012563   63.4294300 30.02586 0.000000e+00  0.36909462  0.39365231  250 0.7616760
## y_2         -0.126830572 0.005301079  -23.9254254 33.98035 0.000000e+00 -0.13760389 -0.11605725  250 0.7191721
## y_3         -0.097828575 0.007649303  -12.7892145 25.17170 1.636469e-12 -0.11357716 -0.08207999  250 0.8224085
## y_4          0.634109031 0.004255176  149.0206283 76.86407 0.000000e+00  0.62563565  0.64258241  250 0.4735005
##                lambda
## (Intercept) 0.8437238
## x_1         0.7477391
## x_2         0.8156620
## x_3         0.8503094
## x_4         0.8291052
## x_5         0.7732176
## y_1         0.7463130
## y_2         0.7031158
## y_3         0.8088372
## y_4         0.4599770

mi

timeMi <- system.time(dataMi <- mi::mi(y = mi::missing_data.frame(dataMiss), n.chains = M))
dataMi

## Object of class mi with 20 chains, each with 30 iterations.
## Each chain is the evolution of an object of missing_data.frame class with 1000 observations on 10 variables.

resMi <- mi::pool(as.formula(form), data = dataMi)
resMi

## bayesglm(formula = as.formula(form), data = dataMi)
##             coef.est coef.se
## (Intercept)  0.00     0.00  
## x_1         -0.16     0.01  
## x_2          0.68     0.00  
## x_3          1.49     0.01  
## x_4         -1.33     0.01  
## x_5         -0.09     0.01  
## y_1          0.39     0.00  
## y_2         -0.13     0.00  
## y_3         -0.10     0.01  
## y_4          0.64     0.00  
## n = 990, k = 10
## residual deviance = 4.0, null deviance = 988.2 (difference = 984.2)
## overdispersion parameter = 0.0
## residual sd is sqrt(overdispersion) = 0.06

smcfcs

## Create method specification
smcfcsMethod <- names(dataMiss) %>%
    gsub("x_.*", "", .) %>%
    gsub("y_.*", "norm", .)
## Very last one is outcome. Need ""
smcfcsMethod[length(smcfcsMethod)] <- ""

timeSmcfcs <- system.time(dataSmcfcs <-
                              smcfcs::smcfcs(dataMiss, smtype = "lm",
                                             smformula = as.formula(form),
                                             method = smcfcsMethod, m = M))

## Warning in smcfcs::smcfcs(dataMiss, smtype = "lm", smformula = as.formula(form), : Rejection sampleing failed 231
## times (across all variables, iterations, and imputations). You may want to increase the rejection sampling limit.

resSmcfcs <- with(mitools::imputationList(dataSmcfcs$impDatasets),
                  lm(as.formula(form)))
tabSmcfcs <- mitools::MIcombine(resSmcfcs)
tabSmcfcs

## Multiple imputation results:
##       with(mitools::imputationList(dataSmcfcs$impDatasets), lm(as.formula(form)))
##       MIcombine.default(resSmcfcs)
##                  results          se
## (Intercept) -0.005561043 0.006813337
## x_1         -0.156006021 0.007861880
## x_2          0.677715464 0.006764725
## x_3          1.483505584 0.011616464
## x_4         -1.329263343 0.010005087
## x_5         -0.090318758 0.005804712
## y_1          0.383751629 0.007379530
## y_2         -0.130400992 0.005141435
## y_3         -0.093636949 0.007256798
## y_4          0.636653622 0.007567952

Comparison

nomiss is the results from the no missing data true dataset. The coefficients should be similar to these true values. The variances have to be larger than these because of uncertainty introduced by missingness.

## Point estimates
data.frame(nomiss = coef(resTrue),
           cc     = coef(resCc),
           norm   = coef(tabNorm),
           amelia = coef(tabAmelia),
           mice   = tabMice[,"est"],
           mi     = coef(resMi),
           smcfcs = coef(tabSmcfcs))

##                   nomiss           cc        norm        amelia         mice           mi       smcfcs
## (Intercept) -0.001835507 -0.001059939 -0.00103876 -0.0001466606 -0.002342495 -0.004612066 -0.005561043
## x_1         -0.157157733 -0.166233070 -0.16128723 -0.1613421065 -0.158720505 -0.160742326 -0.156006021
## x_2          0.683393025  0.682593597  0.67968191  0.6803595017  0.677321987  0.675871572  0.677715464
## x_3          1.493350458  1.499849412  1.49121783  1.4909804054  1.484550838  1.490109100  1.483505584
## x_4         -1.330457327 -1.339962195 -1.33377092 -1.3334185168 -1.329162460 -1.332507703 -1.329263343
## x_5         -0.091418760 -0.096700073 -0.09191798 -0.0902134099 -0.089351739 -0.094012980 -0.090318758
## y_1          0.385593717  0.387734643  0.38492362  0.3865692085  0.381373465  0.385896874  0.383751629
## y_2         -0.128577687 -0.126897720 -0.13127145 -0.1297829821 -0.126830572 -0.129714825 -0.130400992
## y_3         -0.100156860 -0.110169297 -0.10159513 -0.1010767298 -0.097828575 -0.103624698 -0.093636949
## y_4          0.636762259  0.640945579  0.63688309  0.6375900957  0.634109031  0.638588044  0.636653622

## SEs
data.frame(nomiss = coef(summary(resTrue))[,2],
           cc     = coef(summary(resCc))[,2],
           amelia = sqrt(diag(tabAmelia$variance)),
           norm   = sqrt(diag(tabNorm$variance)),
           mice   = tabMice[,"se"],
           mi     = resMi@ses,
           smcfcs = sqrt(diag(tabSmcfcs$variance)))

##                  nomiss          cc      amelia        norm        mice          mi      smcfcs
## (Intercept) 0.001925637 0.005064875 0.004719164 0.004027416 0.005988786 0.004693637 0.006813337
## x_1         0.002708522 0.006601763 0.005722585 0.005786933 0.006516930 0.005638183 0.007861880
## x_2         0.002462845 0.005984586 0.004323470 0.005181312 0.007026246 0.004480013 0.006764725
## x_3         0.003584221 0.008267254 0.008647783 0.007861090 0.011408691 0.007390177 0.011616464
## x_4         0.003443609 0.007543435 0.006353413 0.007756583 0.010174346 0.007229597 0.010005087
## x_5         0.002974575 0.007034794 0.005856685 0.005433026 0.007423622 0.006626966 0.005804712
## y_1         0.002618255 0.005787154 0.005802490 0.004812603 0.006012563 0.004863533 0.007379530
## y_2         0.002408740 0.004910731 0.003602336 0.004940625 0.005301079 0.004008586 0.005141435
## y_3         0.002839412 0.006668293 0.006220393 0.006093926 0.007649303 0.005927852 0.007256798
## y_4         0.002538469 0.005374366 0.004321428 0.005503139 0.004255176 0.004446172 0.007567952

## Running time
data.frame(amelia = timeAmelia[1:3],
           norm   = timeNorm[1:3],
           mice   = timeMice[1:3],
           mi     = timeMi[1:3],
           smcfcs = timeSmcfcs[1:3])

##           amelia  norm  mice     mi  smcfcs
## user.self  1.319 6.468 6.874  0.486  98.844
## sys.self   0.016 0.142 0.123  0.054   0.953
## elapsed    1.342 6.644 7.033 43.992 100.150

Top Page: http://rpubs.com/kaz_yos/
github: https://github.com/kaz-yos