This document explains how to include and exclude variables to be used as imputers as well as which variables with missingness are actually imputed. Most importantly, both predictorMatrix and imputation methods (obtained by a dry-run) must be manipulated.
library(tidyverse)
library(mice)
data(selfreport)
head(selfreport)
src id pop age sex hm wm hr wr prg edu etn web bm br
1 mgg 10001 NL 27 Male NA NA 190 85 <NA> Middle Autochtone No NA 23.54571
2 mgg 10002 NL 38 Male NA NA 189 93 <NA> Low Autochtone No NA 26.03511
3 mgg 10003 NL 21 Male NA NA 200 110 <NA> Low Autochtone No NA 27.50000
4 mgg 10004 NL 52 Male NA NA 176 80 <NA> Low Autochtone No NA 25.82645
5 mgg 10005 NL 44 Male NA NA 183 79 <NA> Middle Autochtone No NA 23.58984
6 mgg 10006 NL 31 Female NA NA 167 80 Not pregnant Middle Autochtone No NA 28.68514
summary(selfreport)
src id pop age sex hm wm
krul:1257 Min. :10001 NL:2060 Min. :18.00 Female:1098 Min. :143.6 Min. : 44.0
mgg : 803 1st Qu.:10612 IT: 0 1st Qu.:29.00 Male : 962 1st Qu.:166.5 1st Qu.: 66.0
Median :12302 US: 0 Median :41.00 Median :173.3 Median : 75.8
Mean :13299 Mean :42.06 Mean :174.0 Mean : 77.8
3rd Qu.:15958 3rd Qu.:54.00 3rd Qu.:181.1 3rd Qu.: 86.6
Max. :17079 Max. :75.00 Max. :207.3 Max. :149.7
NA's :803 NA's :803
hr wr prg edu etn web
Min. :143.0 Min. : 30.00 Pregnant : 0 Low : 286 Autochtone: 657 No :1858
1st Qu.:168.0 1st Qu.: 66.00 Delivery last 6 months: 0 Middle: 323 Allochtone: 146 Yes: 202
Median :175.0 Median : 75.20 Not pregnant : 403 High : 194 Unknown : 0
Mean :174.9 Mean : 77.79 NA's :1657 NA's :1257 NA's :1257
3rd Qu.:182.0 3rd Qu.: 87.00
Max. :214.0 Max. :165.00
bm br
Min. :16.83 Min. :10.76
1st Qu.:21.98 1st Qu.:22.08
Median :24.64 Median :24.71
Mean :25.68 Mean :25.39
3rd Qu.:28.12 3rd Qu.:27.67
Max. :55.17 Max. :52.08
NA's :803
## Proportion missing
gather(selfreport) %>%
group_by(key) %>%
summarize(prop_na = mean(is.na(value))) %>%
ggplot(mapping = aes(x = key, y = prop_na)) +
geom_bar(stat = "identity") +
scale_y_continuous(limits = c(0,1)) +
theme_bw() + theme(legend.key = element_blank())
Warning: attributes are not identical across measure variables; they will be dropped
## Configure parallelization
## Parallel backend for foreach (also loads foreach and parallel; includes doMC)
library(doParallel)
## Reproducible parallelization
library(doRNG)
## Detect core count
nCores <- min(parallel::detectCores(), 8)
## Used by parallel::mclapply() as default
options(mc.cores = nCores)
## Used by doParallel as default
options(cores = nCores)
## Register doParallel as the parallel backend with foreach
## http://stackoverflow.com/questions/28989855/the-difference-between-domc-and-doparallel-in-r
doParallel::registerDoParallel(cores = nCores)
## Report multicore use
cat("### Using", foreach::getDoParWorkers(), "cores\n")
cat("### Using", foreach::getDoParName(), "as backend\n")
### Using 4 cores
### Using doParallelMC as backend
Here we exclude prg from either from the imputer variables or the imputed variables. bm will be imputed, but does not participate as a imputer in the process.
cat("
###
### Missing data imputation with mice
################################################################################\n")
## Create a before-MI dataset
df_before <- selfreport
## Extract all variable names in dataset
allVars <- names(df_before)
## names of variables with missingness
missVars <- names(df_before)[colSums(is.na(df_before)) > 0]
## mice predictorMatrix
## A square matrix of size ‘ncol(data)’ containing 0/1
## data specifying the set of predictors to be used for each
## target column. Rows correspond to target variables (i.e.
## variables to be imputed), in the sequence as they appear in
## data. A value of '1' means that the column variable is used
## as a predictor for the target variable (in the rows). The
## diagonal of ‘predictorMatrix’ must be zero. The default for
## ‘predictorMatrix’ is that all other columns are used as
## predictors (sometimes called massive imputation). Note: For
## two-level imputation codes '2' and '-2' are also allowed.
##
predictorMatrix <- matrix(0, ncol = length(allVars), nrow = length(allVars))
rownames(predictorMatrix) <- allVars
colnames(predictorMatrix) <- allVars
## Avoid perfect linear dependence
## http://stats.stackexchange.com/questions/127104/r-mice-imputation-failing
## Too many weights issue in nnet called by mice
## http://stackoverflow.com/questions/28551633/error-in-r-mice-package-too-many-weights
cat("
### Specify Variables informing imputation\n")
## These can be either complete variables or variables with missingness.
## Those with missingness must be imputed.
## Explicitly specify.
imputerVars <- c("src","age","sex","wm","hr","wr","edu","etn","web")
## Keep variables that actually exist in dataset
imputerVars <- intersect(unique(imputerVars), allVars)
imputerVars
imputerMatrix <- predictorMatrix
imputerMatrix[,imputerVars] <- 1
imputerMatrix
cat("
### Specify variables with missingness to be imputed \n")
## Could specify additional variables that are imputed,
## but does not inform imputation.
imputedOnlyVars <- c("bm")
## Imputers that have missingness must be imputed.
imputedVars <- intersect(unique(c(imputedOnlyVars, imputerVars)), missVars)
imputedVars
imputedMatrix <- predictorMatrix
imputedMatrix[imputedVars,] <- 1
imputedMatrix
cat("
### Construct a full predictor matrix (rows: imputed variables; cols: imputer variables)\n")
## Keep correct imputer-imputed pairs only
predictorMatrix <- imputerMatrix * imputedMatrix
## Diagonals must be zeros (a variable cannot impute itself)
diag(predictorMatrix) <- 0
predictorMatrix
cat("
### Dry-run mice for imputation methods\n")
dryMice <- mice(data = df_before, m = 1, predictorMatrix = predictorMatrix, maxit = 0)
## Update predictor matrix
predictorMatrix <- dryMice$predictorMatrix
cat("### Imputers (non-zero columns of predictorMatrix)\n")
imputerVars <- colnames(predictorMatrix)[colSums(predictorMatrix) > 0]
imputerVars
cat("### Imputed (non-zero rows of predictorMatrix)\n")
imputedVars <- rownames(predictorMatrix)[rowSums(predictorMatrix) > 0]
imputedVars
cat("### Imputers that are complete\n")
setdiff(imputerVars, imputedVars)
cat("### Imputers with missingness\n")
intersect(imputerVars, imputedVars)
cat("### Imputed-only variables without being imputers\n")
setdiff(imputedVars, imputerVars)
cat("### Variables with missingness that are not imputed\n")
setdiff(missVars, imputedVars)
cat("### Relevant part of predictorMatrix\n")
predictorMatrix[rowSums(predictorMatrix) > 0, colSums(predictorMatrix) > 0]
## Empty imputation method to really exclude variables
## http://www.stefvanbuuren.nl/publications/MICE%20in%20R%20-%20Draft.pdf
##
## MICE will automatically skip imputation of variables that are complete.
## One of the problems in previous versions of MICE was that all incomplete
## data needed to be imputed. In MICE 2.0 it is possible to skip imputation
## of selected incomplete variables by specifying the empty method "".
## This works as long as the incomplete variable that is skipped is not being
## used as a predictor for imputing other variables.
## Note: puttting zeros in the predictorMatrix alone is NOT enough!
##
dryMice$method[setdiff(allVars, imputedVars)] <- ""
cat("### Methods used for imputation\n")
dryMice$method[sapply(dryMice$method, nchar) > 0]
cat("
### Run mice\n")
M <- 4
cat("### Imputing", M, "times\n")
## Set seed for reproducibility
set.seed(3561126)
## Parallelized execution
miceout <- foreach(i = seq_len(M), .combine = ibind) %dorng% {
cat("### Started iteration", i, "\n")
miceout <- mice(data = df_before, m = 1, print = TRUE,
predictorMatrix = predictorMatrix, method = dryMice$method,
MaxNWts = 2000)
cat("### Completed iteration", i, "\n")
## Make sure to return the output
miceout
}
cat("
### Show mice results\n")
## mice object ifself
miceout
## Variables that no longer have missingness after imputation
cat("### Variables actually imputed\n")
actuallyImputedVars <-
setdiff(names(df_before)[colSums(is.na(df_before)) > 0],
names(complete(miceout, action = 1))[colSums(is.na(complete(miceout, action = 1))) > 0])
actuallyImputedVars
## Examine discrepancies
cat("### Variables that were unexpectedly imputed\n")
setdiff(actuallyImputedVars, imputedVars)
cat("### Variables that were planned for MI but not imputed\n")
setdiff(imputedVars, actuallyImputedVars)
## Still missing variables
cat("### Variables still having missing values\n")
names(complete(miceout, action = 1))[colSums(is.na(complete(miceout, action = 1))) > 0]
###
### Missing data imputation with mice
################################################################################
### Specify Variables informing imputation
[1] "src" "age" "sex" "wm" "hr" "wr" "edu" "etn" "web"
src id pop age sex hm wm hr wr prg edu etn web bm br
src 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
id 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
pop 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
age 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
sex 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
hm 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
wm 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
hr 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
wr 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
prg 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
edu 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
etn 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
web 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
bm 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
br 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
### Specify variables with missingness to be imputed
[1] "bm" "wm" "edu" "etn"
src id pop age sex hm wm hr wr prg edu etn web bm br
src 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
id 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
pop 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
age 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
sex 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
hm 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
wm 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
hr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
wr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
prg 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
edu 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
etn 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
web 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
bm 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
br 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
### Construct a full predictor matrix (rows: imputed variables; cols: imputer variables)
src id pop age sex hm wm hr wr prg edu etn web bm br
src 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
id 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
pop 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
age 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
sex 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
hm 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
wm 1 0 0 1 1 0 0 1 1 0 1 1 1 0 0
hr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
wr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
prg 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
edu 1 0 0 1 1 0 1 1 1 0 0 1 1 0 0
etn 1 0 0 1 1 0 1 1 1 0 1 0 1 0 0
web 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
bm 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
br 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
### Dry-run mice for imputation methods
### Imputers (non-zero columns of predictorMatrix)
[1] "src" "age" "sex" "wm" "hr" "wr" "edu" "etn" "web"
### Imputed (non-zero rows of predictorMatrix)
[1] "wm" "edu" "etn" "bm"
### Imputers that are complete
[1] "src" "age" "sex" "hr" "wr" "web"
### Imputers with missingness
[1] "wm" "edu" "etn"
### Imputed-only variables without being imputers
[1] "bm"
### Variables with missingness that are not imputed
[1] "hm" "prg"
### Relevant part of predictorMatrix
src age sex wm hr wr edu etn web
wm 1 1 1 0 1 1 1 1 1
edu 1 1 1 1 1 1 0 1 1
etn 1 1 1 1 1 1 1 0 1
bm 1 1 1 1 1 1 1 1 1
### Methods used for imputation
wm edu etn bm
"pmm" "polyreg" "polyreg" "pmm"
### Run mice
### Imputing 4 times
### Show mice results
Multiply imputed data set
Call:
[[1]]
mice(data = df_before, m = 1, method = dryMice$method, predictorMatrix = predictorMatrix,
printFlag = TRUE, MaxNWts = 2000)
[[2]]
fun(x = result.1, y = result.2)
[[3]]
fun(x = accum, y = result.3)
[[4]]
fun(x = accum, y = result.4)
Number of multiple imputations: 4
Missing cells per column:
src id pop age sex hm wm hr wr prg edu etn web bm br
0 0 0 0 0 803 803 0 0 1657 1257 1257 0 803 0
Imputation methods:
src id pop age sex hm wm hr wr prg edu
"" "" "" "" "" "" "pmm" "" "" "" "polyreg"
etn web bm br
"polyreg" "" "pmm" ""
VisitSequence:
hm wm prg edu etn bm
6 7 10 11 12 14
PredictorMatrix:
src id pop age sex hm wm hr wr prg edu etn web bm br
src 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
id 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
pop 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
age 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
sex 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
hm 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
wm 1 0 0 1 1 0 0 1 1 0 1 1 1 0 0
hr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
wr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
prg 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
edu 1 0 0 1 1 0 1 1 1 0 0 1 1 0 0
etn 1 0 0 1 1 0 1 1 1 0 1 0 1 0 0
web 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
bm 1 0 0 1 1 0 1 1 1 0 1 1 1 0 0
br 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Random generator seed value: NA
### Variables actually imputed
[1] "wm" "edu" "etn" "bm"
### Variables that were unexpectedly imputed
character(0)
### Variables that were planned for MI but not imputed
character(0)
### Variables still having missing values
[1] "hm" "prg"