# packages
pacman::p_load(tidyverse,
rio,
here,
missRanger,
mitools,
mice)
# dplyr::select
select <- dplyr::select
# random seeds
set.seed(2025)How to conduct parametric and non-parametric MICE by mice and missRanger
Introduction
This document presents examples of multiple imputation coding using parametric MICE with the mice package and nonparametric MICE (by chained random forests) with the missRanger package, using the Diamonds dataset that generated missing values.
本ドキュメントは、欠測を生成したDiamondsデータセットを使用して、miceパッケージを用いたparametric MICEと、missRangerパッケージを用いた(chained random forestsによる)non-parametric MICEによる多重補完のコーディング例を提示します。
Preparations
Preparing the data.
データを準備します。
# prepareing the data
dat <- ggplot2::diamonds %>%
# Reduce to 1000 to save the time
slice(1:1000) %>%
# Delete order information
mutate(cut = factor(as.character(cut)),
cut = fct_relevel(cut, "Fair", "Good", "Very Good", "Premium", "Ideal")) %>%
# remove depth and table
select(-depth, -table) %>%
# rename
rename(length = x,
width = y,
depth = z) %>%
# Convert color to a categorical variable
mutate(color_cat = case_when(color %in% c("D", "E", "F", "G") ~ 0,
color %in% c("H", "I", "J") ~ 1),
color_cat = factor(color_cat,
levels = c(0, 1),
labels = c("Worse", "Better"))) %>%
# Convert clarity to a categorical variable
mutate(clarity_cat = case_when(clarity %in% c("I1", "SI2", "SI1") ~ 0,
clarity %in% c("VS2", "VS1", "VVS2") ~ 1,
clarity %in% c("VVS1", "IF") ~ 2),
clarity_cat = factor(clarity_cat,
levels = c(0, 1, 2),
labels = c("Worse", "Moderate", "Better"))) %>%
# remove unused variables
select(-color, -clarity) %>%
# generate NA with 10% rates
generateNA(p = 0.1)Check the data.
データを確認します。
head(dat)# A tibble: 6 × 8
carat cut price length width depth color_cat clarity_cat
<dbl> <fct> <int> <dbl> <dbl> <dbl> <fct> <fct>
1 0.23 Ideal 326 3.95 3.98 NA Worse Worse
2 0.21 Premium 326 NA 3.84 2.31 Worse Worse
3 0.23 Good 327 4.05 4.07 2.31 Worse Moderate
4 0.29 Premium 334 4.2 4.23 2.63 Better Moderate
5 0.31 Good NA 4.34 4.35 2.75 Better Worse
6 0.24 <NA> 336 3.94 3.96 2.48 Better Moderate
parametric MICE by mice
# generate mice dataset
dat_mice <- mice(data = dat,
m = 10,
maxit = 10,
seed = 2025,
printFlag = FALSE)
# method = method (omit)
# predictorMatrix = predictorMatrix (omit)# conduct MI
dat_mice_imputed <- dat_mice %>%
# stack up complete dataset
mice::complete("long") %>%
# nest
as_tibble() %>%
group_by(.imp) %>%
nest() %>%
# For each complete data,
# perform the procedure of interest with the map function
mutate(fit = map(.x = data, ~{
glm(price ~
carat + cut + length + width + depth + color_cat + clarity_cat,
family = gaussian,
data = .x)
})) %>%
# extract coefficients and vcov
mutate(coef_fit = map(fit, coef),
vcov_fit = map(fit, vcov))# summary with mitools package
mi_mice <- with(dat_mice_imputed, MIcombine(results = coef_fit,
variances = vcov_fit))
mi_mice_summary <- summary(mi_mice)Multiple imputation results:
MIcombine.default(results = coef_fit, variances = vcov_fit)
results se (lower upper) missInfo
(Intercept) -9909.588405 301.58245 -10509.25355 -9309.92326 34 %
carat -4849.511130 300.24295 -5447.89014 -4251.13212 37 %
cutGood -6.327025 42.07081 -89.61023 76.95618 28 %
cutVery Good 20.529914 38.63878 -56.01957 97.07940 29 %
cutPremium 11.083030 35.21546 -58.41992 80.58598 24 %
cutIdeal -1.726766 38.43569 -77.87111 74.41757 29 %
length 721.114455 128.29363 466.27088 975.95803 33 %
width 805.639823 130.27618 545.54367 1065.73597 39 %
depth 2063.055863 91.65052 1883.08533 2243.02640 12 %
color_catBetter -271.590470 18.48420 -308.31822 -234.86272 33 %
clarity_catModerate 171.568640 14.48877 143.12185 200.01543 12 %
clarity_catBetter 469.485725 26.66404 417.19837 521.77308 6 %
non-parametric MICE by missRanger
Imputation by missRanger produces one complete dataset in one run with one random seed.
Therefore, based on the specified set.seed(2025) , additional random seeds (10 in this case) is generated to ensure reproducibility.
missRangerによる補完は、1つのランダムシードを用いた1回の実行で、1つの完全データセットを生成します。
そのため、最初に指定したset.seed(2025)に基づいて、生成するデータセット分の乱数(今回は10)を発生させて、再現性を担保します。
dat_ranger_imputed <- tibble(
# 10 imputations
.imp = 1:10,
# 10 original datasets with NA
original = rep(list(dat), 10),
# random seed
seed = sample(1:9999, 10, replace = FALSE)
) %>%
# Using the map2 function,
# specify the original data set (all the same) in .x
# and a random seed in .y
mutate(data = map2(.x = original, .y = seed, ~{
missRanger(.x,
formula = . ~ .,
maxiter = 10,
pmm.k = 3,
seed = .y,
verbose = 0,
data_only = TRUE,
num.trees = 100,
mtry = 1) # or floor(sqrt(ncol(dat)))
})) %>%
mutate(fit = map(.x = data, ~{
glm(price ~
carat + cut + length + width + depth + color_cat + clarity_cat,
family = gaussian,
data = .x)
})) %>%
mutate(coef_fit = map(fit, coef),
vcov_fit = map(fit, vcov))# summary with mitools package
mi_ranger <- with(dat_ranger_imputed, MIcombine(results = coef_fit,
variances = vcov_fit))
mi_ranger_summary <- summary(mi_ranger)Multiple imputation results:
MIcombine.default(results = coef_fit, variances = vcov_fit)
results se (lower upper) missInfo
(Intercept) -8738.98706 332.34167 -9407.02971 -8070.9444 45 %
carat -3769.22492 326.26529 -4425.40094 -3113.0489 46 %
cutGood 17.70202 50.78883 -84.24521 119.6492 44 %
cutVery Good 36.37413 43.01121 -49.30631 122.0546 36 %
cutPremium 39.05739 43.18906 -47.53991 125.6547 43 %
cutIdeal 45.76395 45.55410 -45.74656 137.2745 45 %
length 549.05669 136.93218 276.62789 821.4855 35 %
width 803.05301 140.18132 522.43411 1083.6719 41 %
depth 1784.38008 116.50748 1550.88119 2017.8790 43 %
color_catBetter -303.18420 18.27847 -339.15568 -267.2127 18 %
clarity_catModerate 174.53947 19.42981 135.48087 213.5981 45 %
clarity_catBetter 459.70032 36.96544 385.37692 534.0237 46 %