How to conduct parametric and non-parametric MICE by mice and missRanger

Author

Hiroyuki Kuroda, Yokohama City University

Introduction

This document presents examples of multiple imputation coding using parametric MICE with the mice package and nonparametric MICE (by chained random forests) with the missRanger package, using the Diamonds dataset that generated missing values.

本ドキュメントは、欠測を生成したDiamondsデータセットを使用して、miceパッケージを用いたparametric MICEと、missRangerパッケージを用いた(chained random forestsによる)non-parametric MICEによる多重補完のコーディング例を提示します。

Preparations

# packages
pacman::p_load(tidyverse, 
               rio, 
               here, 
               missRanger, 
               mitools, 
               mice)

# dplyr::select
select <- dplyr::select

# random seeds
set.seed(2025)

Preparing the data.

データを準備します。

# prepareing the data
dat <- ggplot2::diamonds %>% 
  
  # Reduce to 1000 to save the time
  slice(1:1000) %>% 
  
  # Delete order information
  mutate(cut = factor(as.character(cut)),  
         cut = fct_relevel(cut, "Fair", "Good", "Very Good", "Premium", "Ideal")) %>% 
  
  # remove depth and table
  select(-depth, -table) %>% 
  
  # rename
  rename(length = x, 
         width = y, 
         depth = z) %>% 
  
  # Convert color to a categorical variable
  mutate(color_cat = case_when(color %in% c("D", "E", "F", "G") ~ 0, 
                               color %in% c("H", "I", "J") ~ 1), 
         color_cat = factor(color_cat, 
                            levels = c(0, 1), 
                            labels = c("Worse", "Better"))) %>% 
  
  # Convert clarity to a categorical variable
  mutate(clarity_cat = case_when(clarity %in% c("I1", "SI2", "SI1") ~ 0, 
                                 clarity %in% c("VS2", "VS1", "VVS2") ~ 1, 
                                 clarity %in% c("VVS1", "IF") ~ 2), 
         clarity_cat = factor(clarity_cat, 
                              levels = c(0, 1, 2), 
                              labels = c("Worse", "Moderate", "Better"))) %>% 
  
  # remove unused variables
  select(-color, -clarity) %>% 
  
  # generate NA with 10% rates
  generateNA(p = 0.1)

Check the data.

データを確認します。

head(dat)
# A tibble: 6 × 8
  carat cut     price length width depth color_cat clarity_cat
  <dbl> <fct>   <int>  <dbl> <dbl> <dbl> <fct>     <fct>      
1  0.23 Ideal     326   3.95  3.98 NA    Worse     Worse      
2  0.21 Premium   326  NA     3.84  2.31 Worse     Worse      
3  0.23 Good      327   4.05  4.07  2.31 Worse     Moderate   
4  0.29 Premium   334   4.2   4.23  2.63 Better    Moderate   
5  0.31 Good       NA   4.34  4.35  2.75 Better    Worse      
6  0.24 <NA>      336   3.94  3.96  2.48 Better    Moderate   

parametric MICE by mice

# generate mice dataset
dat_mice <- mice(data = dat,
                 m = 10, 
                 maxit = 10,
                 seed = 2025,
                 printFlag = FALSE)
                 # method = method (omit)
                 # predictorMatrix = predictorMatrix (omit)
# conduct MI

dat_mice_imputed <- dat_mice %>% 

  # stack up complete dataset
  mice::complete("long") %>%
  
  # nest
  as_tibble() %>%
  group_by(.imp) %>%
  nest() %>%
  
  # For each complete data, 
  # perform the procedure of interest with the map function
  mutate(fit = map(.x = data, ~{
    glm(price ~ 
          carat + cut + length + width + depth + color_cat + clarity_cat, 
        family = gaussian, 
        data = .x)
  })) %>%
  
  # extract coefficients and vcov
  mutate(coef_fit = map(fit, coef),
         vcov_fit = map(fit, vcov))
# summary with mitools package
mi_mice <- with(dat_mice_imputed, MIcombine(results = coef_fit,
                                            variances = vcov_fit))

mi_mice_summary <- summary(mi_mice)
Multiple imputation results:
      MIcombine.default(results = coef_fit, variances = vcov_fit)
                         results        se       (lower      upper) missInfo
(Intercept)         -9909.588405 301.58245 -10509.25355 -9309.92326     34 %
carat               -4849.511130 300.24295  -5447.89014 -4251.13212     37 %
cutGood                -6.327025  42.07081    -89.61023    76.95618     28 %
cutVery Good           20.529914  38.63878    -56.01957    97.07940     29 %
cutPremium             11.083030  35.21546    -58.41992    80.58598     24 %
cutIdeal               -1.726766  38.43569    -77.87111    74.41757     29 %
length                721.114455 128.29363    466.27088   975.95803     33 %
width                 805.639823 130.27618    545.54367  1065.73597     39 %
depth                2063.055863  91.65052   1883.08533  2243.02640     12 %
color_catBetter      -271.590470  18.48420   -308.31822  -234.86272     33 %
clarity_catModerate   171.568640  14.48877    143.12185   200.01543     12 %
clarity_catBetter     469.485725  26.66404    417.19837   521.77308      6 %

non-parametric MICE by missRanger

Imputation by missRanger produces one complete dataset in one run with one random seed.
Therefore, based on the specified set.seed(2025) , additional random seeds (10 in this case) is generated to ensure reproducibility.

missRangerによる補完は、1つのランダムシードを用いた1回の実行で、1つの完全データセットを生成します。
そのため、最初に指定したset.seed(2025)に基づいて、生成するデータセット分の乱数(今回は10)を発生させて、再現性を担保します。

dat_ranger_imputed <- tibble(
  # 10 imputations
  .imp = 1:10, 
  
  # 10 original datasets with NA
  original = rep(list(dat), 10), 
  
  # random seed 
  seed = sample(1:9999, 10, replace = FALSE)
  ) %>% 
  
  # Using the map2 function, 
  # specify the original data set (all the same) in .x 
  # and a random seed in .y
  mutate(data = map2(.x = original, .y = seed, ~{
    missRanger(.x, 
               formula = . ~ ., 
               maxiter = 10, 
               pmm.k = 3, 
               seed = .y, 
               verbose = 0, 
               data_only = TRUE, 
               num.trees = 100, 
               mtry = 1) # or floor(sqrt(ncol(dat)))
  })) %>% 

  mutate(fit = map(.x = data, ~{
    glm(price ~ 
          carat + cut + length + width + depth + color_cat + clarity_cat, 
        family = gaussian, 
        data = .x)
  })) %>%
  
  mutate(coef_fit = map(fit, coef),
         vcov_fit = map(fit, vcov))
# summary with mitools package
mi_ranger <- with(dat_ranger_imputed, MIcombine(results = coef_fit,
                                                variances = vcov_fit))

mi_ranger_summary <- summary(mi_ranger)
Multiple imputation results:
      MIcombine.default(results = coef_fit, variances = vcov_fit)
                        results        se      (lower     upper) missInfo
(Intercept)         -8738.98706 332.34167 -9407.02971 -8070.9444     45 %
carat               -3769.22492 326.26529 -4425.40094 -3113.0489     46 %
cutGood                17.70202  50.78883   -84.24521   119.6492     44 %
cutVery Good           36.37413  43.01121   -49.30631   122.0546     36 %
cutPremium             39.05739  43.18906   -47.53991   125.6547     43 %
cutIdeal               45.76395  45.55410   -45.74656   137.2745     45 %
length                549.05669 136.93218   276.62789   821.4855     35 %
width                 803.05301 140.18132   522.43411  1083.6719     41 %
depth                1784.38008 116.50748  1550.88119  2017.8790     43 %
color_catBetter      -303.18420  18.27847  -339.15568  -267.2127     18 %
clarity_catModerate   174.53947  19.42981   135.48087   213.5981     45 %
clarity_catBetter     459.70032  36.96544   385.37692   534.0237     46 %