This example illustrates typical aspects of dealing with missing data. Topics include mean imputation, modal imputation for categorical data, and multiple imputation of complex patterns of missing data.

For this example I am using 2014 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link

library(car)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(mice) #Library for multiple imputation
## Loading required package: Rcpp
## mice 2.25 2015-11-09
library(MASS)

#load brfss
load("brfss_14.Rdata")
nams<-names(brfss14)
newnames<-gsub(pattern = "x_",replacement =  "",x =  nams)
names(brfss14)<-tolower(newnames)

set.seed(1234)


brfss14$strat<-sprintf("%05d", brfss14$ststr)
brfss14$state<-substr(brfss14$strat, 1,2)

#for brevity, I keep only Texas respondents (state FIPS 48)
brfss14<-brfss14[brfss14$state=="48",]

#insurance
brfss14$ins<-recode(brfss14$hlthpln1, recodes="1=1;2=0;else=NA")
#bmi
brfss14$bmi<-ifelse(is.na(brfss14$bmi5)==T, NA, brfss14$bmi5/100)
#poor or fair health
brfss14$badhealth<-recode(brfss14$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#race
brfss14$black<-recode(brfss14$racegr3, recodes="2=1; 9=NA; else=0")
brfss14$white<-recode(brfss14$racegr3, recodes="1=1; 9=NA; else=0")
brfss14$other<-recode(brfss14$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss14$hispanic<-recode(brfss14$racegr3, recodes="5=1; 9=NA; else=0")
brfss14$race_eth<-recode(brfss14$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';
                         4='nh multirace'; 5='hispanic'; else=NA", as.factor.result = T)
brfss14$race_eth<-relevel(brfss14$race_eth, ref = "nhwhite")

#have a personal doctor?
brfss14$doc<-recode(brfss14$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)

#needed care in last year but couldn't get it because of cost
brfss14$medacc<-recode(brfss14$medcost, recodes="1=1;2=0;else=NA")

#education level
brfss14$educ<-recode(brfss14$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
brfss14$educ<-relevel(brfss14$educ, ref='2hsgrad')

#employment
brfss14$emp<-recode(brfss14$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss14$emp<-relevel(brfss14$emp, ref='Employed')

#marital status
brfss14$marst<-recode(brfss14$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss14$marst<-relevel(brfss14$marst, ref='married')

#income
brfss14$inc<-recode(brfss14$income2, recodes="1='<10k'; 2='10 to 15k'; 3='15 to 20k'; 4='20 to 25k'; 5='25 to 35k';6='35 to 50k'; 7='50 to 75k';8='75k+'; 77:99=NA", as.factor.result = T)

#Age cut into intervals
brfss14$agec<-cut(brfss14$age80, breaks=c(0,24,39,59,79,99), include.lowest = T)

#Health behaviors
#insurance
brfss14$ins<-recode(brfss14$hlthpln1, recodes = "1=1; 2=0; else=NA" )

#smoking currently
brfss14$smoke<-recode(brfss14$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor.result=T)
brfss14$smoke<-relevel(brfss14$smoke, ref = "NeverSmoked")

Now, we can get a general idea of the missingness of these variables by just using summary() on the recoded columns:

summary(brfss14[, c("ins", "smoke",  "bmi", "badhealth", "race_eth", "doc", "educ", "emp", "marst", "inc")])
##       ins                 smoke           bmi          badhealth     
##  Min.   :0.0000   NeverSmoked:2377   Min.   :13.77   Min.   :0.0000  
##  1st Qu.:1.0000   Current    : 682   1st Qu.:23.91   1st Qu.:0.0000  
##  Median :1.0000   Former     :1279   Median :27.32   Median :0.0000  
##  Mean   :0.9024   NA's       : 131   Mean   :28.18   Mean   :0.1984  
##  3rd Qu.:1.0000                      3rd Qu.:31.09   3rd Qu.:0.0000  
##  Max.   :1.0000                      Max.   :88.19   Max.   :1.0000  
##  NA's   :12                          NA's   :258     NA's   :9       
##          race_eth         doc               educ            emp      
##  nhwhite     :3286   Min.   :0.0000   2hsgrad :1274   Employed:2204  
##  hispanic    : 261   1st Qu.:1.0000   0Prim   : 102   nilf    : 662  
##  nh black    : 356   Median :1.0000   1somehs : 277   retired :1185  
##  nh multirace: 223   Mean   :0.8294   3somecol:1288   unable  : 394  
##  nh other    : 310   3rd Qu.:1.0000   4colgrad:1504   NA's    :  24  
##  NA's        :  33   Max.   :1.0000   NA's    :  24                  
##                      NA's   :3                                       
##        marst             inc      
##  married  :2374   75k+     :1089  
##  cohab    : 113   35 to 50k: 588  
##  divorced : 675   50 to 75k: 581  
##  nm       : 588   25 to 35k: 491  
##  separated:  94   20 to 25k: 375  
##  widowed  : 599   (Other)  : 719  
##  NA's     :  26   NA's     : 626

Which shows that, among these recoded variables, the income variable (inc) is missing the most, with 626 people in our TX sample missing, or about 14% of the sample.

The lowest number of missings is in the personal doctor variable (doc), which has only about 0.07% missing.
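
A quick way to get these percentages for all of the recoded variables at once is to take the column means of the missingness indicators (a small sketch, reusing the column list from the summary above):

#percent missing for each recoded variable
vars<-c("ins", "smoke", "bmi", "badhealth", "race_eth", "doc", "educ", "emp", "marst", "inc")
round(100*colMeans(is.na(brfss14[, vars])), 2)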

Mean imputation

Now, I’m going to illustrate mean imputation of a continuous variable, BMI.

#I'm going to play with 3 outcomes, bmi, having a regular doctor and income category
summary(brfss14$bmi) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.77   23.91   27.32   28.18   31.09   88.19     258
#what happens when we replace the missings with the mean?
brfss14$bmi.imp.mean<-ifelse(is.na(brfss14$bmi)==T, mean(brfss14$bmi, na.rm=T), brfss14$bmi)

mean(brfss14$bmi, na.rm=T)
## [1] 28.18241
mean(brfss14$bmi.imp.mean) #no difference!
## [1] 28.18241
median(brfss14$bmi, na.rm=T)
## [1] 27.32
median(brfss14$bmi.imp.mean) #slight difference
## [1] 27.48
var(brfss14$bmi, na.rm=T)
## [1] 40.16017
var(brfss14$bmi.imp.mean) # more noticeable difference!
## [1] 37.84117

So what we see here is that imputing with the mean does nothing to central tendency when measured with the mean (though it shifts the median slightly), but it reduces the variance in the outcome. This is because you’re replacing every missing case with the single most likely value (the mean), so you’re artificially deflating the variance. That’s not good.
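
The size of that deflation is predictable: the imputed values sit exactly at the mean, so the sum of squared deviations is unchanged while the denominator of the variance grows from the number of observed cases to the full sample size. A quick check of that reasoning:

#mean imputation shrinks the variance by roughly the fraction of cases observed
n<-length(brfss14$bmi)
n.obs<-sum(!is.na(brfss14$bmi))
var(brfss14$bmi, na.rm=T)*(n.obs-1)/(n-1) #matches var(brfss14$bmi.imp.mean)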

We can see this in a histogram:

#plot the histogram
hist(brfss14$bmi.imp.mean)
hist(brfss14$bmi, add=T, col=1)

Where you can see the extra imputed values piled up at the mean as the white area above the mode.
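
The categorical analogue of mean imputation is modal imputation: replace the missing values with the most common category. A minimal sketch, using the smoking variable (it shares the same drawback, artificially inflating the modal category):

#find the modal category of smoking status and fill the missings with it
tab<-table(brfss14$smoke)
mode.smoke<-names(tab)[which.max(tab)]
brfss14$smoke.imp.mode<-ifelse(is.na(brfss14$smoke), mode.smoke, as.character(brfss14$smoke))
table(brfss14$smoke.imp.mode, useNA="ifany")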

Multiple Imputation

These days, these simple types of imputation have been far surpassed by more complete, regression-based methods. These methods are generally referred to as multiple imputation, because instead of filling in each missing value once, we generate several completed data sets, each with its own imputed values, and then combine the analyses across them. Instead of reviewing this perspective here, I suggest you have a look at Joe Schafer’s site, which gives a nice treatment of the subject. Here, I will use the imputation techniques in the mice library in R, which you can read about here.

I have used these methods in publications and generally like the framework the library uses. Another popular option is the Amelia library from Gary King, which I haven’t used much. If you are serious about doing multiple imputation, it would be wise to investigate multiple methodologies.

To begin, I explore the various patterns of missingness in the data. The md.pattern() function in mice does this nicely. Each row of its output corresponds to a particular pattern of missingness (1 = observed, 0 = missing).

#look at the patterns of missingness
md.pattern(brfss14[,c("bmi", "inc", "agec","educ","race_eth")])
##      agec educ race_eth bmi inc    
## 3688    1    1        1   1   1   0
##  135    1    1        1   0   1   1
##  497    1    1        1   1   0   1
##    4    1    0        1   1   1   1
##   15    1    1        0   1   1   1
##   95    1    1        1   0   0   2
##    1    1    1        0   0   1   2
##    7    1    1        0   1   0   2
##   17    1    0        1   0   0   3
##    7    1    1        0   0   0   3
##    3    1    0        0   0   0   4
##         0   24       33 258 626 941

The first row shows that 3688 rows of the data are complete.

The second row shows that 135 people are missing only the bmi variable. I say only because 258 people are missing bmi in total; the rest are missing bmi in combination with other variables.

Sure enough, looking further down, we see that 95 people are missing bmi AND income, 17 are missing bmi, income, AND education, and so on.

The bottom row tells how many total people are missing each variable, in ANY combination with other variables.
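
You can verify the first row of the pattern table directly by counting complete cases on these five variables:

#rows with no missing values on any of the five variables
sum(complete.cases(brfss14[,c("bmi", "inc", "agec","educ","race_eth")]))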

If you want to see how pairs of variables are missing together, the md.pairs() function will show this. A pair of variables can have exactly four missingness patterns: both variables are observed (pattern rr), the first variable is observed and the second variable is missing (pattern rm), the first variable is missing and the second variable is observed (pattern mr), and both are missing (pattern mm).

md.pairs(brfss14[,c("bmi", "inc", "agec","educ","race_eth")])
## $rr
##           bmi  inc agec educ race_eth
## bmi      4211 3707 4211 4207     4189
## inc      3707 3843 3843 3839     3827
## agec     4211 3843 4469 4445     4436
## educ     4207 3839 4445 4445     4415
## race_eth 4189 3827 4436 4415     4436
## 
## $rm
##          bmi inc agec educ race_eth
## bmi        0 504    0    4       22
## inc      136   0    0    4       16
## agec     258 626    0   24       33
## educ     238 606    0    0       30
## race_eth 247 609    0   21        0
## 
## $mr
##          bmi inc agec educ race_eth
## bmi        0 136  258  238      247
## inc      504   0  626  606      609
## agec       0   0    0    0        0
## educ       4   4   24    0       21
## race_eth  22  16   33   30        0
## 
## $mm
##          bmi inc agec educ race_eth
## bmi      258 122    0   20       11
## inc      122 626    0   20       17
## agec       0   0    0    0        0
## educ      20  20    0   24        3
## race_eth  11  17    0    3       33

Basic imputation

We can perform a basic multiple imputation by simply doing the following (note that this may take a very long time with big data sets):

imp<-mice(data = brfss14[,c("bmi", "inc", "agec","educ","race_eth")], seed = 22)
## 
##  iter imp variable
##   1   1  bmi  inc  educ  race_eth
##   1   2  bmi  inc  educ  race_eth
##   1   3  bmi  inc  educ  race_eth
##   1   4  bmi  inc  educ  race_eth
##   1   5  bmi  inc  educ  race_eth
##   2   1  bmi  inc  educ  race_eth
##   2   2  bmi  inc  educ  race_eth
##   2   3  bmi  inc  educ  race_eth
##   2   4  bmi  inc  educ  race_eth
##   2   5  bmi  inc  educ  race_eth
##   3   1  bmi  inc  educ  race_eth
##   3   2  bmi  inc  educ  race_eth
##   3   3  bmi  inc  educ  race_eth
##   3   4  bmi  inc  educ  race_eth
##   3   5  bmi  inc  educ  race_eth
##   4   1  bmi  inc  educ  race_eth
##   4   2  bmi  inc  educ  race_eth
##   4   3  bmi  inc  educ  race_eth
##   4   4  bmi  inc  educ  race_eth
##   4   5  bmi  inc  educ  race_eth
##   5   1  bmi  inc  educ  race_eth
##   5   2  bmi  inc  educ  race_eth
##   5   3  bmi  inc  educ  race_eth
##   5   4  bmi  inc  educ  race_eth
##   5   5  bmi  inc  educ  race_eth
print(imp)
## Multiply imputed data set
## Call:
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## Number of multiple imputations:  5
## Missing cells per column:
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## Imputation methods:
##       bmi       inc      agec      educ  race_eth 
##     "pmm" "polyreg"        "" "polyreg" "polyreg" 
## VisitSequence:
##      bmi      inc     educ race_eth 
##        1        2        4        5 
## PredictorMatrix:
##          bmi inc agec educ race_eth
## bmi        0   1    1    1        1
## inc        1   0    1    1        1
## agec       0   0    0    0        0
## educ       1   1    1    0        1
## race_eth   1   1    1    1        0
## Random generator seed value:  22

This output shows how many imputations were done, the total missingness per column, and which imputation method was used for each variable (because you wouldn’t want to use a normal distribution for a categorical variable!). It also shows the sequence in which the variables are visited (imputed); the default is left to right.
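
By default mice() produces m = 5 imputations with maxit = 5 iterations each, and picks a method for each column based on its type. All of these can be changed through arguments to mice(); a sketch (not run here), keeping predictive mean matching for bmi and polytomous regression for the factors but asking for more imputations and iterations:

#hypothetical call: 10 imputations, 10 iterations, methods set explicitly per column
imp10<-mice(data = brfss14[,c("bmi", "inc", "agec","educ","race_eth")], m = 10, maxit = 10,
            method = c("pmm", "polyreg", "", "polyreg", "polyreg"), seed = 22, printFlag = FALSE)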

We may want to make sure imputed values are plausible by having a look:

head(imp$imp$bmi)
##            1     2     3     4     5
## 140290 25.10 36.91 34.33 14.53 40.24
## 140297 21.97 25.39 26.78 26.29 28.50
## 140342 37.23 34.72 25.85 19.84 31.57
## 140352 25.82 25.80 26.61 28.37 28.19
## 140379 19.67 21.46 21.77 21.95 20.31
## 140401 27.89 37.12 47.47 49.38 26.52
head(imp$imp$inc)
##                1         2         3         4         5
## 140283 50 to 75k 50 to 75k 35 to 50k 35 to 50k 25 to 35k
## 140290 10 to 15k 15 to 20k 35 to 50k      75k+ 15 to 20k
## 140292 25 to 35k 15 to 20k 35 to 50k      75k+      <10k
## 140297      75k+ 35 to 50k 35 to 50k 20 to 25k 10 to 15k
## 140300 20 to 25k 35 to 50k 25 to 35k 35 to 50k 50 to 75k
## 140304 35 to 50k      75k+ 20 to 25k 25 to 35k 50 to 75k

Which shows the imputed values for the first 6 cases across the 5 different imputations. We can see that there is variation across the imputations, because the imputed values are not the same.
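
We can summarize that between-imputation variability numerically: imp$imp$bmi holds one row per case with missing bmi and one column per imputation, so the row-wise standard deviations show how much the five imputed values disagree for each case (a small sketch):

#spread of the 5 imputed bmi values for each case that was originally missing
summary(apply(imp$imp$bmi, 1, sd))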

We can also do some plotting. For instance if we want to see how the observed and imputed values of bmi look with respect to race, we can do:

library(lattice)
stripplot(imp,bmi~race_eth|.imp, pch=20)

and we see the distribution of the original data (blue dots) and the imputed data (red dots) across the levels of race, for each of the five imputation runs (the number at the top of each panel shows which run, and the first panel is the original data).

This plot shows that the imputed bmi values correspond well with the observed data, so they are probably plausible values.
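
Another quick diagnostic is to compare the densities of the observed and imputed bmi values; mice supplies a densityplot() method for mids objects:

#observed bmi density vs. one imputed density per imputation
densityplot(imp, ~bmi)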

If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.

dat.imp<-complete(imp)
head(dat.imp, n=10)
##          bmi       inc    agec     educ race_eth
## 140282 24.33 25 to 35k  [0,24]  2hsgrad hispanic
## 140283 37.25 50 to 75k (39,59]  2hsgrad  nhwhite
## 140284 22.24      75k+  [0,24] 3somecol  nhwhite
## 140285 25.62      75k+ (39,59] 3somecol nh other
## 140286 21.63      75k+ (24,39] 4colgrad  nhwhite
## 140287 25.84      75k+ (59,79] 4colgrad  nhwhite
## 140288 24.41 25 to 35k  [0,24] 3somecol  nhwhite
## 140289 19.22 50 to 75k (24,39] 3somecol  nhwhite
## 140290 25.10 10 to 15k (24,39] 3somecol nh black
## 140291 28.65 50 to 75k (24,39] 3somecol hispanic
#Compare to the original data
head(brfss14[,c("bmi", "inc", "agec","educ","black","white","hispanic")], n=10)
##          bmi       inc    agec     educ black white hispanic
## 140282 24.33 25 to 35k  [0,24]  2hsgrad     0     0        1
## 140283 37.25      <NA> (39,59]  2hsgrad     0     1        0
## 140284 22.24      75k+  [0,24] 3somecol     0     1        0
## 140285 25.62      75k+ (39,59] 3somecol     0     0        0
## 140286 21.63      75k+ (24,39] 4colgrad     0     1        0
## 140287 25.84      75k+ (59,79] 4colgrad     0     1        0
## 140288 24.41 25 to 35k  [0,24] 3somecol     0     1        0
## 140289 19.22 50 to 75k (24,39] 3somecol     0     1        0
## 140290    NA      <NA> (24,39]     <NA>     1     0        0
## 140291 28.65 50 to 75k (24,39] 3somecol     0     0        1
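
If you want all of the completed data sets at once, complete() can also return them stacked in long format (action = "long"), with an .imp column identifying which imputation each row came from:

#all five completed data sets stacked; .imp indexes the imputation
dat.long<-complete(imp, action="long")
table(dat.long$.imp)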

While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:

head(dat.imp[is.na(brfss14$bmi)==T,], n=10)
##          bmi       inc    agec     educ race_eth
## 140290 25.10 10 to 15k (24,39] 3somecol nh black
## 140297 21.97      75k+  [0,24] 4colgrad  nhwhite
## 140342 37.23 35 to 50k (59,79] 4colgrad  nhwhite
## 140352 25.82 15 to 20k  [0,24]  1somehs hispanic
## 140379 19.67 20 to 25k (59,79]    0Prim hispanic
## 140401 27.89 35 to 50k (39,59]  2hsgrad  nhwhite
## 140443 31.32 20 to 25k (39,59]  2hsgrad  nhwhite
## 140451 34.70 20 to 25k (59,79] 3somecol nh other
## 140470 41.84 10 to 15k (79,99]  2hsgrad nh black
## 140486 22.15 15 to 20k (24,39]    0Prim hispanic
head(brfss14[is.na(brfss14$bmi)==T,c("bmi", "inc", "agec","educ","race_eth")], n=10)
##        bmi       inc    agec     educ race_eth
## 140290  NA      <NA> (24,39]     <NA> nh black
## 140297  NA      <NA>  [0,24]     <NA>  nhwhite
## 140342  NA      <NA> (59,79] 4colgrad  nhwhite
## 140352  NA 15 to 20k  [0,24]  1somehs hispanic
## 140379  NA      <NA> (59,79]    0Prim hispanic
## 140401  NA      <NA> (39,59]  2hsgrad  nhwhite
## 140443  NA 20 to 25k (39,59]  2hsgrad  nhwhite
## 140451  NA 20 to 25k (59,79] 3somecol nh other
## 140470  NA 10 to 15k (79,99]  2hsgrad nh black
## 140486  NA      <NA> (24,39]    0Prim hispanic

Analyzing the imputed data

A key element of using imputed data is that the relationships we want to know about should be maintained after imputation, and, presumably, the relationships within each imputed data set will be similar. So if we fit the same model to each of the (5, in this case) imputed data sets, we should see similar results across the five models.

Here I look at a linear model for bmi:

#Now, I will look at the variability across the 5 different imputations for each outcome
fit.bmi<-with(data=imp, exp=lm(bmi~inc+agec+educ+race_eth))
fit.bmi
## call :
## with.mids(data = imp, expr = lm(bmi ~ inc + agec + educ + race_eth))
## 
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##   26.458327     0.335561     0.520844    -0.374462    -0.850894  
##        inc6         inc7         inc8        agec2        agec3  
##   -0.785629    -0.726448    -1.119064     3.023091     3.774867  
##       agec4        agec5        educ2        educ3        educ4  
##    3.004520     0.024796    -1.360424    -1.025615     0.004437  
##       educ5    race_eth2    race_eth3    race_eth4    race_eth5  
##   -1.033684    -0.032551     0.425236    -0.169849    -0.347025  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    26.63602      0.16638     -0.50579     -0.84253     -1.02144  
##        inc6         inc7         inc8        agec2        agec3  
##    -0.89876     -0.72390     -1.35295      2.73334      3.75046  
##       agec4        agec5        educ2        educ3        educ4  
##     3.04012      0.06632     -1.90401     -0.91305      0.08618  
##       educ5    race_eth2    race_eth3    race_eth4    race_eth5  
##    -1.03414     -0.09571      0.43466     -0.09511     -0.34998  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##     26.2337       0.6805       0.4253      -0.5538      -0.5335  
##        inc6         inc7         inc8        agec2        agec3  
##     -0.5322      -0.1929      -0.8283       2.7203       3.7345  
##       agec4        agec5        educ2        educ3        educ4  
##      2.9431      -0.0658      -0.8253      -0.8740       0.1829  
##       educ5    race_eth2    race_eth3    race_eth4    race_eth5  
##     -0.9673      -0.3177       0.2808      -0.2371      -0.4758  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    26.34589      0.26108      0.36190     -0.55454     -0.97619  
##        inc6         inc7         inc8        agec2        agec3  
##    -0.74598     -0.48829     -1.18510      3.00405      3.89441  
##       agec4        agec5        educ2        educ3        educ4  
##     3.12241      0.12303     -0.91581     -1.06391      0.03855  
##       educ5    race_eth2    race_eth3    race_eth4    race_eth5  
##    -1.05597      0.42373      0.36907     -0.17874     -0.20772  
## 
## 
## [[5]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    26.18129      0.50754      0.56280     -0.33546     -0.55342  
##        inc6         inc7         inc8        agec2        agec3  
##    -0.47992     -0.16140     -0.90981      2.90280      3.59461  
##       agec4        agec5        educ2        educ3        educ4  
##     2.96881      0.10866     -1.50723     -0.80422      0.14856  
##       educ5    race_eth2    race_eth3    race_eth4    race_eth5  
##    -0.99911     -0.04744      0.46538     -0.18594     -0.39986
with (data=imp, exp=(sd(bmi)))
## call :
## with.mids(data = imp, expr = (sd(bmi)))
## 
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## 
## analyses :
## [[1]]
## [1] 6.424797
## 
## [[2]]
## [1] 6.345079
## 
## [[3]]
## [1] 6.392502
## 
## [[4]]
## [1] 6.405062
## 
## [[5]]
## [1] 6.319322
with (data=imp, exp=(prop.table(table(inc))))
## call :
## with.mids(data = imp, expr = (prop.table(table(inc))))
## 
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## 
## analyses :
## [[1]]
## inc
##       <10k  10 to 15k  15 to 20k  20 to 25k  25 to 35k  35 to 50k 
## 0.05840233 0.05862609 0.07854106 0.10002238 0.12530768 0.15730588 
##  50 to 75k       75k+ 
## 0.15014545 0.27164914 
## 
## [[2]]
## inc
##       <10k  10 to 15k  15 to 20k  20 to 25k  25 to 35k  35 to 50k 
## 0.05370329 0.05616469 0.07831730 0.10337883 0.12844037 0.15506825 
##  50 to 75k       75k+ 
## 0.14902663 0.27590065 
## 
## [[3]]
## inc
##       <10k  10 to 15k  15 to 20k  20 to 25k  25 to 35k  35 to 50k 
## 0.05728351 0.06019244 0.07585590 0.10114119 0.13582457 0.15417319 
##  50 to 75k       75k+ 
## 0.14701275 0.26851645 
## 
## [[4]]
## inc
##       <10k  10 to 15k  15 to 20k  20 to 25k  25 to 35k  35 to 50k 
## 0.05594093 0.05862609 0.07809353 0.09756098 0.13000671 0.15596330 
##  50 to 75k       75k+ 
## 0.14835534 0.27545312 
## 
## [[5]]
## inc
##       <10k  10 to 15k  15 to 20k  20 to 25k  25 to 35k  35 to 50k 
## 0.05526964 0.05795480 0.07898859 0.09935109 0.13291564 0.15305437 
##  50 to 75k       75k+ 
## 0.14813157 0.27433430
with (data=imp, exp=(prop.table(table(race_eth))))
## call :
## with.mids(data = imp, expr = (prop.table(table(race_eth))))
## 
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## 
## analyses :
## [[1]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##   0.74110539   0.05862609   0.08077870   0.05012307   0.06936675 
## 
## [[2]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##   0.73998657   0.05907362   0.08033117   0.05101812   0.06959051 
## 
## [[3]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##   0.74177668   0.05862609   0.07988364   0.04989931   0.06981428 
## 
## [[4]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##   0.74065787   0.05884985   0.08033117   0.05012307   0.07003804 
## 
## [[5]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##   0.74088163   0.05840233   0.08033117   0.05012307   0.07026180
with (data=imp, exp=(prop.table(table(agec))))
## call :
## with.mids(data = imp, expr = (prop.table(table(agec))))
## 
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## 
## analyses :
## [[1]]
## agec
##     [0,24]    (24,39]    (39,59]    (59,79]    (79,99] 
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826 
## 
## [[2]]
## agec
##     [0,24]    (24,39]    (39,59]    (59,79]    (79,99] 
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826 
## 
## [[3]]
## agec
##     [0,24]    (24,39]    (39,59]    (59,79]    (79,99] 
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826 
## 
## [[4]]
## agec
##     [0,24]    (24,39]    (39,59]    (59,79]    (79,99] 
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826 
## 
## [[5]]
## agec
##     [0,24]    (24,39]    (39,59]    (59,79]    (79,99] 
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826
with (data=imp, exp=(prop.table(table(educ))))
## call :
## with.mids(data = imp, expr = (prop.table(table(educ))))
## 
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## 
## analyses :
## [[1]]
## educ
##    2hsgrad      0Prim    1somehs   3somecol   4colgrad 
## 0.28686507 0.02282390 0.06220631 0.28910271 0.33900201 
## 
## [[2]]
## educ
##    2hsgrad      0Prim    1somehs   3somecol   4colgrad 
## 0.28664131 0.02304766 0.06243007 0.29022153 0.33765943 
## 
## [[3]]
## educ
##    2hsgrad      0Prim    1somehs   3somecol   4colgrad 
## 0.28597002 0.02282390 0.06220631 0.29022153 0.33877825 
## 
## [[4]]
## educ
##    2hsgrad      0Prim    1somehs   3somecol   4colgrad 
## 0.28708883 0.02282390 0.06220631 0.29044529 0.33743567 
## 
## [[5]]
## educ
##    2hsgrad      0Prim    1somehs   3somecol   4colgrad 
## 0.28708883 0.02282390 0.06198255 0.28999776 0.33810696

Which we can compare to the model fit on the original data, with missings eliminated:

summary(lm(bmi~inc+agec+educ+race_eth, data=brfss14))
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = brfss14)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.883  -4.183  -0.940   2.957  57.537 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          26.18592    0.66661  39.282  < 2e-16 ***
## inc10 to 15k          0.29581    0.63766   0.464 0.642751    
## inc15 to 20k          0.21908    0.59562   0.368 0.713033    
## inc20 to 25k         -0.33909    0.57228  -0.593 0.553536    
## inc25 to 35k         -0.77314    0.54841  -1.410 0.158686    
## inc35 to 50k         -0.58545    0.54149  -1.081 0.279690    
## inc50 to 75k         -0.35103    0.54558  -0.643 0.519996    
## inc75k+              -1.01161    0.52710  -1.919 0.055036 .  
## agec(24,39]           3.14463    0.53954   5.828 6.08e-09 ***
## agec(39,59]           3.92339    0.51111   7.676 2.09e-14 ***
## agec(59,79]           3.17112    0.50819   6.240 4.87e-10 ***
## agec(79,99]           0.07633    0.62690   0.122 0.903104    
## educ0Prim            -1.33281    0.88433  -1.507 0.131863    
## educ1somehs          -0.80800    0.49356  -1.637 0.101698    
## educ3somecol          0.24741    0.27806   0.890 0.373639    
## educ4colgrad         -1.07534    0.28372  -3.790 0.000153 ***
## race_ethhispanic      0.12929    0.52704   0.245 0.806233    
## race_ethnh black      0.41594    0.38947   1.068 0.285616    
## race_ethnh multirace -0.35715    0.48301  -0.739 0.459697    
## race_ethnh other     -0.36974    0.42531  -0.869 0.384718    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.302 on 3668 degrees of freedom
##   (781 observations deleted due to missingness)
## Multiple R-squared:  0.04216,    Adjusted R-squared:  0.0372 
## F-statistic: 8.498 on 19 and 3668 DF,  p-value: < 2.2e-16

And we see the same general relationships and the same significant effects.

Now we pool the separate models:

est.p<-pool(fit.bmi)
print(est.p)
## Call: pool(object = fit.bmi)
## 
## Pooled coefficients:
## (Intercept)        inc2        inc3        inc4        inc5        inc6 
## 26.37103775  0.39020258  0.27300328 -0.53216328 -0.78709221 -0.68850194 
##        inc7        inc8       agec2       agec3       agec4       agec5 
## -0.45859443 -1.07905432  2.87672264  3.74977204  3.01579518  0.05140060 
##       educ2       educ3       educ4       educ5   race_eth2   race_eth3 
## -1.30256077 -0.93615786  0.09212783 -1.01803343 -0.01392793  0.39501903 
##   race_eth4   race_eth5 
## -0.17333842 -0.35607302 
## 
## Fraction of information about the coefficients missing due to nonresponse: 
## (Intercept)        inc2        inc3        inc4        inc5        inc6 
##  0.11600727  0.14738563  0.51177410  0.17230775  0.23299471  0.15089883 
##        inc7        inc8       agec2       agec3       agec4       agec5 
##  0.30646265  0.21406968  0.11341483  0.07213157  0.03238494  0.02483647 
##       educ2       educ3       educ4       educ5   race_eth2   race_eth3 
##  0.36347516  0.07484830  0.10049004  0.02219006  0.33613479  0.04949621 
##   race_eth4   race_eth5 
##  0.01679925  0.07832378
summary(est.p)
##                     est        se           t         df     Pr(>|t|)
## (Intercept) 26.37103775 0.6024667 43.77177624  304.01316 0.000000e+00
## inc2         0.39020258 0.6022361  0.64792297  197.01825 5.177891e-01
## inc3         0.27300328 0.7130986  0.38284088   18.59852 7.061781e-01
## inc4        -0.53216328 0.5471652 -0.97258244  147.85483 3.323493e-01
## inc5        -0.78709221 0.5459563 -1.44167625   84.41333 1.530934e-01
## inc6        -0.68850194 0.5137380 -1.34018107  188.70394 1.817979e-01
## inc7        -0.45859443 0.5692809 -0.80556793   50.40505 4.242761e-01
## inc8        -1.07905432 0.5208763 -2.07161354   98.87026 4.090524e-02
## agec2        2.87672264 0.4815111  5.97436420  316.56209 6.211051e-09
## agec3        3.74977204 0.4450830  8.42488230  690.86706 2.220446e-16
## agec4        3.01579518 0.4320963  6.97945197 2085.22337 3.966605e-12
## agec5        0.05140060 0.5359170  0.09591149 2661.86671 9.235981e-01
## educ2       -1.30256077 0.8451104 -1.54129064   36.40563 1.318944e-01
## educ3       -0.93615786 0.4383870 -2.13546012  649.97019 3.309661e-02
## educ4        0.09212783 0.2624508  0.35102902  392.35361 7.257550e-01
## educ5       -1.01803343 0.2608126 -3.90331377 2894.07073 9.704868e-05
## race_eth2   -0.01392793 0.5354786 -0.02601024   42.28309 9.793715e-01
## race_eth3    0.39501903 0.3638634  1.08562444 1232.84787 2.778574e-01
## race_eth4   -0.17333842 0.4381532 -0.39561143 3397.25550 6.924165e-01
## race_eth5   -0.35607302 0.3910950 -0.91045147  602.57075 3.629485e-01
##                  lo 95       hi 95 nmis        fmi     lambda
## (Intercept) 25.1855051 27.55657040   NA 0.11600727 0.11021085
## inc2        -0.7974539  1.57785902   NA 0.14738563 0.13877415
## inc3        -1.2217128  1.76771935   NA 0.51177410 0.46195138
## inc4        -1.6134375  0.54911096   NA 0.17230775 0.16118696
## inc5        -1.8727085  0.29852407   NA 0.23299471 0.21503486
## inc6        -1.7019092  0.32490535   NA 0.15089883 0.14194697
## inc7        -1.6018011  0.68461227   NA 0.30646265 0.27947942
## inc8        -2.1126026 -0.04550603   NA 0.21406968 0.19833065
## agec2        1.9293563  3.82408902   NA 0.11341483 0.10783114
## agec3        2.8758944  4.62364966   NA 0.07213157 0.06944935
## agec4        2.1684102  3.86318017   NA 0.03238494 0.03145732
## agec5       -0.9994552  1.10225643   NA 0.02483647 0.02410406
## educ2       -3.0158613  0.41073974   NA 0.36347516 0.32944153
## educ3       -1.7969835 -0.07533225   NA 0.07484830 0.07200592
## educ4       -0.4238579  0.60811354   NA 0.10049004 0.09591650
## educ5       -1.5294306 -0.50663624   NA 0.02219006 0.02151456
## race_eth2   -1.0943533  1.06649742   NA 0.33613479 0.30545929
## race_eth3   -0.3188410  1.10887908   NA 0.04949621 0.04795550
## race_eth4   -1.0324090  0.68573216   NA 0.01679925 0.01622060
## race_eth5   -1.1241479  0.41200186   NA 0.07832378 0.07526970

We need to pay attention to the fmi and lambda columns: fmi is the fraction of missing information for each coefficient, and lambda is the proportion of that coefficient’s total variance attributable to the missing data. Both convey how much the missingness affects each model coefficient.
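
These pooled quantities come from Rubin’s rules: the pooled estimate is the mean of the five per-imputation estimates, and its total variance combines the average within-imputation variance with the between-imputation variance (fmi is essentially lambda with a small-sample adjustment). A sketch for a single coefficient (position 16 here is educ5, the college-graduate dummy):

#Rubin's rules by hand for one coefficient
qhat<-sapply(fit.bmi$analyses, function(fit) coef(fit)[16])     #the 5 point estimates
uhat<-sapply(fit.bmi$analyses, function(fit) vcov(fit)[16,16])  #their squared standard errors
m<-length(qhat)
qbar<-mean(qhat)               #pooled estimate
ubar<-mean(uhat)               #within-imputation variance
b<-var(qhat)                   #between-imputation variance
tot<-ubar+(1+1/m)*b            #total variance
c(est=qbar, se=sqrt(tot), lambda=(1+1/m)*b/tot)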

barplot(est.p$fmi)

barplot(est.p$lambda)

It appears that the third and seventh income categories, the second education category (0Prim, primary schooling) and the first race_eth category (hispanic) have the largest fractions of missing information. This suggests that there may be noticeable variation in those coefficients depending on which imputed data set we use.

Compare imputed model to original data

Here, I compare the coefficients from the model where we eliminated all missing data to the one that we fit on the imputed data:

summary(lm(bmi~inc+agec+educ+race_eth, data=brfss14))
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = brfss14)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.883  -4.183  -0.940   2.957  57.537 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          26.18592    0.66661  39.282  < 2e-16 ***
## inc10 to 15k          0.29581    0.63766   0.464 0.642751    
## inc15 to 20k          0.21908    0.59562   0.368 0.713033    
## inc20 to 25k         -0.33909    0.57228  -0.593 0.553536    
## inc25 to 35k         -0.77314    0.54841  -1.410 0.158686    
## inc35 to 50k         -0.58545    0.54149  -1.081 0.279690    
## inc50 to 75k         -0.35103    0.54558  -0.643 0.519996    
## inc75k+              -1.01161    0.52710  -1.919 0.055036 .  
## agec(24,39]           3.14463    0.53954   5.828 6.08e-09 ***
## agec(39,59]           3.92339    0.51111   7.676 2.09e-14 ***
## agec(59,79]           3.17112    0.50819   6.240 4.87e-10 ***
## agec(79,99]           0.07633    0.62690   0.122 0.903104    
## educ0Prim            -1.33281    0.88433  -1.507 0.131863    
## educ1somehs          -0.80800    0.49356  -1.637 0.101698    
## educ3somecol          0.24741    0.27806   0.890 0.373639    
## educ4colgrad         -1.07534    0.28372  -3.790 0.000153 ***
## race_ethhispanic      0.12929    0.52704   0.245 0.806233    
## race_ethnh black      0.41594    0.38947   1.068 0.285616    
## race_ethnh multirace -0.35715    0.48301  -0.739 0.459697    
## race_ethnh other     -0.36974    0.42531  -0.869 0.384718    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.302 on 3668 degrees of freedom
##   (781 observations deleted due to missingness)
## Multiple R-squared:  0.04216,    Adjusted R-squared:  0.0372 
## F-statistic: 8.498 on 19 and 3668 DF,  p-value: < 2.2e-16

So for instance, the coefficient for college graduates is about -1.02 in the pooled imputed model, but about -1.08 in the model where the data were limited to complete cases only.
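
To line the two sets of estimates up side by side, we can bind the pooled coefficients to the complete-case coefficients (a sketch; in this version of mice the pooled estimates are stored in the qbar component of the pooled object, and the coefficient order is the same in both models):

#pooled MI coefficients next to the complete-case coefficients
cc.fit<-lm(bmi~inc+agec+educ+race_eth, data=brfss14)
round(cbind(pooled=est.p$qbar, complete.case=coef(cc.fit)), 3)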

If we wanted to see the ranges of the betas in the five imputed data models, we could do that:

#get the coefficients from each of the 5 imputations of bmi
coefs<-matrix(unlist(lapply(fit.bmi$analyses, coef)), nrow=5, ncol=20, byrow=T)

#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:19, labels=names(fit.bmi$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:(dim(coefs)[1]-1)){  #note the parentheses: loop over the 4 remaining imputations
  points(coefs[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")

Customized imputations

Now, what if I don’t want to use all of the information in the data to impute the values? Let’s say I only want to use income and age to predict BMI, and not education or race. I can customize which variables are used in the imputation process by generating a predictor matrix:

#I have to provide a matrix indicating which variables are used to impute each of the others:
#rows are the variables being imputed, columns are the candidate predictors.
#Here I don't want the bmi imputations to depend on education or race,
#so only income and age are used to predict BMI.
predictorMatrix<-matrix(
c(
0,1,1,0,0,
1,0,1,1,1,
1,1,0,1,1,
0,1,1,0,1,
0,1,1,1,0), nrow=5, ncol=5, byrow=T)

colnames(predictorMatrix)<-rownames(predictorMatrix)<-c("bmi", "inc", "agec","educ","race_eth")
predictorMatrix
##          bmi inc agec educ race_eth
## bmi        0   1    1    0        0
## inc        1   0    1    1        1
## agec       1   1    0    1        1
## educ       0   1    1    0        1
## race_eth   0   1    1    1        0
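
An equivalent, and often less error-prone, way to build this is to start from the predictor matrix mice constructed for the earlier run (stored in imp$predictorMatrix) and zero out the entries you want to drop. A sketch that yields an equivalent matrix (the agec row differs, but it is ignored because agec has no missing values):

pred<-imp$predictorMatrix                #default: every variable predicts every other
pred["bmi", c("educ","race_eth")]<-0     #don't use education or race when imputing bmi
pred[c("educ","race_eth"), "bmi"]<-0     #...and don't use bmi when imputing them
pred
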
newimp<-mice(data = brfss14[,c("bmi","inc", "agec","educ","race_eth")], seed = 1234, predictorMatrix = predictorMatrix)
## 
##  iter imp variable
##   1   1  bmi  inc  educ  race_eth
##   1   2  bmi  inc  educ  race_eth
##   1   3  bmi  inc  educ  race_eth
##   1   4  bmi  inc  educ  race_eth
##   1   5  bmi  inc  educ  race_eth
##   2   1  bmi  inc  educ  race_eth
##   2   2  bmi  inc  educ  race_eth
##   2   3  bmi  inc  educ  race_eth
##   2   4  bmi  inc  educ  race_eth
##   2   5  bmi  inc  educ  race_eth
##   3   1  bmi  inc  educ  race_eth
##   3   2  bmi  inc  educ  race_eth
##   3   3  bmi  inc  educ  race_eth
##   3   4  bmi  inc  educ  race_eth
##   3   5  bmi  inc  educ  race_eth
##   4   1  bmi  inc  educ  race_eth
##   4   2  bmi  inc  educ  race_eth
##   4   3  bmi  inc  educ  race_eth
##   4   4  bmi  inc  educ  race_eth
##   4   5  bmi  inc  educ  race_eth
##   5   1  bmi  inc  educ  race_eth
##   5   2  bmi  inc  educ  race_eth
##   5   3  bmi  inc  educ  race_eth
##   5   4  bmi  inc  educ  race_eth
##   5   5  bmi  inc  educ  race_eth
print(newimp)
## Multiply imputed data set
## Call:
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     predictorMatrix = predictorMatrix, seed = 1234)
## Number of multiple imputations:  5
## Missing cells per column:
##      bmi      inc     agec     educ race_eth 
##      258      626        0       24       33 
## Imputation methods:
##       bmi       inc      agec      educ  race_eth 
##     "pmm" "polyreg"        "" "polyreg" "polyreg" 
## VisitSequence:
##      bmi      inc     educ race_eth 
##        1        2        4        5 
## PredictorMatrix:
##          bmi inc agec educ race_eth
## bmi        0   1    1    0        0
## inc        1   0    1    1        1
## agec       0   0    0    0        0
## educ       0   1    1    0        1
## race_eth   0   1    1    1        0
## Random generator seed value:  1234
fit.bmi2<-with(data=newimp ,exp=lm(bmi~inc+agec+educ+race_eth))
est.p2<-pool(fit.bmi2)
summary(est.p2)
##                     est        se           t         df     Pr(>|t|)
## (Intercept) 26.37103775 0.6024667 43.77177624  304.01316 0.000000e+00
## inc2         0.39020258 0.6022361  0.64792297  197.01825 5.177891e-01
## inc3         0.27300328 0.7130986  0.38284088   18.59852 7.061781e-01
## inc4        -0.53216328 0.5471652 -0.97258244  147.85483 3.323493e-01
## inc5        -0.78709221 0.5459563 -1.44167625   84.41333 1.530934e-01
## inc6        -0.68850194 0.5137380 -1.34018107  188.70394 1.817979e-01
## inc7        -0.45859443 0.5692809 -0.80556793   50.40505 4.242761e-01
## inc8        -1.07905432 0.5208763 -2.07161354   98.87026 4.090524e-02
## agec2        2.87672264 0.4815111  5.97436420  316.56209 6.211051e-09
## agec3        3.74977204 0.4450830  8.42488230  690.86706 2.220446e-16
## agec4        3.01579518 0.4320963  6.97945197 2085.22337 3.966605e-12
## agec5        0.05140060 0.5359170  0.09591149 2661.86671 9.235981e-01
## educ2       -1.30256077 0.8451104 -1.54129064   36.40563 1.318944e-01
## educ3       -0.93615786 0.4383870 -2.13546012  649.97019 3.309661e-02
## educ4        0.09212783 0.2624508  0.35102902  392.35361 7.257550e-01
## educ5       -1.01803343 0.2608126 -3.90331377 2894.07073 9.704868e-05
## race_eth2   -0.01392793 0.5354786 -0.02601024   42.28309 9.793715e-01
## race_eth3    0.39501903 0.3638634  1.08562444 1232.84787 2.778574e-01
## race_eth4   -0.17333842 0.4381532 -0.39561143 3397.25550 6.924165e-01
## race_eth5   -0.35607302 0.3910950 -0.91045147  602.57075 3.629485e-01
##                  lo 95       hi 95 nmis        fmi     lambda
## (Intercept) 25.1855051 27.55657040   NA 0.11600727 0.11021085
## inc2        -0.7974539  1.57785902   NA 0.14738563 0.13877415
## inc3        -1.2217128  1.76771935   NA 0.51177410 0.46195138
## inc4        -1.6134375  0.54911096   NA 0.17230775 0.16118696
## inc5        -1.8727085  0.29852407   NA 0.23299471 0.21503486
## inc6        -1.7019092  0.32490535   NA 0.15089883 0.14194697
## inc7        -1.6018011  0.68461227   NA 0.30646265 0.27947942
## inc8        -2.1126026 -0.04550603   NA 0.21406968 0.19833065
## agec2        1.9293563  3.82408902   NA 0.11341483 0.10783114
## agec3        2.8758944  4.62364966   NA 0.07213157 0.06944935
## agec4        2.1684102  3.86318017   NA 0.03238494 0.03145732
## agec5       -0.9994552  1.10225643   NA 0.02483647 0.02410406
## educ2       -3.0158613  0.41073974   NA 0.36347516 0.32944153
## educ3       -1.7969835 -0.07533225   NA 0.07484830 0.07200592
## educ4       -0.4238579  0.60811354   NA 0.10049004 0.09591650
## educ5       -1.5294306 -0.50663624   NA 0.02219006 0.02151456
## race_eth2   -1.0943533  1.06649742   NA 0.33613479 0.30545929
## race_eth3   -0.3188410  1.10887908   NA 0.04949621 0.04795550
## race_eth4   -1.0324090  0.68573216   NA 0.01679925 0.01622060
## race_eth5   -1.1241479  0.41200186   NA 0.07832378 0.07526970
#get the coefficients from each of the 5 imputations of bmi
coefs2<-matrix(unlist(lapply(fit.bmi2$analyses, coef)), nrow=5, ncol=20, byrow=T)

#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs2[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:19, labels=names(fit.bmi2$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:(dim(coefs2)[1]-1)){  #note the parentheses: loop over the 4 remaining imputations
  points(coefs2[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")