This example will illustrate typical aspects of dealing with missing data. Topics will include: Mean imputation, modal imputation for categorical data, and multiple imputation of complex patterns of missing data. For this example I am using 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link.

library(car)
library(survey)
## Loading required package: grid
## 
## Attaching package: 'survey'
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(mice) #Library for multiple imputation
## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10
library(MASS)

#load brfss
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)


#just for brevity, I just select TX respondents with non missing weights
brfss_11<-brfss_11[brfss_11$state==48&is.na(brfss_11$cntywt)==F,]

#insurance
brfss_11$ins<-recode(brfss_11$hlthpln1, recodes="1=1;2=0;else=NA")
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-recode(brfss_11$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)

#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)

#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")

#education level
brfss_11$educ<-recode(brfss_11$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
brfss_11$educ<-relevel(brfss_11$educ, ref='2hsgrad')

#employment
brfss_11$emp<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$emp<-relevel(brfss_11$emp, ref='Employed')

#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')

#income
brfss_11$inc<-recode(brfss_11$incomg, recodes="1='<15k'; 2='15 to 25k'; 3='25 to 35k'; 4='35 to 50k'; 5='50k+';9=NA", as.factor.result = T)

#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))

Now, we can get a general idea of the missingness of these variables by just using summary(brfss_11)

summary(brfss_11[, c("ins", "smoke", "lowact", "hbp", "hc", "bmi", "badhealth", "black", "doc", "educ", "emp", "marst", "inc")])
##       ins             smoke           lowact            hbp        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.8591   Mean   :0.131   Mean   :0.4905   Mean   :0.4228  
##  3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##  NA's   :28       NA's   :54      NA's   :676      NA's   :27      
##        hc              bmi          badhealth       black     
##  Min.   :0.0000   Min.   :12.55   Min.   :0.0000   0   :6769  
##  1st Qu.:0.0000   1st Qu.:23.44   1st Qu.:0.0000   1   : 681  
##  Median :0.0000   Median :26.57   Median :0.0000   NA's: 144  
##  Mean   :0.4769   Mean   :27.55   Mean   :0.1908              
##  3rd Qu.:1.0000   3rd Qu.:30.50   3rd Qu.:0.0000              
##  Max.   :1.0000   Max.   :82.28   Max.   :1.0000              
##  NA's   :1117     NA's   :467     NA's   :35                  
##       doc              educ            emp             marst     
##  Min.   :0.000   2hsgrad :1649   Employed:3580   married  :4250  
##  1st Qu.:1.000   0Prim   : 361   nilf    :1369   cohab    : 159  
##  Median :1.000   1somehs : 460   retired :2102   divorced :1041  
##  Mean   :0.834   3somecol:1909   unable  : 488   nm       : 845  
##  3rd Qu.:1.000   4colgrad:3189   NA's    :  55   separated: 180  
##  Max.   :1.000   NA's    :  26                   widowed  :1062  
##  NA's   :34                                      NA's     :  57  
##         inc      
##  <15k     : 753  
##  15 to 25k:1092  
##  25 to 35k: 650  
##  35 to 50k: 857  
##  50k+     :3181  
##  NA's     :1061  
## 

Which shows that, among these recoded variables, hc or the indicator for high cholesterol is missing the most with 1117 people in our TX sample missing, or 14.7089808% of the sample. The lowest number of missings is in the education variable, which only has 0.3423756% missing.

Mean imputation

Now, i’m going to illustrate mean imputation of a continuous variable, BMI.

#I'm going to play with 3 outcomes, bmi, having a regular doctor and income category
summary(brfss_11$bmi) #467 missing, 6%
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.55   23.44   26.57   27.55   30.51   82.28     467
#what happens when we replace the missings with the mean?
brfss_11$bmi.imp.mean<-ifelse(is.na(brfss_11$bmi)==T, mean(brfss_11$bmi, na.rm=T), brfss_11$bmi)

mean(brfss_11$bmi, na.rm=T)
## [1] 27.55206
mean(brfss_11$bmi.imp.mean) #no difference!
## [1] 27.55206
median(brfss_11$bmi, na.rm=T)
## [1] 26.57
median(brfss_11$bmi.imp.mean) #slight difference
## [1] 26.94
var(brfss_11$bmi, na.rm=T)
## [1] 36.38479
var(brfss_11$bmi.imp.mean) # more noticeable difference!
## [1] 34.14698

So what we see here, is that imputing with the mean does nothing to central tendency (when measured using the mean, but does affect the median), but it does reduce the variance in the outcome. This is because you’re replacing all missing cases with the most likely value (the mean), so you’re artificially deflating the variance. That’s not good.

We can see this in a histogram:

#plot the histogram
hist(brfss_11$bmi.imp.mean)
hist(brfss_11$bmi, add=T ,col=1) #you can see the extra values at the mean

Multiple Imputation

These days, these types of imputation have been far surpassed by more complete methods that are based upon regression methods. These methods are generally referred to as multiple imputation, because we are really interested in imputing multiple variables simultaneously. Instead of reviewing this perspective here, I suggest you have a look at Joe Schafer’s site that gives a nice treatment of the subject. Here, I will use the imputation techniques in the mice library in R, which you can read about here.

I have used these in practice in publications and generally like the framework the library uses. Another popular technique is in the Amelia library of Gary King, which I haven’t used much. If you are serious about doing multiple imputation it would be advised to investigate multiple methodologies.

To begin, I explore the various patterns of missingness in the data. The md.pattern function in mice does this nicely. Here, each row corresponds to a particular pattern of missingness (1 = observed, 0=missing)

#look at the patterns of missingness
md.pattern(brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")])
##      agec educ black white hispanic bmi  inc     
## 6136    1    1     1     1        1   1    1    0
##  288    1    1     1     1        1   0    1    1
##  855    1    1     1     1        1   1    0    1
##    8    1    0     1     1        1   1    1    1
##  152    1    1     1     1        1   0    0    2
##    3    1    0     1     1        1   0    1    2
##    6    1    0     1     1        1   1    0    2
##    2    1    0     1     1        1   0    0    3
##   85    1    1     0     0        0   1    1    3
##   10    1    1     0     0        0   0    1    4
##   33    1    1     0     0        0   1    0    4
##    2    1    0     0     0        0   1    1    4
##    9    1    1     0     0        0   0    0    5
##    1    1    0     0     0        0   0    1    5
##    2    1    0     0     0        0   1    0    5
##    2    1    0     0     0        0   0    0    6
##         0   26   144   144      144 467 1061 1986

Shows that 6,116 rows of the data are complete (first row). The second row shows that 288 people are missing only the bmi variable. I say only because 467 people are missing the bmi variable in total. Apparently some folks are missing in combination with other variables. Sure enough, if we look down, we see that 3 people are missing bmi AND educ, 152 are missing bmi AND inc, and so on. The bottom row tells how many total people are missing each variable, in ANY combination with other variables.

If you want to see how pairs of variables are missing together, the md.pairs() function will show this. A pair of variables can have exactly four missingness patterns: both variables are observed (pattern rr), the first variable is observed and the second variable is missing (pattern rm), the first variable is missing and the second variable is observed (pattern mr), and both are missing (pattern mm).

md.pairs(brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")])
## $rr
##           bmi  inc agec educ black white hispanic
## bmi      7127 6231 7127 7109  7005  7005     7005
## inc      6231 6533 6533 6519  6435  6435     6435
## agec     7127 6533 7594 7568  7450  7450     7450
## educ     7109 6519 7568 7568  7431  7431     7431
## black    7005 6435 7450 7431  7450  7450     7450
## white    7005 6435 7450 7431  7450  7450     7450
## hispanic 7005 6435 7450 7431  7450  7450     7450
## 
## $rm
##          bmi  inc agec educ black white hispanic
## bmi        0  896    0   18   122   122      122
## inc      302    0    0   14    98    98       98
## agec     467 1061    0   26   144   144      144
## educ     459 1049    0    0   137   137      137
## black    445 1015    0   19     0     0        0
## white    445 1015    0   19     0     0        0
## hispanic 445 1015    0   19     0     0        0
## 
## $mr
##          bmi inc agec educ black white hispanic
## bmi        0 302  467  459   445   445      445
## inc      896   0 1061 1049  1015  1015     1015
## agec       0   0    0    0     0     0        0
## educ      18  14   26    0    19    19       19
## black    122  98  144  137     0     0        0
## white    122  98  144  137     0     0        0
## hispanic 122  98  144  137     0     0        0
## 
## $mm
##          bmi  inc agec educ black white hispanic
## bmi      467  165    0    8    22    22       22
## inc      165 1061    0   12    46    46       46
## agec       0    0    0    0     0     0        0
## educ       8   12    0   26     7     7        7
## black     22   46    0    7   144   144      144
## white     22   46    0    7   144   144      144
## hispanic  22   46    0    7   144   144      144

Basic imputation:

We can perform a basic multiple imputation by simply doing: Note this may take a long time with big data sets

imp<-mice(data = brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")], seed = 22)
## 
##  iter imp variable
##   1   1  bmi  inc  educ  black  white  hispanic
##   1   2  bmi  inc  educ  black  white  hispanic
##   1   3  bmi  inc  educ  black  white  hispanic
##   1   4  bmi  inc  educ  black  white  hispanic
##   1   5  bmi  inc  educ  black  white  hispanic
##   2   1  bmi  inc  educ  black  white  hispanic
##   2   2  bmi  inc  educ  black  white  hispanic
##   2   3  bmi  inc  educ  black  white  hispanic
##   2   4  bmi  inc  educ  black  white  hispanic
##   2   5  bmi  inc  educ  black  white  hispanic
##   3   1  bmi  inc  educ  black  white  hispanic
##   3   2  bmi  inc  educ  black  white  hispanic
##   3   3  bmi  inc  educ  black  white  hispanic
##   3   4  bmi  inc  educ  black  white  hispanic
##   3   5  bmi  inc  educ  black  white  hispanic
##   4   1  bmi  inc  educ  black  white  hispanic
##   4   2  bmi  inc  educ  black  white  hispanic
##   4   3  bmi  inc  educ  black  white  hispanic
##   4   4  bmi  inc  educ  black  white  hispanic
##   4   5  bmi  inc  educ  black  white  hispanic
##   5   1  bmi  inc  educ  black  white  hispanic
##   5   2  bmi  inc  educ  black  white  hispanic
##   5   3  bmi  inc  educ  black  white  hispanic
##   5   4  bmi  inc  educ  black  white  hispanic
##   5   5  bmi  inc  educ  black  white  hispanic
print(imp)
## Multiply imputed data set
## Call:
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## Number of multiple imputations:  5
## Missing cells per column:
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## Imputation methods:
##       bmi       inc      agec      educ     black     white  hispanic 
##     "pmm" "polyreg"        "" "polyreg"  "logreg"  "logreg"  "logreg" 
## VisitSequence:
##      bmi      inc     educ    black    white hispanic 
##        1        2        4        5        6        7 
## PredictorMatrix:
##          bmi inc agec educ black white hispanic
## bmi        0   1    1    1     1     1        1
## inc        1   0    1    1     1     1        1
## agec       0   0    0    0     0     0        0
## educ       1   1    1    0     1     1        1
## black      1   1    1    1     0     1        1
## white      1   1    1    1     1     0        1
## hispanic   1   1    1    1     1     1        0
## Random generator seed value:  22

Shows how many imputations were done. It also shows total missingness, which imputation method was used for each variable (because you wouldn’t want to use a normal distribution for a categorical variable!!). It also shows the sequence of how each variable is visited (or imputed, the default is left to right).

We may want to make sure imputed values are plausible by having a look:

head(imp$imp$bmi)
##            1     2     3     4     5
## 112488 20.05 23.17 24.95 28.25 22.47
## 112505 33.47 41.57 23.87 40.60 29.97
## 112508 41.35 31.25 23.30 36.49 31.89
## 112528 26.57 31.79 24.07 37.12 35.28
## 112537 32.28 32.08 29.76 22.73 23.40
## 112591 34.33 20.54 27.32 29.85 38.32
head(imp$imp$inc)
##                1         2    3         4         5
## 112406 15 to 25k      50k+ 50k+ 15 to 25k 15 to 25k
## 112434      <15k      50k+ <15k      50k+      <15k
## 112436      50k+      50k+ 50k+      50k+      50k+
## 112445      50k+      50k+ 50k+      50k+      50k+
## 112450 15 to 25k 25 to 35k <15k 15 to 25k 15 to 25k
## 112470      50k+ 35 to 50k 50k+      50k+      50k+

We can also do some plotting. For instance if we want to see how the observed and imputed values of bmi look with respect to income, we can do:

stripplot(imp,bmi~inc|.imp, pch=20)

and we see the distribution of the original data (blue dots), the imputed data (red dots) across the levels of income, for each of the five different imputation runs(the number at the top shows which run, and the first plot is the original data).

This plot shows that the bmi values correspond well with the observed data, so they are probably plausible values.

If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.

dat.imp<-complete(imp)
head(dat.imp, n=10)
##          bmi       inc    agec     educ black white hispanic
## 112405 22.32      50k+ (59,79] 4colgrad     0     1        0
## 112406 22.15 15 to 25k (24,39] 3somecol     0     1        0
## 112407 21.81      50k+ (24,39] 4colgrad     0     1        0
## 112408 30.17      50k+ (24,39] 4colgrad     0     0        0
## 112409 23.03 25 to 35k  (0,24]  2hsgrad     0     0        1
## 112410 24.41      50k+ (39,59] 4colgrad     0     0        1
## 112411 25.09 35 to 50k (39,59]  2hsgrad     0     1        0
## 112412 20.18      50k+  (0,24] 3somecol     0     1        0
## 112413 23.75      50k+ (24,39] 4colgrad     0     1        0
## 112414 30.27      50k+ (24,39] 4colgrad     0     0        0
#Compare to the original data
head(brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")], n=10)
##          bmi       inc    agec     educ black white hispanic
## 112405 22.32      50k+ (59,79] 4colgrad     0     1        0
## 112406 22.15      <NA> (24,39] 3somecol     0     1        0
## 112407 21.81      50k+ (24,39] 4colgrad     0     1        0
## 112408 30.17      50k+ (24,39] 4colgrad     0     0        0
## 112409 23.03 25 to 35k  (0,24]  2hsgrad     0     0        1
## 112410 24.41      50k+ (39,59] 4colgrad     0     0        1
## 112411 25.09 35 to 50k (39,59]  2hsgrad     0     1        0
## 112412 20.18      50k+  (0,24] 3somecol     0     1        0
## 112413 23.75      50k+ (24,39] 4colgrad     0     1        0
## 112414 30.27      50k+ (24,39] 4colgrad     0     0        0

While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:

head(dat.imp[is.na(brfss_11$bmi)==T,], n=10)
##          bmi       inc    agec     educ black white hispanic
## 112488 20.05      50k+ (59,79] 4colgrad     0     0        0
## 112505 33.47 35 to 50k (24,39]  2hsgrad     0     1        0
## 112508 41.35 15 to 25k (39,59]  1somehs     0     0        1
## 112528 26.57 15 to 25k (59,79] 3somecol     1     0        0
## 112537 32.28      50k+ (39,59] 4colgrad     0     0        1
## 112591 34.33 15 to 25k (59,79]  2hsgrad     0     0        1
## 112605 24.69 35 to 50k (39,59]  2hsgrad     0     1        0
## 112606 20.05      50k+ (59,79] 4colgrad     0     1        0
## 112691 23.57 35 to 50k (59,79] 3somecol     0     1        0
## 112702 28.35      <15k (59,79] 4colgrad     0     1        0
head(brfss_11[is.na(brfss_11$bmi)==T,c("bmi", "inc", "agec","educ","black","white","hispanic")], n=10)
##        bmi       inc    agec     educ black white hispanic
## 112488  NA      <NA> (59,79] 4colgrad  <NA>  <NA>     <NA>
## 112505  NA 35 to 50k (24,39]  2hsgrad     0     1        0
## 112508  NA 15 to 25k (39,59]  1somehs     0     0        1
## 112528  NA      <NA> (59,79] 3somecol     1     0        0
## 112537  NA      50k+ (39,59] 4colgrad     0     0        1
## 112591  NA      <NA> (59,79]  2hsgrad     0     0        1
## 112605  NA 35 to 50k (39,59]  2hsgrad     0     1        0
## 112606  NA      50k+ (59,79] 4colgrad     0     1        0
## 112691  NA      <NA> (59,79] 3somecol     0     1        0
## 112702  NA      <15k (59,79] 4colgrad  <NA>  <NA>     <NA>

Analyzing the imputed data

A key element of using imputed data, is that the relationships we want to know about should be maintained after imputation, and presumably, the relationships within each imputed data set will be the same. So if we used each of the (5 in this case) imputed data sets in a model, then we should see similar results across the five different models.

Here I look at a linear model for bmi:

#Now, I will see the variability in the 5 different imputations for each outcom
fit.bmi<-with(data=imp ,exp=lm(bmi~inc+agec+educ+black+hispanic))
fit.bmi
## call :
## with.mids(data = imp, expr = lm(bmi ~ inc + agec + educ + black + 
##     hispanic))
## 
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##     25.2841      -0.4035      -0.7955      -0.8272      -1.1873  
##       agec2        agec3        agec4        agec5        educ2  
##      2.9697       3.6971       3.2278       0.4934       1.4142  
##       educ3        educ4        educ5       black2    hispanic2  
##      0.3235       0.1260      -0.6790       2.4349       0.9206  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    25.39351     -0.58945     -0.74322     -0.69994     -1.14988  
##       agec2        agec3        agec4        agec5        educ2  
##     2.96031      3.56140      3.10236      0.46484      0.73748  
##       educ3        educ4        educ5       black2    hispanic2  
##     0.07692      0.11767     -0.83615      2.49769      0.92526  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##     25.4947      -0.5781      -0.8876      -0.7314      -1.2941  
##       agec2        agec3        agec4        agec5        educ2  
##      2.8075       3.6386       3.1418       0.3749       1.1017  
##       educ3        educ4        educ5       black2    hispanic2  
##      0.1276       0.1176      -0.7927       2.3336       0.8731  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    25.46596     -0.61609     -0.78489     -0.52006     -1.06511  
##       agec2        agec3        agec4        agec5        educ2  
##     2.60768      3.46996      3.03328      0.40306      0.97205  
##       educ3        educ4        educ5       black2    hispanic2  
##     0.17251      0.06283     -0.78019      2.60211      0.82089  
## 
## 
## [[5]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##     25.6508      -0.7195      -0.7867      -1.0157      -1.3082  
##       agec2        agec3        agec4        agec5        educ2  
##      2.7382       3.4794       3.0711       0.3786       0.6830  
##       educ3        educ4        educ5       black2    hispanic2  
##      0.1529       0.1309      -0.8705       2.5121       0.7874
with (data=imp, exp=(sd(bmi)))
## call :
## with.mids(data = imp, expr = (sd(bmi)))
## 
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## 
## analyses :
## [[1]]
## [1] 6.106734
## 
## [[2]]
## [1] 6.020854
## 
## [[3]]
## [1] 6.087242
## 
## [[4]]
## [1] 6.036923
## 
## [[5]]
## [1] 6.01707
with (data=imp, exp=(sd(inc)))
## call :
## with.mids(data = imp, expr = (sd(inc)))
## 
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## 
## analyses :
## [[1]]
## [1] 1.491672
## 
## [[2]]
## [1] 1.499184
## 
## [[3]]
## [1] 1.495172
## 
## [[4]]
## [1] 1.49721
## 
## [[5]]
## [1] 1.492826
with (data=imp, exp=(sd(black)))
## call :
## with.mids(data = imp, expr = (sd(black)))
## 
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## 
## analyses :
## [[1]]
## [1] 0.2891104
## 
## [[2]]
## [1] 0.2879921
## 
## [[3]]
## [1] 0.2909599
## 
## [[4]]
## [1] 0.2885521
## 
## [[5]]
## [1] 0.2885521
with (data=imp, exp=(sd(hispanic)))
## call :
## with.mids(data = imp, expr = (sd(hispanic)))
## 
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## 
## analyses :
## [[1]]
## [1] 0.3836722
## 
## [[2]]
## [1] 0.3841118
## 
## [[3]]
## [1] 0.3828998
## 
## [[4]]
## [1] 0.3837822
## 
## [[5]]
## [1] 0.3836722
with (data=imp, exp=(sd(agec)))
## call :
## with.mids(data = imp, expr = (sd(agec)))
## 
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## 
## analyses :
## [[1]]
## [1] 0.9868412
## 
## [[2]]
## [1] 0.9868412
## 
## [[3]]
## [1] 0.9868412
## 
## [[4]]
## [1] 0.9868412
## 
## [[5]]
## [1] 0.9868412
with (data=imp, exp=(sd(educ)))
## call :
## with.mids(data = imp, expr = (sd(educ)))
## 
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## 
## analyses :
## [[1]]
## [1] 1.576475
## 
## [[2]]
## [1] 1.574776
## 
## [[3]]
## [1] 1.576993
## 
## [[4]]
## [1] 1.576846
## 
## [[5]]
## [1] 1.576354

Which we can compare to the model fit on the original data, with missings eliminated:

summary(lm(bmi~inc+agec+educ+black+hispanic, data=brfss_11))
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic, data = brfss_11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.500  -3.919  -0.919   2.912  51.609 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   25.7836     0.4634  55.642  < 2e-16 ***
## inc15 to 25k  -0.5571     0.2920  -1.908 0.056501 .  
## inc25 to 35k  -0.8082     0.3314  -2.439 0.014759 *  
## inc35 to 50k  -0.8099     0.3158  -2.564 0.010366 *  
## inc50k+       -1.2512     0.2767  -4.523 6.23e-06 ***
## agec(24,39]    2.5238     0.4349   5.803 6.83e-09 ***
## agec(39,59]    3.3917     0.4102   8.268  < 2e-16 ***
## agec(59,79]    2.8778     0.4113   6.996 2.91e-12 ***
## agec(79,99]    0.4257     0.4716   0.903 0.366724    
## educ0Prim      0.6516     0.4448   1.465 0.143006    
## educ1somehs    0.3543     0.3599   0.984 0.324937    
## educ3somecol   0.1935     0.2215   0.873 0.382438    
## educ4colgrad  -0.8018     0.2160  -3.713 0.000207 ***
## black1         2.6049     0.2656   9.810  < 2e-16 ***
## hispanic1      0.8436     0.2271   3.715 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.859 on 6121 degrees of freedom
##   (1458 observations deleted due to missingness)
## Multiple R-squared:  0.06081,    Adjusted R-squared:  0.05866 
## F-statistic: 28.31 on 14 and 6121 DF,  p-value: < 2.2e-16

And we see the same relationships, and the same significant effects.

Now we pool the separate models:

est.p<-pool(fit.bmi)
print(est.p)
## Call: pool(object = fit.bmi)
## 
## Pooled coefficients:
## (Intercept)        inc2        inc3        inc4        inc5       agec2 
##  25.4578144  -0.5813415  -0.7995774  -0.7588615  -1.2008980   2.8166979 
##       agec3       agec4       agec5       educ2       educ3       educ4 
##   3.5693006   3.1152752   0.4229708   0.9816962   0.1706885   0.1109902 
##       educ5      black2   hispanic2 
##  -0.7917089   2.4760851   0.8654392 
## 
## Fraction of information about the coefficients missing due to nonresponse: 
## (Intercept)        inc2        inc3        inc4        inc5       agec2 
##  0.13828603  0.20812702  0.03889433  0.37107244  0.18346234  0.19573670 
##       agec3       agec4       agec5       educ2       educ3       educ4 
##  0.10072339  0.05877721  0.02292614  0.49462411  0.09969444  0.02286819 
##       educ5      black2   hispanic2 
##  0.15049545  0.18135604  0.10155164
summary(est.p)
##                    est        se          t         df     Pr(>|t|)
## (Intercept) 25.4578144 0.4092780 62.2017714  226.12589 0.000000e+00
## inc2        -0.5813415 0.2841284 -2.0460519  105.28848 4.324344e-02
## inc3        -0.7995774 0.2991165 -2.6731309 2011.94039 7.575412e-03
## inc4        -0.7588615 0.3430104 -2.2123571   35.12953 3.353602e-02
## inc5        -1.2008980 0.2693144 -4.4590939  133.40132 1.729905e-05
## agec2        2.8166979 0.3933157  7.1614170  118.14848 7.360157e-11
## agec3        3.5693006 0.3492304 10.2204753  405.75432 0.000000e+00
## agec4        3.1152752 0.3419729  9.1097127 1050.59986 0.000000e+00
## agec5        0.4229708 0.3905056  1.0831363 3836.78796 2.788160e-01
## educ2        0.9816962 0.4852687  2.0229952   19.97432 5.666501e-02
## educ3        0.1706885 0.3286549  0.5193547  413.43180 6.037913e-01
## educ4        0.1109902 0.2015461  0.5506938 3846.27487 5.818756e-01
## educ5       -0.7917089 0.2108900 -3.7541320  193.17256 2.300566e-04
## black2       2.4760851 0.2649433  9.3457152  136.31843 2.220446e-16
## hispanic2    0.8654392 0.2135866  4.0519356  399.72907 6.104608e-05
##                   lo 95       hi 95 nmis        fmi     lambda
## (Intercept) 24.65132798 26.26430091   NA 0.13828603 0.13069804
## inc2        -1.14469778 -0.01798532   NA 0.20812702 0.19322657
## inc3        -1.38618779 -0.21296705   NA 0.03889433 0.03793941
## inc4        -1.45511781 -0.06260512   NA 0.37107244 0.33625729
## inc5        -1.73357662 -0.66821934   NA 0.18346234 0.17131160
## agec2        2.03783586  3.59556001   NA 0.19573670 0.18223651
## agec3        2.88277385  4.25582745   NA 0.10072339 0.09630167
## agec4        2.44424751  3.78630290   NA 0.05877721 0.05698713
## agec5       -0.34264767  1.18858924   NA 0.02292614 0.02241695
## educ2       -0.03063993  1.99403236   NA 0.49462411 0.44643414
## educ3       -0.47535456  0.81673149   NA 0.09969444 0.09534967
## educ4       -0.28415727  0.50613763   NA 0.02286819 0.02236023
## educ5       -1.20765151 -0.37576620   NA 0.15049545 0.14174545
## black2       1.95215449  3.00001566   NA 0.18135604 0.16943275
## hispanic2    0.44554577  1.28533260   NA 0.10155164 0.09706757
#compare to original data
summary(lm(bmi~inc+agec+educ+black+hispanic, data=brfss_11))
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic, data = brfss_11)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.500  -3.919  -0.919   2.912  51.609 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   25.7836     0.4634  55.642  < 2e-16 ***
## inc15 to 25k  -0.5571     0.2920  -1.908 0.056501 .  
## inc25 to 35k  -0.8082     0.3314  -2.439 0.014759 *  
## inc35 to 50k  -0.8099     0.3158  -2.564 0.010366 *  
## inc50k+       -1.2512     0.2767  -4.523 6.23e-06 ***
## agec(24,39]    2.5238     0.4349   5.803 6.83e-09 ***
## agec(39,59]    3.3917     0.4102   8.268  < 2e-16 ***
## agec(59,79]    2.8778     0.4113   6.996 2.91e-12 ***
## agec(79,99]    0.4257     0.4716   0.903 0.366724    
## educ0Prim      0.6516     0.4448   1.465 0.143006    
## educ1somehs    0.3543     0.3599   0.984 0.324937    
## educ3somecol   0.1935     0.2215   0.873 0.382438    
## educ4colgrad  -0.8018     0.2160  -3.713 0.000207 ***
## black1         2.6049     0.2656   9.810  < 2e-16 ***
## hispanic1      0.8436     0.2271   3.715 0.000205 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.859 on 6121 degrees of freedom
##   (1458 observations deleted due to missingness)
## Multiple R-squared:  0.06081,    Adjusted R-squared:  0.05866 
## F-statistic: 28.31 on 14 and 6121 DF,  p-value: < 2.2e-16

If we wanted to see the ranges of the betas in the five imputed data models, we could do that:

#get the coefficients from each of the 5 imputations of bmi
coefs<-matrix(unlist(lapply(fit.bmi$analyses, coef)), nrow=5, ncol=15, byrow=T)

#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:14, labels=names(fit.bmi$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:dim(coefs)[1]-1){
  points(coefs[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")

Customized imputations

Now, what if I don’t want to use all information within the data to impute the values, let’s say I only want to use age to predict BMI, and not race. I can customize which variables are used in the imputation process by generating a predictor matrix:

#I have to provide a matrix of which variables will be used for imputing the other variables,
#I basically don't want the bmi, doc or inc variables to be influenced by the key predictors I want to use
#to analyze them later. So I only use income and age to predict BMI
predictorMatrix<-matrix(
c(
0,1,1,0,0,0,0,
0,0,1,1,0,0,0,
1,1,0,0,0,0,0,
0,1,1,0,1,1,1,
0,1,1,1,0,1,1,
0,1,1,1,1,0,1,
0,1,1,1,1,1,0), nrow=7, ncol=7, byrow=T)

colnames(predictorMatrix)<-rownames(predictorMatrix)<-c("bmi", "inc", "agec","educ","black","white","hispanic")
predictorMatrix
##          bmi inc agec educ black white hispanic
## bmi        0   1    1    0     0     0        0
## inc        0   0    1    1     0     0        0
## agec       1   1    0    0     0     0        0
## educ       0   1    1    0     1     1        1
## black      0   1    1    1     0     1        1
## white      0   1    1    1     1     0        1
## hispanic   0   1    1    1     1     1        0
newimp<-mice(data = brfss_11[,c("bmi","inc", "agec","educ","black","white","hispanic")], seed = 22, predictorMatrix = predictorMatrix)
## 
##  iter imp variable
##   1   1  bmi  inc  educ  black  white  hispanic
##   1   2  bmi  inc  educ  black  white  hispanic
##   1   3  bmi  inc  educ  black  white  hispanic
##   1   4  bmi  inc  educ  black  white  hispanic
##   1   5  bmi  inc  educ  black  white  hispanic
##   2   1  bmi  inc  educ  black  white  hispanic
##   2   2  bmi  inc  educ  black  white  hispanic
##   2   3  bmi  inc  educ  black  white  hispanic
##   2   4  bmi  inc  educ  black  white  hispanic
##   2   5  bmi  inc  educ  black  white  hispanic
##   3   1  bmi  inc  educ  black  white  hispanic
##   3   2  bmi  inc  educ  black  white  hispanic
##   3   3  bmi  inc  educ  black  white  hispanic
##   3   4  bmi  inc  educ  black  white  hispanic
##   3   5  bmi  inc  educ  black  white  hispanic
##   4   1  bmi  inc  educ  black  white  hispanic
##   4   2  bmi  inc  educ  black  white  hispanic
##   4   3  bmi  inc  educ  black  white  hispanic
##   4   4  bmi  inc  educ  black  white  hispanic
##   4   5  bmi  inc  educ  black  white  hispanic
##   5   1  bmi  inc  educ  black  white  hispanic
##   5   2  bmi  inc  educ  black  white  hispanic
##   5   3  bmi  inc  educ  black  white  hispanic
##   5   4  bmi  inc  educ  black  white  hispanic
##   5   5  bmi  inc  educ  black  white  hispanic
print(newimp)
## Multiply imputed data set
## Call:
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black", 
##     "white", "hispanic")], predictorMatrix = predictorMatrix, 
##     seed = 22)
## Number of multiple imputations:  5
## Missing cells per column:
##      bmi      inc     agec     educ    black    white hispanic 
##      467     1061        0       26      144      144      144 
## Imputation methods:
##       bmi       inc      agec      educ     black     white  hispanic 
##     "pmm" "polyreg"        "" "polyreg"  "logreg"  "logreg"  "logreg" 
## VisitSequence:
##      bmi      inc     educ    black    white hispanic 
##        1        2        4        5        6        7 
## PredictorMatrix:
##          bmi inc agec educ black white hispanic
## bmi        0   1    1    0     0     0        0
## inc        0   0    1    1     0     0        0
## agec       0   0    0    0     0     0        0
## educ       0   1    1    0     1     1        1
## black      0   1    1    1     0     1        1
## white      0   1    1    1     1     0        1
## hispanic   0   1    1    1     1     1        0
## Random generator seed value:  22
fit.bmi2<-with(data=newimp ,exp=lm(bmi~inc+agec+educ+black+hispanic))
est.p2<-pool(fit.bmi)
summary(est.p2)
##                    est        se          t         df     Pr(>|t|)
## (Intercept) 25.4578144 0.4092780 62.2017714  226.12589 0.000000e+00
## inc2        -0.5813415 0.2841284 -2.0460519  105.28848 4.324344e-02
## inc3        -0.7995774 0.2991165 -2.6731309 2011.94039 7.575412e-03
## inc4        -0.7588615 0.3430104 -2.2123571   35.12953 3.353602e-02
## inc5        -1.2008980 0.2693144 -4.4590939  133.40132 1.729905e-05
## agec2        2.8166979 0.3933157  7.1614170  118.14848 7.360157e-11
## agec3        3.5693006 0.3492304 10.2204753  405.75432 0.000000e+00
## agec4        3.1152752 0.3419729  9.1097127 1050.59986 0.000000e+00
## agec5        0.4229708 0.3905056  1.0831363 3836.78796 2.788160e-01
## educ2        0.9816962 0.4852687  2.0229952   19.97432 5.666501e-02
## educ3        0.1706885 0.3286549  0.5193547  413.43180 6.037913e-01
## educ4        0.1109902 0.2015461  0.5506938 3846.27487 5.818756e-01
## educ5       -0.7917089 0.2108900 -3.7541320  193.17256 2.300566e-04
## black2       2.4760851 0.2649433  9.3457152  136.31843 2.220446e-16
## hispanic2    0.8654392 0.2135866  4.0519356  399.72907 6.104608e-05
##                   lo 95       hi 95 nmis        fmi     lambda
## (Intercept) 24.65132798 26.26430091   NA 0.13828603 0.13069804
## inc2        -1.14469778 -0.01798532   NA 0.20812702 0.19322657
## inc3        -1.38618779 -0.21296705   NA 0.03889433 0.03793941
## inc4        -1.45511781 -0.06260512   NA 0.37107244 0.33625729
## inc5        -1.73357662 -0.66821934   NA 0.18346234 0.17131160
## agec2        2.03783586  3.59556001   NA 0.19573670 0.18223651
## agec3        2.88277385  4.25582745   NA 0.10072339 0.09630167
## agec4        2.44424751  3.78630290   NA 0.05877721 0.05698713
## agec5       -0.34264767  1.18858924   NA 0.02292614 0.02241695
## educ2       -0.03063993  1.99403236   NA 0.49462411 0.44643414
## educ3       -0.47535456  0.81673149   NA 0.09969444 0.09534967
## educ4       -0.28415727  0.50613763   NA 0.02286819 0.02236023
## educ5       -1.20765151 -0.37576620   NA 0.15049545 0.14174545
## black2       1.95215449  3.00001566   NA 0.18135604 0.16943275
## hispanic2    0.44554577  1.28533260   NA 0.10155164 0.09706757
#get the coefficients from each of the 5 imputations of bmi
coefs2<-matrix(unlist(lapply(fit.bmi2$analyses, coef)), nrow=5, ncol=15, byrow=T)

#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs2[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:14, labels=names(fit.bmi2$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:dim(coefs2)[1]-1){
  points(coefs2[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")