This example illustrates typical aspects of dealing with missing data. Topics include mean imputation, modal imputation for categorical data, and multiple imputation of complex patterns of missing data.

For this example I am using 2016 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data.

library(car)
library(mice)
## Loading required package: lattice
library(ggplot2)
load(file = "~/Google Drive/classes/dem7283/class18/data/brfss16_mmsa.Rdata")
nams<-names(brfss16m)
head(nams, n=10)
##  [1] "DISPCODE" "HHADULT"  "GENHLTH"  "PHYSHLTH" "MENTHLTH" "POORHLTH"
##  [7] "HLTHPLN1" "PERSDOC2" "MEDCOST"  "CHECKUP1"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
set.seed(1234)
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(brfss16m)<-newnames

samp<-sample(1:dim(brfss16m)[1], size = 50000) #smaller sample for brevity
brfss16m<-brfss16m[samp,]
#Healthy days
brfss16m$healthdays<-recode(brfss16m$physhlth, recodes = "88=0; 77=NA; 99=NA")

#Healthy mental health days
brfss16m$healthmdays<-recode(brfss16m$menthlth, recodes = "88=0; 77=NA; 99=NA")

brfss16m$badhealth<-recode(brfss16m$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss16m$black<-recode(brfss16m$racegr3, recodes="2=1; 9=NA; else=0")
brfss16m$white<-recode(brfss16m$racegr3, recodes="1=1; 9=NA; else=0")
brfss16m$other<-recode(brfss16m$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss16m$hispanic<-recode(brfss16m$racegr3, recodes="5=1; 9=NA; else=0")

brfss16m$race_eth<-recode(brfss16m$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor.result = T)
brfss16m$race_eth<-relevel(brfss16m$race_eth, ref = "nhwhite")

#insurance
brfss16m$ins<-recode(brfss16m$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss16m$inc<-recode(brfss16m$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor.result = T)
brfss16m$inc<-as.ordered(brfss16m$inc)
#education level
brfss16m$educ<-recode(brfss16m$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor.result=T)
brfss16m$educ<-relevel(brfss16m$educ, ref='2hsgrad')

#employment
brfss16m$employ<-recode(brfss16m$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor.result=T)
brfss16m$employ<-relevel(brfss16m$employ, ref='employloyed')

#marital status
brfss16m$marst<-recode(brfss16m$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor.result=T)
brfss16m$marst<-relevel(brfss16m$marst, ref='married')

#Age cut into intervals
brfss16m$agec<-cut(brfss16m$age80, breaks=c(0,24,39,59,79,99))

#BMI: in the brfss16m data the bmi5 variable has 2 implied decimal places,
#so we must divide by 100 to get real BMI values

brfss16m$bmi<-brfss16m$bmi5/100

#smoking currently
brfss16m$smoke<-recode(brfss16m$smoker3, 
recodes="1:2=1; 3:4=0; else=NA")
#brfss16m$smoke<-relevel(brfss16m$smoke, ref = "NeverSmoked")

Now, we can get a general idea of the missingness in these recoded variables by using summary():

summary(brfss16m[, c("ins", "smoke",  "bmi", "badhealth", "race_eth",  "educ", "employ", "marst", "inc")])
##       ins             smoke             bmi          badhealth     
##  Min.   :0.0000   Min.   :0.0000   Min.   :12.80   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:23.71   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :26.72   Median :0.0000  
##  Mean   :0.9307   Mean   :0.1351   Mean   :27.84   Mean   :0.1732  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:30.85   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :81.53   Max.   :1.0000  
##  NA's   :220      NA's   :2129     NA's   :4275    NA's   :127     
##          race_eth           educ               employ     
##  nhwhite     :36563   2hsgrad :12526   employloyed:25391  
##  hispanic    : 4774   0Prim   : 1176   nilf       : 6333  
##  nh black    : 4985   1somehs : 2244   retired    :14575  
##  nh multirace:  723   3somecol:13442   unable     : 3263  
##  nh other    : 1954   4colgrad:20409   NA's       :  438  
##  NA's        : 1001   NA's    :  203                      
##                                                           
##        marst              inc       
##  married  :25698   1_lt15k  : 3685  
##  cohab    : 1678   2_15-25k : 6433  
##  divorced : 6561   3_25-35k : 4107  
##  nm       : 8790   4_35-50k : 5635  
##  separated:  995   5_50kplus:21648  
##  widowed  : 5875   NA's     : 8492  
##  NA's     :  403

This shows that, among these recoded variables, inc, the income variable, has the most missing data: 8492 people in the BRFSS sample, or 16.984%, are missing it.

The lowest number of missings is in the bad health variable, which only has 0.254% missing.
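
If you want the exact share of missing values in each recoded variable, a quick way (a small sketch using the same variables as above) is to take column means of the missingness indicators:

#proportion of missing values in each recoded variable
round(colMeans(is.na(brfss16m[, c("ins", "smoke", "bmi", "badhealth",
                                  "race_eth", "educ", "employ", "marst", "inc")])), 4)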

Mean imputation

Now, I'm going to illustrate mean imputation of a continuous variable, BMI.

#I'm going to focus on two outcomes: bmi and the income category
summary(brfss16m$bmi) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.80   23.71   26.72   27.84   30.85   81.53    4275
#what happens when we replace the missings with the mean?
brfss16m$bmi.imp.mean<-ifelse(is.na(brfss16m$bmi)==T, mean(brfss16m$bmi, na.rm=T), brfss16m$bmi)

mean(brfss16m$bmi, na.rm=T)
## [1] 27.83772
mean(brfss16m$bmi.imp.mean) #no difference!
## [1] 27.83772
median(brfss16m$bmi, na.rm=T)
## [1] 26.72
median(brfss16m$bmi.imp.mean) #slight difference
## [1] 27.44
var(brfss16m$bmi, na.rm=T)
## [1] 36.8107
var(brfss16m$bmi.imp.mean) # more noticeable difference!
## [1] 33.66332

So what we see here is that imputing with the mean does not change the mean of the outcome (though it does shift the median slightly), but it does reduce the variance. This is because you're replacing every missing case with the single most likely value (the mean), so you're artificially deflating the variance. That's not good.

We can see this in a histogram:

#plot the histogram
hist(brfss16m$bmi.imp.mean)
hist(brfss16m$bmi, add=T, col=1) #overlay the observed bmi values in black

Here you can see the extra imputed values piled at the mean: the white area rising above the black bars over the mode is where the imputed histogram exceeds the observed one.
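
The introduction also mentioned modal imputation for categorical data, which works the same way: replace missing categories with the most common (modal) category, with the same caveat that this piles cases onto a single value. A quick sketch, using the income variable as an example:

#modal imputation sketch: fill missing income categories with the modal category
mode.inc<-names(which.max(table(brfss16m$inc)))
brfss16m$inc.imp.mode<-ifelse(is.na(brfss16m$inc), mode.inc, as.character(brfss16m$inc))
table(brfss16m$inc.imp.mode)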

Multiple Imputation

These days, these simple imputations have been far surpassed by more principled, regression-based methods. These methods are generally referred to as multiple imputation because, instead of filling in a single value, they generate multiple completed data sets, which allows the uncertainty introduced by the imputation to carry through to our analyses. Instead of reviewing this perspective here, I suggest you have a look at Joe Schafer's site, which gives a nice treatment of the subject. Here, I will use the imputation techniques in the mice library in R.

I have used these methods in practice in publications and generally like the framework the library uses. Another popular option is the Amelia library of Gary King, which I haven't used much. If you are serious about doing multiple imputation, it would be wise to investigate multiple methodologies.

To begin, I explore the various patterns of missingness in the data. The md.pattern function in mice does this nicely. Each row corresponds to a particular pattern of missingness (1 = observed, 0 = missing):

#look at the patterns of missingness
md.pattern(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")])
##       agec educ race_eth  bmi  inc      
## 38652    1    1        1    1    1     0
##  2202    1    1        1    0    1     1
##  6226    1    1        1    1    0     1
##    45    1    0        1    1    1     1
##   533    1    1        0    1    1     1
##  1773    1    1        1    0    0     2
##    10    1    0        1    0    1     2
##    44    1    0        1    1    0     2
##    62    1    1        0    0    1     2
##   209    1    1        0    1    0     2
##    47    1    0        1    0    0     3
##   140    1    1        0    0    0     3
##     4    1    0        0    0    1     3
##    16    1    0        0    1    0     3
##    37    1    0        0    0    0     4
##          0  203     1001 4275 8492 13971

The first row shows that 38652 rows of the data are complete.

The second row shows that 2202 people are missing only the bmi variable. I say only because 4275 people are missing the bmi variable in total; the rest are missing bmi in combination with other variables.

Sure enough, if we look down the table, we see that 1773 people are missing bmi AND income, 47 are missing bmi, income, AND education, and so on.

The bottom row tells how many total people are missing each variable, in ANY combination with other variables.

If you want to see how pairs of variables are missing together, the md.pairs() function will show this.

A pair of variables can have exactly four missingness patterns: Both variables are observed (pattern rr), the first variable is observed and the second variable is missing (pattern rm), the first variable is missing and the second variable is observed (pattern mr), and both are missing (pattern mm).

md.pairs(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")])
## $rr
##            bmi   inc  agec  educ race_eth
## bmi      45725 39230 45725 45620    44967
## inc      39230 41508 41508 41449    40909
## agec     45725 41508 50000 49797    48999
## educ     45620 41449 49797 49797    48853
## race_eth 44967 40909 48999 48853    48999
## 
## $rm
##           bmi  inc agec educ race_eth
## bmi         0 6495    0  105      758
## inc      2278    0    0   59      599
## agec     4275 8492    0  203     1001
## educ     4177 8348    0    0      944
## race_eth 4032 8090    0  146        0
## 
## $mr
##           bmi  inc agec educ race_eth
## bmi         0 2278 4275 4177     4032
## inc      6495    0 8492 8348     8090
## agec        0    0    0    0        0
## educ      105   59  203    0      146
## race_eth  758  599 1001  944        0
## 
## $mm
##           bmi  inc agec educ race_eth
## bmi      4275 1997    0   98      243
## inc      1997 8492    0  144      402
## agec        0    0    0    0        0
## educ       98  144    0  203       57
## race_eth  243  402    0   57     1001

Basic imputation

We can perform a basic multiple imputation by simply calling mice() on our variables of interest. Note that this may take a very long time with big data sets.

imp<-mice(data = brfss16m[,c("bmi", "inc", "agec","educ","race_eth")], seed = 22)
## 
##  iter imp variable
##   1   1  bmi  inc  educ  race_eth
##   1   2  bmi  inc  educ  race_eth
##   1   3  bmi  inc  educ  race_eth
##   1   4  bmi  inc  educ  race_eth
##   1   5  bmi  inc  educ  race_eth
##   2   1  bmi  inc  educ  race_eth
##   2   2  bmi  inc  educ  race_eth
##   2   3  bmi  inc  educ  race_eth
##   2   4  bmi  inc  educ  race_eth
##   2   5  bmi  inc  educ  race_eth
##   3   1  bmi  inc  educ  race_eth
##   3   2  bmi  inc  educ  race_eth
##   3   3  bmi  inc  educ  race_eth
##   3   4  bmi  inc  educ  race_eth
##   3   5  bmi  inc  educ  race_eth
##   4   1  bmi  inc  educ  race_eth
##   4   2  bmi  inc  educ  race_eth
##   4   3  bmi  inc  educ  race_eth
##   4   4  bmi  inc  educ  race_eth
##   4   5  bmi  inc  educ  race_eth
##   5   1  bmi  inc  educ  race_eth
##   5   2  bmi  inc  educ  race_eth
##   5   3  bmi  inc  educ  race_eth
##   5   4  bmi  inc  educ  race_eth
##   5   5  bmi  inc  educ  race_eth
print(imp)
## Multiply imputed data set
## Call:
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## Number of multiple imputations:  5
## Missing cells per column:
##      bmi      inc     agec     educ race_eth 
##     4275     8492        0      203     1001 
## Imputation methods:
##       bmi       inc      agec      educ  race_eth 
##     "pmm"    "polr"        "" "polyreg" "polyreg" 
## VisitSequence:
##      bmi      inc     educ race_eth 
##        1        2        4        5 
## PredictorMatrix:
##          bmi inc agec educ race_eth
## bmi        0   1    1    1        1
## inc        1   0    1    1        1
## agec       0   0    0    0        0
## educ       1   1    1    0        1
## race_eth   1   1    1    1        0
## Random generator seed value:  22

This shows how many imputations were done (the default is 5). It also shows the total missingness in each column and which imputation method was used for each variable (because you wouldn't want to use a normal distribution for a categorical variable!).

It also shows the sequence in which the variables are visited (imputed); the default is left to right.
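
If we want more control over this setup, a common pattern (sketched here; the final call is left commented out) is to do a "dry run" with maxit = 0, which returns the default method vector and predictor matrix without imputing anything, then edit those and pass them back to mice():

#sketch: customize the imputation methods and predictor matrix
ini<-mice(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")], maxit = 0)
meth<-ini$method
meth["bmi"]<-"norm"          #e.g. Bayesian linear regression instead of pmm
pred<-ini$predictorMatrix
pred["bmi", "race_eth"]<-0   #e.g. drop race_eth as a predictor of bmi
#imp2<-mice(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")],
#           method = meth, predictorMatrix = pred, m = 5, seed = 22)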

We may want to make sure the imputed values are plausible by having a look at them. For instance, are the imputed BMI values outside the range of the observed data?

head(imp$imp$bmi)
##        1     2     3     4     5
## 11 23.33 35.62 45.35 27.44 33.84
## 19 19.37 23.26 24.21 17.33 32.12
## 23 45.18 32.22 29.09 19.13 36.80
## 52 31.87 20.10 51.37 36.81 45.17
## 55 25.77 24.96 20.09 28.99 22.50
## 62 24.39 29.05 26.07 22.31 22.14
summary(imp$imp$bmi)
##        1               2               3               4        
##  Min.   :13.19   Min.   :13.66   Min.   :13.73   Min.   :13.19  
##  1st Qu.:23.80   1st Qu.:23.63   1st Qu.:23.17   1st Qu.:24.21  
##  Median :27.12   Median :27.12   Median :27.07   Median :26.92  
##  Mean   :27.95   Mean   :28.21   Mean   :27.89   Mean   :27.85  
##  3rd Qu.:30.99   3rd Qu.:31.62   3rd Qu.:31.18   3rd Qu.:30.54  
##  Max.   :65.47   Max.   :60.55   Max.   :71.06   Max.   :81.53  
##        5        
##  Min.   :13.19  
##  1st Qu.:24.03  
##  Median :27.41  
##  Mean   :28.62  
##  3rd Qu.:31.74  
##  Max.   :81.53
summary(brfss16m$bmi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.80   23.71   26.72   27.84   30.85   81.53    4275
head(imp$imp$inc)
##            1         2         3         4         5
## 2  5_50kplus 5_50kplus  4_35-50k  4_35-50k   1_lt15k
## 4   4_35-50k  2_15-25k 5_50kplus  4_35-50k  2_15-25k
## 6  5_50kplus  3_25-35k 5_50kplus  3_25-35k 5_50kplus
## 11 5_50kplus 5_50kplus 5_50kplus 5_50kplus 5_50kplus
## 16  2_15-25k 5_50kplus 5_50kplus 5_50kplus  4_35-50k
## 19 5_50kplus 5_50kplus 5_50kplus 5_50kplus 5_50kplus
summary(imp$imp$inc)
##          1                2                3                4       
##  1_lt15k  : 948   1_lt15k  : 963   1_lt15k  : 898   1_lt15k  : 932  
##  2_15-25k :1585   2_15-25k :1566   2_15-25k :1647   2_15-25k :1614  
##  3_25-35k : 942   3_25-35k : 920   3_25-35k : 941   3_25-35k : 972  
##  4_35-50k :1234   4_35-50k :1245   4_35-50k :1175   4_35-50k :1205  
##  5_50kplus:3783   5_50kplus:3798   5_50kplus:3831   5_50kplus:3769  
##          5       
##  1_lt15k  : 909  
##  2_15-25k :1561  
##  3_25-35k :1016  
##  4_35-50k :1199  
##  5_50kplus:3807

This shows the imputed values for the first 6 cases across the 5 different imputations, as well as a numeric summary of the imputed values. We can see that there is variation across the imputations: the imputed values are not the same from one run to the next.

We can also do some plotting. For instance, if we want to see how the observed and imputed values of bmi look with respect to race/ethnicity, we can do:

library(lattice)
stripplot(imp,bmi~race_eth|.imp, pch=20)

and we see the distribution of the original data (blue dots) and the imputed data (red dots) across the levels of race/ethnicity, for each of the five different imputation runs (the number at the top shows which run, and the first panel is the original data).

This plot shows that the imputed bmi values correspond well with the observed data, so they are probably plausible values.
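
Another quick check (just an alternative view of the same thing) is to compare the densities of the observed and imputed bmi values:

#density of observed (blue) versus imputed (red) bmi values
densityplot(imp, ~bmi)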

If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.

dat.imp<-complete(imp, action = 5)
head(dat.imp, n=10)
##      bmi       inc    agec     educ race_eth
## 1  33.45 5_50kplus (39,59] 4colgrad  nhwhite
## 2  40.34   1_lt15k (24,39]  2hsgrad  nhwhite
## 3  30.41   1_lt15k (39,59] 3somecol hispanic
## 4  22.24  2_15-25k (24,39]  2hsgrad  nhwhite
## 5  20.53 5_50kplus (39,59] 4colgrad  nhwhite
## 6  37.59 5_50kplus (24,39] 3somecol  nhwhite
## 7  26.78 5_50kplus (39,59] 3somecol  nhwhite
## 8  20.66   1_lt15k (24,39] 3somecol hispanic
## 9  30.62 5_50kplus (39,59] 3somecol  nhwhite
## 10 25.83 5_50kplus (39,59] 3somecol  nhwhite
#Compare to the original data
head(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")], n=10)
## # A tibble: 10 x 5
##      bmi inc       agec    educ     race_eth
##    <dbl> <ord>     <fct>   <fct>    <fct>   
##  1  33.4 5_50kplus (39,59] 4colgrad nhwhite 
##  2  40.3 <NA>      (24,39] 2hsgrad  nhwhite 
##  3  30.4 1_lt15k   (39,59] 3somecol hispanic
##  4  22.2 <NA>      (24,39] 2hsgrad  <NA>    
##  5  20.5 5_50kplus (39,59] 4colgrad nhwhite 
##  6  37.6 <NA>      (24,39] 3somecol nhwhite 
##  7  26.8 5_50kplus (39,59] 3somecol nhwhite 
##  8  20.7 1_lt15k   (24,39] 3somecol hispanic
##  9  30.6 5_50kplus (39,59] 3somecol nhwhite 
## 10  25.8 5_50kplus (39,59] 3somecol nhwhite

While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:

head(dat.imp[is.na(brfss16m$bmi)==T,], n=10)
##      bmi       inc    agec     educ     race_eth
## 11 33.84 5_50kplus (39,59]  2hsgrad      nhwhite
## 19 32.12 5_50kplus (59,79] 4colgrad      nhwhite
## 23 36.80 5_50kplus (39,59]    0Prim     hispanic
## 52 45.17 5_50kplus (24,39] 3somecol     nh black
## 55 22.50  3_25-35k (79,99]    0Prim      nhwhite
## 62 22.14  2_15-25k (39,59] 4colgrad      nhwhite
## 63 22.66  3_25-35k (59,79] 4colgrad      nhwhite
## 69 40.24  3_25-35k (59,79] 3somecol nh multirace
## 85 18.79  3_25-35k (59,79] 4colgrad      nhwhite
## 94 30.41  4_35-50k (59,79] 4colgrad      nhwhite
head(brfss16m[is.na(brfss16m$bmi)==T,c("bmi", "inc", "agec","educ","race_eth")], n=10)
## # A tibble: 10 x 5
##      bmi inc      agec    educ     race_eth    
##    <dbl> <ord>    <fct>   <fct>    <fct>       
##  1    NA <NA>     (39,59] 2hsgrad  nhwhite     
##  2    NA <NA>     (59,79] 4colgrad nhwhite     
##  3    NA <NA>     (39,59] 0Prim    hispanic    
##  4    NA <NA>     (24,39] 3somecol nh black    
##  5    NA <NA>     (79,99] 0Prim    nhwhite     
##  6    NA 2_15-25k (39,59] 4colgrad nhwhite     
##  7    NA 3_25-35k (59,79] 4colgrad nhwhite     
##  8    NA <NA>     (59,79] 3somecol nh multirace
##  9    NA 3_25-35k (59,79] 4colgrad nhwhite     
## 10    NA 4_35-50k (59,79] 4colgrad nhwhite
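
If, instead of a single completed data set, we want all five stacked together (for instance to summarize them side by side), complete() can also return a long format; a quick sketch:

#all 5 imputed data sets stacked, with .imp and .id indicator columns
dat.long<-complete(imp, action = "long")
table(dat.long$.imp)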

Analyzing the imputed data

A key element of using imputed data is that the relationships we care about should be maintained after imputation, and presumably the relationships within each imputed data set will be similar. So if we fit a model to each of the (5, in this case) imputed data sets, we should see similar results across the five models.

Here I look at a linear model for bmi:

#Now, I will look at the variability across the 5 different imputations for each outcome
fit.bmi<-with(data=imp, expr=lm(bmi~inc+agec+educ+race_eth))
fit.bmi
## call :
## with.mids(data = imp, expr = lm(bmi ~ inc + agec + educ + race_eth))
## 
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##     4275     8492        0      203     1001 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    25.40213      0.01538     -0.05631      0.02177     -0.58724  
##       agec2        agec3        agec4        agec5        educ2  
##     2.80088      3.93049      3.36767      1.09953      0.70810  
##       educ3        educ4        educ5    race_eth2    race_eth3  
##    -0.07790     -0.01623     -1.20590      0.26286      1.64974  
##   race_eth4    race_eth5  
##     0.67008     -1.35024  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    25.43681      0.12826     -0.03799      0.04040     -0.50593  
##       agec2        agec3        agec4        agec5        educ2  
##     2.78119      3.93716      3.42638      1.07864      0.46801  
##       educ3        educ4        educ5    race_eth2    race_eth3  
##    -0.17763     -0.18392     -1.33359      0.38598      1.66145  
##   race_eth4    race_eth5  
##     0.64751     -1.35637  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    25.46516      0.04820     -0.09977     -0.06893     -0.66278  
##       agec2        agec3        agec4        agec5        educ2  
##     2.79133      3.93306      3.41565      1.06044      0.55002  
##       educ3        educ4        educ5    race_eth2    race_eth3  
##    -0.11551     -0.07607     -1.20453      0.28702      1.56743  
##   race_eth4    race_eth5  
##     0.53912     -1.42390  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##   25.467411     0.087854    -0.171936     0.003613    -0.617410  
##       agec2        agec3        agec4        agec5        educ2  
##    2.826107     3.830944     3.377238     1.081824     0.734674  
##       educ3        educ4        educ5    race_eth2    race_eth3  
##   -0.182231    -0.097424    -1.207000     0.372184     1.529913  
##   race_eth4    race_eth5  
##    0.716437    -1.403044  
## 
## 
## [[5]]
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
## 
## Coefficients:
## (Intercept)         inc2         inc3         inc4         inc5  
##    25.49823      0.11389     -0.08410      0.02830     -0.57735  
##       agec2        agec3        agec4        agec5        educ2  
##     2.78344      3.82964      3.31524      0.95153      0.70768  
##       educ3        educ4        educ5    race_eth2    race_eth3  
##    -0.12674      0.02034     -1.15914      0.27828      1.55631  
##   race_eth4    race_eth5  
##     0.42663     -1.49853

Variation in bmi

with(data=imp, expr=(sd(bmi)))
## call :
## with.mids(data = imp, expr = (sd(bmi)))
## 
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##     4275     8492        0      203     1001 
## 
## analyses :
## [[1]]
## [1] 6.083388
## 
## [[2]]
## [1] 6.11109
## 
## [[3]]
## [1] 6.102539
## 
## [[4]]
## [1] 6.104557
## 
## [[5]]
## [1] 6.144922

Frequency table for income

with(data=imp, expr=(prop.table(table(inc))))
## call :
## with.mids(data = imp, expr = (prop.table(table(inc))))
## 
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##     4275     8492        0      203     1001 
## 
## analyses :
## [[1]]
## inc
##   1_lt15k  2_15-25k  3_25-35k  4_35-50k 5_50kplus 
##   0.09266   0.16036   0.10098   0.13738   0.50862 
## 
## [[2]]
## inc
##   1_lt15k  2_15-25k  3_25-35k  4_35-50k 5_50kplus 
##   0.09296   0.15998   0.10054   0.13760   0.50892 
## 
## [[3]]
## inc
##   1_lt15k  2_15-25k  3_25-35k  4_35-50k 5_50kplus 
##   0.09166   0.16160   0.10096   0.13620   0.50958 
## 
## [[4]]
## inc
##   1_lt15k  2_15-25k  3_25-35k  4_35-50k 5_50kplus 
##   0.09234   0.16094   0.10158   0.13680   0.50834 
## 
## [[5]]
## inc
##   1_lt15k  2_15-25k  3_25-35k  4_35-50k 5_50kplus 
##   0.09188   0.15988   0.10246   0.13668   0.50910

Frequency table for race/ethnicity

with(data=imp, expr=(prop.table(table(race_eth))))
## call :
## with.mids(data = imp, expr = (prop.table(table(race_eth))))
## 
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##     4275     8492        0      203     1001 
## 
## analyses :
## [[1]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.74608      0.09742      0.10182      0.01468      0.04000 
## 
## [[2]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.74584      0.09764      0.10184      0.01484      0.03984 
## 
## [[3]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.74592      0.09750      0.10194      0.01488      0.03976 
## 
## [[4]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.74632      0.09712      0.10180      0.01478      0.03998 
## 
## [[5]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.74634      0.09768      0.10168      0.01460      0.03970

Frequency table for education

with(data=imp, expr=(prop.table(table(educ))))
## call :
## with.mids(data = imp, expr = (prop.table(table(educ))))
## 
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")], 
##     seed = 22)
## 
## nmis :
##      bmi      inc     agec     educ race_eth 
##     4275     8492        0      203     1001 
## 
## analyses :
## [[1]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.25192  0.02370  0.04506  0.26978  0.40954 
## 
## [[2]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.25160  0.02368  0.04518  0.26994  0.40960 
## 
## [[3]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.25150  0.02366  0.04518  0.26982  0.40984 
## 
## [[4]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.25156  0.02366  0.04510  0.26980  0.40988 
## 
## [[5]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.25144  0.02374  0.04524  0.26974  0.40984

Now we pool the separate models from each imputed data set:

est.p<-pool(fit.bmi)
print(est.p)
## Call: pool(object = fit.bmi)
## 
## Pooled coefficients:
##  (Intercept)         inc2         inc3         inc4         inc5 
## 25.453949770  0.078716819 -0.090021697  0.005028554 -0.590143360 
##        agec2        agec3        agec4        agec5        educ2 
##  2.796590508  3.892258991  3.380435939  1.054391469  0.633695206 
##        educ3        educ4        educ5    race_eth2    race_eth3 
## -0.136002503 -0.070659334 -1.222031478  0.317264370  1.592967650 
##    race_eth4    race_eth5 
##  0.599956060 -1.406414347 
## 
## Fraction of information about the coefficients missing due to nonresponse: 
## (Intercept)        inc2        inc3        inc4        inc5       agec2 
##  0.07008382  0.18853812  0.18711186  0.15095386  0.29474286  0.02402665 
##       agec3       agec4       agec5       educ2       educ3       educ4 
##  0.22668242  0.14826795  0.17301703  0.35342441  0.11439014  0.63007923 
##       educ5   race_eth2   race_eth3   race_eth4   race_eth5 
##  0.54614575  0.31987555  0.37120243  0.27140985  0.19859261
summary(est.p)
##                      est        se            t         df     Pr(>|t|)
## (Intercept) 25.453949770 0.1522548 167.17994201  851.67460 0.000000e+00
## inc2         0.078716819 0.1218953   0.64577383  128.69284 5.195756e-01
## inc3        -0.090021697 0.1354296  -0.66471224  130.56468 5.074069e-01
## inc4         0.005028554 0.1260918   0.03988009  196.48026 9.682292e-01
## inc5        -0.590143360 0.1215960  -4.85331207   54.97302 1.042651e-05
## agec2        2.796590508 0.1295817  21.58168241 6208.78821 0.000000e+00
## agec3        3.892258991 0.1354081  28.74464559   90.65978 0.000000e+00
## agec4        3.380435939 0.1291021  26.18420663  203.31784 0.000000e+00
## agec5        1.054391469 0.1609031   6.55295835  151.53838 8.313514e-10
## educ2        0.633695206 0.2280994   2.77815384   38.79272 8.380520e-03
## educ3       -0.136002503 0.1459919  -0.93157594  333.56218 3.522295e-01
## educ4       -0.070659334 0.1139123  -0.62029600   12.14262 5.465286e-01
## educ5       -1.222031478 0.1021910 -11.95830332   16.38484 1.639863e-09
## race_eth2    0.317264370 0.1162498   2.72915980   46.99980 8.905691e-03
## race_eth3    1.592967650 0.1112532  14.31839487   35.27991 4.440892e-16
## race_eth4    0.599956060 0.2561603   2.34211171   64.34719 2.228109e-02
## race_eth5   -1.406414347 0.1531547  -9.18296448  116.58733 1.776357e-15
##                   lo 95      hi 95 nmis        fmi     lambda
## (Intercept) 25.15511115 25.7527884   NA 0.07008382 0.06790264
## inc2        -0.16246154  0.3198952   NA 0.18853812 0.17602452
## inc3        -0.35794203  0.1778986   NA 0.18711186 0.17475462
## inc4        -0.24363857  0.2536957   NA 0.15095386 0.14235507
## inc5        -0.83382989 -0.3464568   NA 0.29474286 0.26954296
## agec2        2.54256558  3.0506154   NA 0.02402665 0.02371232
## agec3        3.62327376  4.1612442   NA 0.22668242 0.20980876
## agec4        3.12588529  3.6349866   NA 0.14826795 0.13993062
## agec5        0.73648837  1.3722946   NA 0.17301703 0.16217408
## educ2        0.17224180  1.0951486   NA 0.35342441 0.32092723
## educ3       -0.42318329  0.1511783   NA 0.11439014 0.10909600
## educ4       -0.31852994  0.1772113   NA 0.63007923 0.57378594
## educ5       -1.43825401 -1.0058089   NA 0.54614575 0.49393311
## race_eth2    0.08339985  0.5511289   NA 0.31987555 0.29153692
## race_eth3    1.36717563  1.8187597   NA 0.37120243 0.33653876
## race_eth4    0.08827019  1.1116419   NA 0.27140985 0.24911080
## race_eth5   -1.70974046 -1.1030882   NA 0.19859261 0.18496176

We need to pay attention to the fmi (fraction of missing information) and lambda columns. These convey how much the missing data contribute to the uncertainty in each model coefficient; lambda is the proportion of a coefficient's total variance that is attributable to the missing data.

lam<-data.frame(lam=est.p$lambda, param=names(est.p$lambda))

ggplot(data=lam,aes(x=param, y=lam))+geom_col()+theme(axis.text.x = element_text(angle = 45, hjust = 1))

It appears that the education and race/ethnicity coefficients have large values of lambda, meaning that a sizable share of their variance comes from the missing data. This suggests that there may be noticeable variation in these coefficients depending on which imputed data set we use.
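
As a side note (a quick sketch), mice can also pool the R-squared across the five fitted models, which gives us something to compare against the complete-case model fit below:

#pooled R-squared across the 5 imputed-data models
pool.r.squared(fit.bmi)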

We can also compare to the model fit on the original data, with missings eliminated:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
bnm<-brfss16m%>%
  select(bmi, inc, agec, educ, race_eth)%>%
  filter(complete.cases(.))%>%
  as.data.frame()

summary(lm(bmi~inc+agec+educ+race_eth, bnm))
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = bnm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.588  -4.000  -0.888   2.907  53.793 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          25.63163    0.14552 176.140  < 2e-16 ***
## inc.L                -0.43449    0.08628  -5.036 4.79e-07 ***
## inc.Q                -0.31235    0.08527  -3.663 0.000249 ***
## inc.C                -0.17141    0.07934  -2.160 0.030744 *  
## inc^4                -0.19809    0.08859  -2.236 0.025349 *  
## agec(24,39]           2.62671    0.15182  17.301  < 2e-16 ***
## agec(39,59]           3.71578    0.14365  25.866  < 2e-16 ***
## agec(59,79]           3.24749    0.14332  22.659  < 2e-16 ***
## agec(79,99]           0.91184    0.17743   5.139 2.77e-07 ***
## educ0Prim             0.75203    0.23752   3.166 0.001546 ** 
## educ1somehs          -0.25063    0.16296  -1.538 0.124056    
## educ3somecol         -0.08821    0.08518  -1.036 0.300410    
## educ4colgrad         -1.23067    0.08251 -14.916  < 2e-16 ***
## race_ethhispanic      0.30708    0.11214   2.738 0.006176 ** 
## race_ethnh black      1.62964    0.10335  15.769  < 2e-16 ***
## race_ethnh multirace  0.65539    0.25022   2.619 0.008816 ** 
## race_ethnh other     -1.57342    0.16134  -9.752  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.93 on 38635 degrees of freedom
## Multiple R-squared:  0.05155,    Adjusted R-squared:  0.05115 
## F-statistic: 131.2 on 16 and 38635 DF,  p-value: < 2.2e-16

Compare imputed model to original data

Here, I compare the coefficients from the model where we eliminated all missing data to the one that we fit on the imputed data:

fit1<-lm(bmi~inc+agec+educ+race_eth, data=brfss16m)
summary(fit1)
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = brfss16m)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.588  -4.000  -0.888   2.907  53.793 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          25.63163    0.14552 176.140  < 2e-16 ***
## inc.L                -0.43449    0.08628  -5.036 4.79e-07 ***
## inc.Q                -0.31235    0.08527  -3.663 0.000249 ***
## inc.C                -0.17141    0.07934  -2.160 0.030744 *  
## inc^4                -0.19809    0.08859  -2.236 0.025349 *  
## agec(24,39]           2.62671    0.15182  17.301  < 2e-16 ***
## agec(39,59]           3.71578    0.14365  25.866  < 2e-16 ***
## agec(59,79]           3.24749    0.14332  22.659  < 2e-16 ***
## agec(79,99]           0.91184    0.17743   5.139 2.77e-07 ***
## educ0Prim             0.75203    0.23752   3.166 0.001546 ** 
## educ1somehs          -0.25063    0.16296  -1.538 0.124056    
## educ3somecol         -0.08821    0.08518  -1.036 0.300410    
## educ4colgrad         -1.23067    0.08251 -14.916  < 2e-16 ***
## race_ethhispanic      0.30708    0.11214   2.738 0.006176 ** 
## race_ethnh black      1.62964    0.10335  15.769  < 2e-16 ***
## race_ethnh multirace  0.65539    0.25022   2.619 0.008816 ** 
## race_ethnh other     -1.57342    0.16134  -9.752  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.93 on 38635 degrees of freedom
##   (11348 observations deleted due to missingness)
## Multiple R-squared:  0.05155,    Adjusted R-squared:  0.05115 
## F-statistic: 131.2 on 16 and 38635 DF,  p-value: < 2.2e-16
fit.imp<-lm(bmi~inc+agec+educ+race_eth, data=dat.imp)
summary(fit.imp)
## 
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = dat.imp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.356  -4.014  -0.944   2.905  53.799 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 25.49823    0.14813 172.132  < 2e-16 ***
## inc2         0.11389    0.11164   1.020 0.307667    
## inc3        -0.08410    0.12363  -0.680 0.496326    
## inc4         0.02830    0.11775   0.240 0.810087    
## inc5        -0.57735    0.10485  -5.506 3.68e-08 ***
## agec2        2.78344    0.12887  21.598  < 2e-16 ***
## agec3        3.82964    0.12115  31.611  < 2e-16 ***
## agec4        3.31524    0.12050  27.512  < 2e-16 ***
## agec5        0.95153    0.14826   6.418 1.39e-10 ***
## educ2        0.70768    0.18902   3.744 0.000181 ***
## educ3       -0.12674    0.13859  -0.915 0.360457    
## educ4        0.02034    0.07487   0.272 0.785858    
## educ5       -1.15914    0.07318 -15.839  < 2e-16 ***
## race_eth2    0.27828    0.09839   2.829 0.004678 ** 
## race_eth3    1.55631    0.09130  17.046  < 2e-16 ***
## race_eth4    0.42663    0.22458   1.900 0.057479 .  
## race_eth5   -1.49853    0.13942 -10.748  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.986 on 49983 degrees of freedom
## Multiple R-squared:  0.05126,    Adjusted R-squared:  0.05095 
## F-statistic: 168.8 on 16 and 49983 DF,  p-value: < 2.2e-16

So, for instance, the coefficient for college graduates is -1.16 in the model fit to the imputed data (-1.22 in the pooled model), but -1.23 in the model where the data were limited to complete cases only. The notable pattern that emerges in the imputed data is the weakness of the income effects. In the analysis that uses only complete cases, we see a significant income effect on bmi, but once we impute the missing values only the highest income category remains significant. This suggests a selection effect for the income variable.

Flag variables

We can construct a flag variable for missingness. This is a useful exercise to see whether we have data that are missing not at random:

fit1<-lm(bmi~agec+educ+race_eth+is.na(inc), data=brfss16m)
summary(fit1)
## 
## Call:
## lm(formula = bmi ~ agec + educ + race_eth + is.na(inc), data = brfss16m)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.470  -3.978  -0.905   2.871  53.701 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          25.472769   0.127810 199.302  < 2e-16 ***
## agec(24,39]           2.648301   0.134502  19.690  < 2e-16 ***
## agec(39,59]           3.688721   0.125994  29.277  < 2e-16 ***
## agec(59,79]           3.305475   0.125121  26.418  < 2e-16 ***
## agec(79,99]           1.182587   0.152492   7.755 9.01e-15 ***
## educ0Prim             0.795509   0.206753   3.848 0.000119 ***
## educ1somehs          -0.000858   0.144133  -0.006 0.995250    
## educ3somecol         -0.192862   0.077454  -2.490 0.012777 *  
## educ4colgrad         -1.472911   0.071512 -20.597  < 2e-16 ***
## race_ethhispanic      0.475369   0.101801   4.670 3.03e-06 ***
## race_ethnh black      1.743845   0.094601  18.434  < 2e-16 ***
## race_ethnh multirace  0.781121   0.230828   3.384 0.000715 ***
## race_ethnh other     -1.319767   0.146442  -9.012  < 2e-16 ***
## is.na(inc)TRUE       -0.826256   0.081874 -10.092  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.905 on 44864 degrees of freedom
##   (5122 observations deleted due to missingness)
## Multiple R-squared:  0.05345,    Adjusted R-squared:  0.05317 
## F-statistic: 194.9 on 13 and 44864 DF,  p-value: < 2.2e-16

And indeed, we see that those with missing incomes have significantly lower BMIs.
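
We could run the same kind of check in the other direction. As a rough sketch, a simple cross-tabulation shows whether the income distribution differs for people with and without a reported bmi:

#income distribution among those with observed versus missing bmi
prop.table(table(bmi.missing = is.na(brfss16m$bmi), inc = brfss16m$inc), margin = 1)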

Examining the variation in the models for the imputed data

If we wanted to see the ranges of the betas in the five imputed data models, we could do that:

#get the coefficients from each of the 5 imputations of bmi
coefs<-matrix(unlist(lapply(fit.bmi$analyses, coef)), nrow=5, ncol=17, byrow=T)

#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:16, labels=names(fit.bmi$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:(dim(coefs)[1]-1)){  #imputations 2 through 5
  points(coefs[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")
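
To make the plot easier to read, we could add a simple legend mapping each color to its imputation (assuming the default palette, in which the first series plotted is black):

#legend mapping point colors to imputation number
legend("topright", legend=paste("imputation", 1:5), col=1:5, pch=20, cex=.8)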