This example will illustrate typical aspects of dealing with missing data. Topics will include: Mean imputation, modal imputation for categorical data, and multiple imputation of complex patterns of missing data.
For this example I am using 2016 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link
library(car)
library(mice)
## Loading required package: lattice
library(ggplot2)
load(file = "~/Google Drive/classes/dem7283/class18/data/brfss16_mmsa.Rdata")
nams<-names(brfss16m)
head(nams, n=10)
## [1] "DISPCODE" "HHADULT" "GENHLTH" "PHYSHLTH" "MENTHLTH" "POORHLTH"
## [7] "HLTHPLN1" "PERSDOC2" "MEDCOST" "CHECKUP1"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
set.seed(1234)
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(brfss16m)<-newnames
samp<-sample(1:dim(brfss16m)[1], size = 50000) #smaller sample for brevity
brfss16m<-brfss16m[samp,]
#Healthy days
brfss16m$healthdays<-recode(brfss16m$physhlth, recodes = "88=0; 77=NA; 99=NA")
#Healthy mental health days
brfss16m$healthmdays<-recode(brfss16m$menthlth, recodes = "88=0; 77=NA; 99=NA")
brfss16m$badhealth<-recode(brfss16m$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss16m$black<-recode(brfss16m$racegr3, recodes="2=1; 9=NA; else=0")
brfss16m$white<-recode(brfss16m$racegr3, recodes="1=1; 9=NA; else=0")
brfss16m$other<-recode(brfss16m$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss16m$hispanic<-recode(brfss16m$racegr3, recodes="5=1; 9=NA; else=0")
brfss16m$race_eth<-recode(brfss16m$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor.result = T)
brfss16m$race_eth<-relevel(brfss16m$race_eth, ref = "nhwhite")
#insurance
brfss16m$ins<-recode(brfss16m$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss16m$inc<-recode(brfss16m$incomg, recodes = "9= NA;1='1_lt15k'; 2='2_15-25k';3='3_25-35k';4='4_35-50k';5='5_50kplus'", as.factor.result = T)
brfss16m$inc<-as.ordered(brfss16m$inc)
#education level
brfss16m$educ<-recode(brfss16m$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor.result=T)
brfss16m$educ<-relevel(brfss16m$educ, ref='2hsgrad')
#employloyment
brfss16m$employ<-recode(brfss16m$employ1,
recodes="1:2='employloyed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor.result=T)
brfss16m$employ<-relevel(brfss16m$employ, ref='employloyed')
#marital status
brfss16m$marst<-recode(brfss16m$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor.result=T)
brfss16m$marst<-relevel(brfss16m$marst, ref='married')
#Age cut into intervals
brfss16m$agec<-cut(brfss16m$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss16ma the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss16m$bmi<-brfss16m$bmi5/100
#smoking currently
brfss16m$smoke<-recode(brfss16m$smoker3,
recodes="1:2=1; 3:4=0; else=NA")
#brfss16m$smoke<-relevel(brfss16m$smoke, ref = "NeverSmoked")
Now, we can get a general idea of the missingness of these variables by just using summary(brfss16m)
summary(brfss16m[, c("ins", "smoke", "bmi", "badhealth", "race_eth", "educ", "employ", "marst", "inc")])
## ins smoke bmi badhealth
## Min. :0.0000 Min. :0.0000 Min. :12.80 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:23.71 1st Qu.:0.0000
## Median :1.0000 Median :0.0000 Median :26.72 Median :0.0000
## Mean :0.9307 Mean :0.1351 Mean :27.84 Mean :0.1732
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:30.85 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :81.53 Max. :1.0000
## NA's :220 NA's :2129 NA's :4275 NA's :127
## race_eth educ employ
## nhwhite :36563 2hsgrad :12526 employloyed:25391
## hispanic : 4774 0Prim : 1176 nilf : 6333
## nh black : 4985 1somehs : 2244 retired :14575
## nh multirace: 723 3somecol:13442 unable : 3263
## nh other : 1954 4colgrad:20409 NA's : 438
## NA's : 1001 NA's : 203
##
## marst inc
## married :25698 1_lt15k : 3685
## cohab : 1678 2_15-25k : 6433
## divorced : 6561 3_25-35k : 4107
## nm : 8790 4_35-50k : 5635
## separated: 995 5_50kplus:21648
## widowed : 5875 NA's : 8492
## NA's : 403
Which shows that, among these recoded variables, inc , the income variable, 8492 people in the BRFSS, or 16.984% of the sample.
The lowest number of missings is in the bad health variable, which only has 0.254% missing.
Now, i’m going to illustrate mean imputation of a continuous variable, BMI.
#I'm going to play with 3 outcomes, bmi, having a regular doctor and income category
summary(brfss16m$bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.80 23.71 26.72 27.84 30.85 81.53 4275
#what happens when we replace the missings with the mean?
brfss16m$bmi.imp.mean<-ifelse(is.na(brfss16m$bmi)==T, mean(brfss16m$bmi, na.rm=T), brfss16m$bmi)
mean(brfss16m$bmi, na.rm=T)
## [1] 27.83772
mean(brfss16m$bmi.imp.mean) #no difference!
## [1] 27.83772
median(brfss16m$bmi, na.rm=T)
## [1] 26.72
median(brfss16m$bmi.imp.mean) #slight difference
## [1] 27.44
var(brfss16m$bmi, na.rm=T)
## [1] 36.8107
var(brfss16m$bmi.imp.mean) # more noticeable difference!
## [1] 33.66332
So what we see here, is that imputing with the mean does nothing to central tendency (when measured using the mean, but does affect the median slightly), but it does reduce the variance in the outcome. This is because you’re replacing all missing cases with the most likely value (the mean), so you’re artificially deflating the variance. That’s not good.
We can see this in a histogram:
#plot the histogram
hist(brfss16m$bmi.imp.mean)
hist(brfss16m$bmi, add=T ,col=1) #
Where you can see the extra values at the mean by the white area over the mode.
If we have a categorical variable, an easy way to impute the values is to use modal imputation, or impute cases with the mode, or most common value. It doesn’t make sense to use the mean, because what would that mean for a categorical variable?
table(brfss16m$employ)
##
## employloyed nilf retired unable
## 25391 6333 14575 3263
#find the most common value
mcv.employ<-factor(names(which.max(table(brfss16m$employ))), levels=levels(brfss16m$employ))
mcv.employ
## [1] employloyed
## Levels: employloyed nilf retired unable
#impute the cases
brfss16m$employ.imp<-as.factor(ifelse(is.na(brfss16m$employ)==T, mcv.employ, brfss16m$employ))
levels(brfss16m$employ.imp)<-levels(brfss16m$employ)
prop.table(table(brfss16m$employ))
##
## employloyed nilf retired unable
## 0.51230782 0.12777935 0.29407611 0.06583673
prop.table(table(brfss16m$employ.imp))
##
## employloyed nilf retired unable
## 0.51658 0.12666 0.29150 0.06526
barplot(prop.table(table(brfss16m$employ)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss16m$employ.imp)), main="Imputed Data",ylim=c(0, .6))
Which doesn’t look like much of a difference because only 438 people were missing. Now let’s try modal imputation on income group:
table(brfss16m$inc)
##
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 3685 6433 4107 5635 21648
#find the most common value
mcv.inc<-factor(names(which.max(table(brfss16m$inc))), levels = levels(brfss16m$inc))
mcv.inc
## [1] 5_50kplus
## Levels: 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
#impute the cases
brfss16m$inc.imp<-as.factor(ifelse(is.na(brfss16m$inc)==T, mcv.inc, brfss16m$inc))
levels(brfss16m$inc.imp)<-levels(as.factor(brfss16m$inc))
prop.table(table(brfss16m$inc))
##
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 0.08877807 0.15498217 0.09894478 0.13575696 0.52153802
prop.table(table(brfss16m$inc.imp))
##
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 0.07370 0.12866 0.08214 0.11270 0.60280
barplot(prop.table(table(brfss16m$inc)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss16m$inc.imp)), main="Imputed Data", ylim=c(0, .6))
Which shows how dramatically we alter the distribution of the variable by imputing at the mode.
These days, these types of imputation have been far surpassed by more complete methods that are based upon regression methods. These methods are generally referred to as multiple imputation, because we are really interested in imputing multiple variables simultaneously. Instead of reviewing this perspective here, I suggest you have a look at Joe Schafer’s site that gives a nice treatment of the subject. Here, I will use the imputation techniques in the mice library in R, which you can read about here.
I have used these in practice in publications and generally like the framework the library uses. Another popular technique is in the Amelia library of Gary King, which I haven’t used much. If you are serious about doing multiple imputation it would be advised to investigate multiple methodologies.
To begin, I explore the various patterns of missingness in the data. The md.pattern function in mice does this nicely. Here, each row corresponds to a particular pattern of missingness (1 = observed, 0=missing)
#look at the patterns of missingness
md.pattern(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")])
## agec educ race_eth bmi inc
## 38652 1 1 1 1 1 0
## 2202 1 1 1 0 1 1
## 6226 1 1 1 1 0 1
## 45 1 0 1 1 1 1
## 533 1 1 0 1 1 1
## 1773 1 1 1 0 0 2
## 10 1 0 1 0 1 2
## 44 1 0 1 1 0 2
## 62 1 1 0 0 1 2
## 209 1 1 0 1 0 2
## 47 1 0 1 0 0 3
## 140 1 1 0 0 0 3
## 4 1 0 0 0 1 3
## 16 1 0 0 1 0 3
## 37 1 0 0 0 0 4
## 0 203 1001 4275 8492 13971
Shows that 3688 rows of the data are complete (first row).
The second row shows that 135 people are missing only the bmi variable. I say only because 4275 people are missing the bmi variable in total. Apparently some folks are missing in combination with other variables.
Sure enough, if we look down, we see that 95 people are missing bmi AND income, 17 are missing bmi AND income and education, and so on.
The bottom row tells how many total people are missing each variable, in ANY combination with other variables.
If you want to see how pairs of variables are missing together, the md.pairs() function will show this.
A pair of variables can have exactly four missingness patterns: Both variables are observed (pattern rr), the first variable is observed and the second variable is missing (pattern rm), the first variable is missing and the second variable is observed (pattern mr), and both are missing (pattern mm).
md.pairs(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")])
## $rr
## bmi inc agec educ race_eth
## bmi 45725 39230 45725 45620 44967
## inc 39230 41508 41508 41449 40909
## agec 45725 41508 50000 49797 48999
## educ 45620 41449 49797 49797 48853
## race_eth 44967 40909 48999 48853 48999
##
## $rm
## bmi inc agec educ race_eth
## bmi 0 6495 0 105 758
## inc 2278 0 0 59 599
## agec 4275 8492 0 203 1001
## educ 4177 8348 0 0 944
## race_eth 4032 8090 0 146 0
##
## $mr
## bmi inc agec educ race_eth
## bmi 0 2278 4275 4177 4032
## inc 6495 0 8492 8348 8090
## agec 0 0 0 0 0
## educ 105 59 203 0 146
## race_eth 758 599 1001 944 0
##
## $mm
## bmi inc agec educ race_eth
## bmi 4275 1997 0 98 243
## inc 1997 8492 0 144 402
## agec 0 0 0 0 0
## educ 98 144 0 203 57
## race_eth 243 402 0 57 1001
We can perform a basic multiple imputation by simply doing: Note this may take a very long time with big data sets
imp<-mice(data = brfss16m[,c("bmi", "inc", "agec","educ","race_eth")], seed = 22)
##
## iter imp variable
## 1 1 bmi inc educ race_eth
## 1 2 bmi inc educ race_eth
## 1 3 bmi inc educ race_eth
## 1 4 bmi inc educ race_eth
## 1 5 bmi inc educ race_eth
## 2 1 bmi inc educ race_eth
## 2 2 bmi inc educ race_eth
## 2 3 bmi inc educ race_eth
## 2 4 bmi inc educ race_eth
## 2 5 bmi inc educ race_eth
## 3 1 bmi inc educ race_eth
## 3 2 bmi inc educ race_eth
## 3 3 bmi inc educ race_eth
## 3 4 bmi inc educ race_eth
## 3 5 bmi inc educ race_eth
## 4 1 bmi inc educ race_eth
## 4 2 bmi inc educ race_eth
## 4 3 bmi inc educ race_eth
## 4 4 bmi inc educ race_eth
## 4 5 bmi inc educ race_eth
## 5 1 bmi inc educ race_eth
## 5 2 bmi inc educ race_eth
## 5 3 bmi inc educ race_eth
## 5 4 bmi inc educ race_eth
## 5 5 bmi inc educ race_eth
print(imp)
## Multiply imputed data set
## Call:
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
## Number of multiple imputations: 5
## Missing cells per column:
## bmi inc agec educ race_eth
## 4275 8492 0 203 1001
## Imputation methods:
## bmi inc agec educ race_eth
## "pmm" "polr" "" "polyreg" "polyreg"
## VisitSequence:
## bmi inc educ race_eth
## 1 2 4 5
## PredictorMatrix:
## bmi inc agec educ race_eth
## bmi 0 1 1 1 1
## inc 1 0 1 1 1
## agec 0 0 0 0 0
## educ 1 1 1 0 1
## race_eth 1 1 1 1 0
## Random generator seed value: 22
Shows how many imputations were done. It also shows total missingness, which imputation method was used for each variable (because you wouldn’t want to use a normal distribution for a categorical variable!!).
It also shows the sequence of how each variable is visited (or imputed, the default is left to right).
We may want to make sure imputed values are plausible by having a look. For instance, are the BMI values outside of the range of the data.
head(imp$imp$bmi)
## 1 2 3 4 5
## 11 23.33 35.62 45.35 27.44 33.84
## 19 19.37 23.26 24.21 17.33 32.12
## 23 45.18 32.22 29.09 19.13 36.80
## 52 31.87 20.10 51.37 36.81 45.17
## 55 25.77 24.96 20.09 28.99 22.50
## 62 24.39 29.05 26.07 22.31 22.14
summary(imp$imp$bmi)
## 1 2 3 4
## Min. :13.19 Min. :13.66 Min. :13.73 Min. :13.19
## 1st Qu.:23.80 1st Qu.:23.63 1st Qu.:23.17 1st Qu.:24.21
## Median :27.12 Median :27.12 Median :27.07 Median :26.92
## Mean :27.95 Mean :28.21 Mean :27.89 Mean :27.85
## 3rd Qu.:30.99 3rd Qu.:31.62 3rd Qu.:31.18 3rd Qu.:30.54
## Max. :65.47 Max. :60.55 Max. :71.06 Max. :81.53
## 5
## Min. :13.19
## 1st Qu.:24.03
## Median :27.41
## Mean :28.62
## 3rd Qu.:31.74
## Max. :81.53
summary(brfss16m$bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.80 23.71 26.72 27.84 30.85 81.53 4275
head(imp$imp$inc)
## 1 2 3 4 5
## 2 5_50kplus 5_50kplus 4_35-50k 4_35-50k 1_lt15k
## 4 4_35-50k 2_15-25k 5_50kplus 4_35-50k 2_15-25k
## 6 5_50kplus 3_25-35k 5_50kplus 3_25-35k 5_50kplus
## 11 5_50kplus 5_50kplus 5_50kplus 5_50kplus 5_50kplus
## 16 2_15-25k 5_50kplus 5_50kplus 5_50kplus 4_35-50k
## 19 5_50kplus 5_50kplus 5_50kplus 5_50kplus 5_50kplus
summary(imp$imp$inc)
## 1 2 3 4
## 1_lt15k : 948 1_lt15k : 963 1_lt15k : 898 1_lt15k : 932
## 2_15-25k :1585 2_15-25k :1566 2_15-25k :1647 2_15-25k :1614
## 3_25-35k : 942 3_25-35k : 920 3_25-35k : 941 3_25-35k : 972
## 4_35-50k :1234 4_35-50k :1245 4_35-50k :1175 4_35-50k :1205
## 5_50kplus:3783 5_50kplus:3798 5_50kplus:3831 5_50kplus:3769
## 5
## 1_lt15k : 909
## 2_15-25k :1561
## 3_25-35k :1016
## 4_35-50k :1199
## 5_50kplus:3807
Which shows the imputed values for the first 6 cases across the 5 different imputations, as well as the numeric summary of the imputed values. We can see that there is variation across the imputations, because the imputed values are not the same.
We can also do some plotting. For instance if we want to see how the observed and imputed values of bmi look with respect to race, we can do:
library(lattice)
stripplot(imp,bmi~race_eth|.imp, pch=20)
and we see the distribution of the original data (blue dots), the imputed data (red dots) across the levels of race, for each of the five different imputation runs(the number at the top shows which run, and the first plot is the original data).
This plot shows that the bmi values correspond well with the observed data, so they are probably plausible valuesU.
If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.
dat.imp<-complete(imp, action = 5)
head(dat.imp, n=10)
## bmi inc agec educ race_eth
## 1 33.45 5_50kplus (39,59] 4colgrad nhwhite
## 2 40.34 1_lt15k (24,39] 2hsgrad nhwhite
## 3 30.41 1_lt15k (39,59] 3somecol hispanic
## 4 22.24 2_15-25k (24,39] 2hsgrad nhwhite
## 5 20.53 5_50kplus (39,59] 4colgrad nhwhite
## 6 37.59 5_50kplus (24,39] 3somecol nhwhite
## 7 26.78 5_50kplus (39,59] 3somecol nhwhite
## 8 20.66 1_lt15k (24,39] 3somecol hispanic
## 9 30.62 5_50kplus (39,59] 3somecol nhwhite
## 10 25.83 5_50kplus (39,59] 3somecol nhwhite
#Compare to the original data
head(brfss16m[,c("bmi", "inc", "agec","educ","race_eth")], n=10)
## # A tibble: 10 x 5
## bmi inc agec educ race_eth
## <dbl> <ord> <fct> <fct> <fct>
## 1 33.4 5_50kplus (39,59] 4colgrad nhwhite
## 2 40.3 <NA> (24,39] 2hsgrad nhwhite
## 3 30.4 1_lt15k (39,59] 3somecol hispanic
## 4 22.2 <NA> (24,39] 2hsgrad <NA>
## 5 20.5 5_50kplus (39,59] 4colgrad nhwhite
## 6 37.6 <NA> (24,39] 3somecol nhwhite
## 7 26.8 5_50kplus (39,59] 3somecol nhwhite
## 8 20.7 1_lt15k (24,39] 3somecol hispanic
## 9 30.6 5_50kplus (39,59] 3somecol nhwhite
## 10 25.8 5_50kplus (39,59] 3somecol nhwhite
While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:
head(dat.imp[is.na(brfss16m$bmi)==T,], n=10)
## bmi inc agec educ race_eth
## 11 33.84 5_50kplus (39,59] 2hsgrad nhwhite
## 19 32.12 5_50kplus (59,79] 4colgrad nhwhite
## 23 36.80 5_50kplus (39,59] 0Prim hispanic
## 52 45.17 5_50kplus (24,39] 3somecol nh black
## 55 22.50 3_25-35k (79,99] 0Prim nhwhite
## 62 22.14 2_15-25k (39,59] 4colgrad nhwhite
## 63 22.66 3_25-35k (59,79] 4colgrad nhwhite
## 69 40.24 3_25-35k (59,79] 3somecol nh multirace
## 85 18.79 3_25-35k (59,79] 4colgrad nhwhite
## 94 30.41 4_35-50k (59,79] 4colgrad nhwhite
head(brfss16m[is.na(brfss16m$bmi)==T,c("bmi", "inc", "agec","educ","race_eth")], n=10)
## # A tibble: 10 x 5
## bmi inc agec educ race_eth
## <dbl> <ord> <fct> <fct> <fct>
## 1 NA <NA> (39,59] 2hsgrad nhwhite
## 2 NA <NA> (59,79] 4colgrad nhwhite
## 3 NA <NA> (39,59] 0Prim hispanic
## 4 NA <NA> (24,39] 3somecol nh black
## 5 NA <NA> (79,99] 0Prim nhwhite
## 6 NA 2_15-25k (39,59] 4colgrad nhwhite
## 7 NA 3_25-35k (59,79] 4colgrad nhwhite
## 8 NA <NA> (59,79] 3somecol nh multirace
## 9 NA 3_25-35k (59,79] 4colgrad nhwhite
## 10 NA 4_35-50k (59,79] 4colgrad nhwhite
A key element of using imputed data, is that the relationships we want to know about should be maintained after imputation, and presumably, the relationships within each imputed data set will be the same. So if we used each of the (5 in this case) imputed data sets in a model, then we should see similar results across the five different models.
Here I look at a linear model for bmi:
#Now, I will see the variability in the 5 different imputations for each outcom
fit.bmi<-with(data=imp ,expr=lm(bmi~inc+agec+educ+race_eth))
fit.bmi
## call :
## with.mids(data = imp, expr = lm(bmi ~ inc + agec + educ + race_eth))
##
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 4275 8492 0 203 1001
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.40213 0.01538 -0.05631 0.02177 -0.58724
## agec2 agec3 agec4 agec5 educ2
## 2.80088 3.93049 3.36767 1.09953 0.70810
## educ3 educ4 educ5 race_eth2 race_eth3
## -0.07790 -0.01623 -1.20590 0.26286 1.64974
## race_eth4 race_eth5
## 0.67008 -1.35024
##
##
## [[2]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.43681 0.12826 -0.03799 0.04040 -0.50593
## agec2 agec3 agec4 agec5 educ2
## 2.78119 3.93716 3.42638 1.07864 0.46801
## educ3 educ4 educ5 race_eth2 race_eth3
## -0.17763 -0.18392 -1.33359 0.38598 1.66145
## race_eth4 race_eth5
## 0.64751 -1.35637
##
##
## [[3]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.46516 0.04820 -0.09977 -0.06893 -0.66278
## agec2 agec3 agec4 agec5 educ2
## 2.79133 3.93306 3.41565 1.06044 0.55002
## educ3 educ4 educ5 race_eth2 race_eth3
## -0.11551 -0.07607 -1.20453 0.28702 1.56743
## race_eth4 race_eth5
## 0.53912 -1.42390
##
##
## [[4]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.467411 0.087854 -0.171936 0.003613 -0.617410
## agec2 agec3 agec4 agec5 educ2
## 2.826107 3.830944 3.377238 1.081824 0.734674
## educ3 educ4 educ5 race_eth2 race_eth3
## -0.182231 -0.097424 -1.207000 0.372184 1.529913
## race_eth4 race_eth5
## 0.716437 -1.403044
##
##
## [[5]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.49823 0.11389 -0.08410 0.02830 -0.57735
## agec2 agec3 agec4 agec5 educ2
## 2.78344 3.82964 3.31524 0.95153 0.70768
## educ3 educ4 educ5 race_eth2 race_eth3
## -0.12674 0.02034 -1.15914 0.27828 1.55631
## race_eth4 race_eth5
## 0.42663 -1.49853
with (data=imp, exp=(sd(bmi)))
## call :
## with.mids(data = imp, expr = (sd(bmi)))
##
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 4275 8492 0 203 1001
##
## analyses :
## [[1]]
## [1] 6.083388
##
## [[2]]
## [1] 6.11109
##
## [[3]]
## [1] 6.102539
##
## [[4]]
## [1] 6.104557
##
## [[5]]
## [1] 6.144922
with (data=imp, exp=(prop.table(table(inc))))
## call :
## with.mids(data = imp, expr = (prop.table(table(inc))))
##
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 4275 8492 0 203 1001
##
## analyses :
## [[1]]
## inc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 0.09266 0.16036 0.10098 0.13738 0.50862
##
## [[2]]
## inc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 0.09296 0.15998 0.10054 0.13760 0.50892
##
## [[3]]
## inc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 0.09166 0.16160 0.10096 0.13620 0.50958
##
## [[4]]
## inc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 0.09234 0.16094 0.10158 0.13680 0.50834
##
## [[5]]
## inc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50kplus
## 0.09188 0.15988 0.10246 0.13668 0.50910
with (data=imp, exp=(prop.table(table(race_eth))))
## call :
## with.mids(data = imp, expr = (prop.table(table(race_eth))))
##
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 4275 8492 0 203 1001
##
## analyses :
## [[1]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74608 0.09742 0.10182 0.01468 0.04000
##
## [[2]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74584 0.09764 0.10184 0.01484 0.03984
##
## [[3]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74592 0.09750 0.10194 0.01488 0.03976
##
## [[4]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74632 0.09712 0.10180 0.01478 0.03998
##
## [[5]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74634 0.09768 0.10168 0.01460 0.03970
with (data=imp, exp=(prop.table(table(educ))))
## call :
## with.mids(data = imp, expr = (prop.table(table(educ))))
##
## call1 :
## mice(data = brfss16m[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 4275 8492 0 203 1001
##
## analyses :
## [[1]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.25192 0.02370 0.04506 0.26978 0.40954
##
## [[2]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.25160 0.02368 0.04518 0.26994 0.40960
##
## [[3]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.25150 0.02366 0.04518 0.26982 0.40984
##
## [[4]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.25156 0.02366 0.04510 0.26980 0.40988
##
## [[5]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.25144 0.02374 0.04524 0.26974 0.40984
Now we pool the separate models from each imputed data set:
est.p<-pool(fit.bmi)
print(est.p)
## Call: pool(object = fit.bmi)
##
## Pooled coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.453949770 0.078716819 -0.090021697 0.005028554 -0.590143360
## agec2 agec3 agec4 agec5 educ2
## 2.796590508 3.892258991 3.380435939 1.054391469 0.633695206
## educ3 educ4 educ5 race_eth2 race_eth3
## -0.136002503 -0.070659334 -1.222031478 0.317264370 1.592967650
## race_eth4 race_eth5
## 0.599956060 -1.406414347
##
## Fraction of information about the coefficients missing due to nonresponse:
## (Intercept) inc2 inc3 inc4 inc5 agec2
## 0.07008382 0.18853812 0.18711186 0.15095386 0.29474286 0.02402665
## agec3 agec4 agec5 educ2 educ3 educ4
## 0.22668242 0.14826795 0.17301703 0.35342441 0.11439014 0.63007923
## educ5 race_eth2 race_eth3 race_eth4 race_eth5
## 0.54614575 0.31987555 0.37120243 0.27140985 0.19859261
summary(est.p)
## est se t df Pr(>|t|)
## (Intercept) 25.453949770 0.1522548 167.17994201 851.67460 0.000000e+00
## inc2 0.078716819 0.1218953 0.64577383 128.69284 5.195756e-01
## inc3 -0.090021697 0.1354296 -0.66471224 130.56468 5.074069e-01
## inc4 0.005028554 0.1260918 0.03988009 196.48026 9.682292e-01
## inc5 -0.590143360 0.1215960 -4.85331207 54.97302 1.042651e-05
## agec2 2.796590508 0.1295817 21.58168241 6208.78821 0.000000e+00
## agec3 3.892258991 0.1354081 28.74464559 90.65978 0.000000e+00
## agec4 3.380435939 0.1291021 26.18420663 203.31784 0.000000e+00
## agec5 1.054391469 0.1609031 6.55295835 151.53838 8.313514e-10
## educ2 0.633695206 0.2280994 2.77815384 38.79272 8.380520e-03
## educ3 -0.136002503 0.1459919 -0.93157594 333.56218 3.522295e-01
## educ4 -0.070659334 0.1139123 -0.62029600 12.14262 5.465286e-01
## educ5 -1.222031478 0.1021910 -11.95830332 16.38484 1.639863e-09
## race_eth2 0.317264370 0.1162498 2.72915980 46.99980 8.905691e-03
## race_eth3 1.592967650 0.1112532 14.31839487 35.27991 4.440892e-16
## race_eth4 0.599956060 0.2561603 2.34211171 64.34719 2.228109e-02
## race_eth5 -1.406414347 0.1531547 -9.18296448 116.58733 1.776357e-15
## lo 95 hi 95 nmis fmi lambda
## (Intercept) 25.15511115 25.7527884 NA 0.07008382 0.06790264
## inc2 -0.16246154 0.3198952 NA 0.18853812 0.17602452
## inc3 -0.35794203 0.1778986 NA 0.18711186 0.17475462
## inc4 -0.24363857 0.2536957 NA 0.15095386 0.14235507
## inc5 -0.83382989 -0.3464568 NA 0.29474286 0.26954296
## agec2 2.54256558 3.0506154 NA 0.02402665 0.02371232
## agec3 3.62327376 4.1612442 NA 0.22668242 0.20980876
## agec4 3.12588529 3.6349866 NA 0.14826795 0.13993062
## agec5 0.73648837 1.3722946 NA 0.17301703 0.16217408
## educ2 0.17224180 1.0951486 NA 0.35342441 0.32092723
## educ3 -0.42318329 0.1511783 NA 0.11439014 0.10909600
## educ4 -0.31852994 0.1772113 NA 0.63007923 0.57378594
## educ5 -1.43825401 -1.0058089 NA 0.54614575 0.49393311
## race_eth2 0.08339985 0.5511289 NA 0.31987555 0.29153692
## race_eth3 1.36717563 1.8187597 NA 0.37120243 0.33653876
## race_eth4 0.08827019 1.1116419 NA 0.27140985 0.24911080
## race_eth5 -1.70974046 -1.1030882 NA 0.19859261 0.18496176
We need to pay attention to the fmi column and the lambda column. These convey information about how much the missingness of each particular variable affects the model coefficients.
lam<-data.frame(lam=est.p$lambda, param=names(est.p$lambda))
ggplot(data=lam,aes(x=param, y=lam))+geom_col()+theme(axis.text.x = element_text(angle = 45, hjust = 1))
It appears that the education variables and race/ethnicity have large variances to them. This suggests that there may be noticeable variation in the resulting coefficient of the model, depending on which imputed data set we use.
We can also compare to the model fit on the original data, with missings eliminated:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
bnm<-brfss16m%>%
select(bmi, inc, agec, educ, race_eth)%>%
filter(complete.cases(.))%>%
as.data.frame()
summary(lm(bmi~inc+agec+educ+race_eth, bnm))
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = bnm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.588 -4.000 -0.888 2.907 53.793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.63163 0.14552 176.140 < 2e-16 ***
## inc.L -0.43449 0.08628 -5.036 4.79e-07 ***
## inc.Q -0.31235 0.08527 -3.663 0.000249 ***
## inc.C -0.17141 0.07934 -2.160 0.030744 *
## inc^4 -0.19809 0.08859 -2.236 0.025349 *
## agec(24,39] 2.62671 0.15182 17.301 < 2e-16 ***
## agec(39,59] 3.71578 0.14365 25.866 < 2e-16 ***
## agec(59,79] 3.24749 0.14332 22.659 < 2e-16 ***
## agec(79,99] 0.91184 0.17743 5.139 2.77e-07 ***
## educ0Prim 0.75203 0.23752 3.166 0.001546 **
## educ1somehs -0.25063 0.16296 -1.538 0.124056
## educ3somecol -0.08821 0.08518 -1.036 0.300410
## educ4colgrad -1.23067 0.08251 -14.916 < 2e-16 ***
## race_ethhispanic 0.30708 0.11214 2.738 0.006176 **
## race_ethnh black 1.62964 0.10335 15.769 < 2e-16 ***
## race_ethnh multirace 0.65539 0.25022 2.619 0.008816 **
## race_ethnh other -1.57342 0.16134 -9.752 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.93 on 38635 degrees of freedom
## Multiple R-squared: 0.05155, Adjusted R-squared: 0.05115
## F-statistic: 131.2 on 16 and 38635 DF, p-value: < 2.2e-16
Here, I compare the coefficients from the model where we eliminated all missing data to the one that we fit on the imputed data:
fit1<-lm(bmi~inc+agec+educ+race_eth, data=brfss16m)
summary(fit1)
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = brfss16m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.588 -4.000 -0.888 2.907 53.793
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.63163 0.14552 176.140 < 2e-16 ***
## inc.L -0.43449 0.08628 -5.036 4.79e-07 ***
## inc.Q -0.31235 0.08527 -3.663 0.000249 ***
## inc.C -0.17141 0.07934 -2.160 0.030744 *
## inc^4 -0.19809 0.08859 -2.236 0.025349 *
## agec(24,39] 2.62671 0.15182 17.301 < 2e-16 ***
## agec(39,59] 3.71578 0.14365 25.866 < 2e-16 ***
## agec(59,79] 3.24749 0.14332 22.659 < 2e-16 ***
## agec(79,99] 0.91184 0.17743 5.139 2.77e-07 ***
## educ0Prim 0.75203 0.23752 3.166 0.001546 **
## educ1somehs -0.25063 0.16296 -1.538 0.124056
## educ3somecol -0.08821 0.08518 -1.036 0.300410
## educ4colgrad -1.23067 0.08251 -14.916 < 2e-16 ***
## race_ethhispanic 0.30708 0.11214 2.738 0.006176 **
## race_ethnh black 1.62964 0.10335 15.769 < 2e-16 ***
## race_ethnh multirace 0.65539 0.25022 2.619 0.008816 **
## race_ethnh other -1.57342 0.16134 -9.752 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.93 on 38635 degrees of freedom
## (11348 observations deleted due to missingness)
## Multiple R-squared: 0.05155, Adjusted R-squared: 0.05115
## F-statistic: 131.2 on 16 and 38635 DF, p-value: < 2.2e-16
fit.imp<-lm(bmi~inc+agec+educ+race_eth, data=dat.imp)
summary(fit.imp)
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = dat.imp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.356 -4.014 -0.944 2.905 53.799
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.49823 0.14813 172.132 < 2e-16 ***
## inc2 0.11389 0.11164 1.020 0.307667
## inc3 -0.08410 0.12363 -0.680 0.496326
## inc4 0.02830 0.11775 0.240 0.810087
## inc5 -0.57735 0.10485 -5.506 3.68e-08 ***
## agec2 2.78344 0.12887 21.598 < 2e-16 ***
## agec3 3.82964 0.12115 31.611 < 2e-16 ***
## agec4 3.31524 0.12050 27.512 < 2e-16 ***
## agec5 0.95153 0.14826 6.418 1.39e-10 ***
## educ2 0.70768 0.18902 3.744 0.000181 ***
## educ3 -0.12674 0.13859 -0.915 0.360457
## educ4 0.02034 0.07487 0.272 0.785858
## educ5 -1.15914 0.07318 -15.839 < 2e-16 ***
## race_eth2 0.27828 0.09839 2.829 0.004678 **
## race_eth3 1.55631 0.09130 17.046 < 2e-16 ***
## race_eth4 0.42663 0.22458 1.900 0.057479 .
## race_eth5 -1.49853 0.13942 -10.748 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.986 on 49983 degrees of freedom
## Multiple R-squared: 0.05126, Adjusted R-squared: 0.05095
## F-statistic: 168.8 on 16 and 49983 DF, p-value: < 2.2e-16
So for instance, the coefficient for college grad in the imputed model is -1.20, but is -1.23 in the model where the data were limited to complete cases only. The notable patter that emerges in the imputed data is the lack of significance for the income variables. In the analysis that only uses complete cases, we see a significant income effect on bmi, but not once we impute the missing values. This suggests a significant selection effect for the income variable.
We can construct a flag variable. This is a useful exercise to see whethere we have missing not at random within the data:
fit1<-lm(bmi~agec+educ+race_eth+is.na(inc), data=brfss16m)
summary(fit1)
##
## Call:
## lm(formula = bmi ~ agec + educ + race_eth + is.na(inc), data = brfss16m)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.470 -3.978 -0.905 2.871 53.701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.472769 0.127810 199.302 < 2e-16 ***
## agec(24,39] 2.648301 0.134502 19.690 < 2e-16 ***
## agec(39,59] 3.688721 0.125994 29.277 < 2e-16 ***
## agec(59,79] 3.305475 0.125121 26.418 < 2e-16 ***
## agec(79,99] 1.182587 0.152492 7.755 9.01e-15 ***
## educ0Prim 0.795509 0.206753 3.848 0.000119 ***
## educ1somehs -0.000858 0.144133 -0.006 0.995250
## educ3somecol -0.192862 0.077454 -2.490 0.012777 *
## educ4colgrad -1.472911 0.071512 -20.597 < 2e-16 ***
## race_ethhispanic 0.475369 0.101801 4.670 3.03e-06 ***
## race_ethnh black 1.743845 0.094601 18.434 < 2e-16 ***
## race_ethnh multirace 0.781121 0.230828 3.384 0.000715 ***
## race_ethnh other -1.319767 0.146442 -9.012 < 2e-16 ***
## is.na(inc)TRUE -0.826256 0.081874 -10.092 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.905 on 44864 degrees of freedom
## (5122 observations deleted due to missingness)
## Multiple R-squared: 0.05345, Adjusted R-squared: 0.05317
## F-statistic: 194.9 on 13 and 44864 DF, p-value: < 2.2e-16
And indeed we see that those with missing incomes have signifcantly lower bmi’s.
If we wanted to see the ranges of the betas in the five imputed data models, we could do that:
#get the coefficients from each of the 5 imputations of bmi
coefs<-matrix(unlist(lapply(fit.bmi$analyses, coef)), nrow=5, ncol=17, byrow=T)
#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:16, labels=names(fit.bmi$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:dim(coefs)[1]-1){
points(coefs[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")