This example illustrates typical aspects of dealing with missing data. Topics include mean imputation, modal imputation for categorical data, and multiple imputation of complex patterns of missing data.
For this example I am using 2014 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link
library(car)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(mice) #Library for multiple imputation
## Loading required package: Rcpp
## mice 2.25 2015-11-09
library(MASS)
#load brfss
load("brfss_14.Rdata")
nams<-names(brfss14)
newnames<-gsub(pattern = "x_",replacement = "",x = nams)
names(brfss14)<-tolower(newnames)
set.seed(1234)
brfss14$strat<-sprintf("%05d", brfss14$ststr)
brfss14$state<-substr(brfss14$strat, 1,2)
#just for brevity, I keep only the TX respondents
brfss14<-brfss14[brfss14$state=="48",]
#insurance
brfss14$ins<-recode(brfss14$hlthpln1, recodes="1=1;2=0;else=NA")
#bmi
brfss14$bmi<-ifelse(is.na(brfss14$bmi5)==T, NA, brfss14$bmi5/100)
#poor or fair health
brfss14$badhealth<-recode(brfss14$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race
brfss14$black<-recode(brfss14$racegr3, recodes="2=1; 9=NA; else=0")
brfss14$white<-recode(brfss14$racegr3, recodes="1=1; 9=NA; else=0")
brfss14$other<-recode(brfss14$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss14$hispanic<-recode(brfss14$racegr3, recodes="5=1; 9=NA; else=0")
brfss14$race_eth<-recode(brfss14$racegr3, recodes="1='nhwhite'; 2='nh black'; 3='nh other';
4='nh multirace'; 5='hispanic'; else=NA", as.factor.result = T)
brfss14$race_eth<-relevel(brfss14$race_eth, ref = "nhwhite")
#have a personal doctor?
brfss14$doc<-recode(brfss14$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)
#needed care in last year but couldn't get it because of cost
brfss14$medacc<-recode(brfss14$medcost, recodes="1=1;2=0;else=NA")
#education level
brfss14$educ<-recode(brfss14$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
brfss14$educ<-relevel(brfss14$educ, ref='2hsgrad')
#employment
brfss14$emp<-recode(brfss14$employ, recodes="1:2='Employed'; 3:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss14$emp<-relevel(brfss14$emp, ref='Employed')
#marital status
brfss14$marst<-recode(brfss14$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss14$marst<-relevel(brfss14$marst, ref='married')
#income
brfss14$inc<-recode(brfss14$income2, recodes="1='<10k'; 2='10 to 15k'; 3='15 to 20k'; 4='20 to 25k'; 5='25 to 35k';6='35 to 50k'; 7='50 to 75k';8='75k+'; 77:99=NA", as.factor.result = T)
#Age cut into intervals
brfss14$agec<-cut(brfss14$age80, breaks=c(0,24,39,59,79,99), include.lowest = T)
#Health behaviors
#insurance
brfss14$ins<-recode(brfss14$hlthpln1, recodes = "1=1; 2=0; else=NA" )
#smoking currently
brfss14$smoke<-recode(brfss14$smoker3, recodes="1:2='Current'; 3='Former';4='NeverSmoked'; else=NA", as.factor.result=T)
brfss14$smoke<-relevel(brfss14$smoke, ref = "NeverSmoked")
Now, we can get a general idea of the missingness in these variables by running summary() on the recoded columns:
summary(brfss14[, c("ins", "smoke", "bmi", "badhealth", "race_eth", "doc", "educ", "emp", "marst", "inc")])
## ins smoke bmi badhealth
## Min. :0.0000 NeverSmoked:2377 Min. :13.77 Min. :0.0000
## 1st Qu.:1.0000 Current : 682 1st Qu.:23.91 1st Qu.:0.0000
## Median :1.0000 Former :1279 Median :27.32 Median :0.0000
## Mean :0.9024 NA's : 131 Mean :28.18 Mean :0.1984
## 3rd Qu.:1.0000 3rd Qu.:31.09 3rd Qu.:0.0000
## Max. :1.0000 Max. :88.19 Max. :1.0000
## NA's :12 NA's :258 NA's :9
## race_eth doc educ emp
## nhwhite :3286 Min. :0.0000 2hsgrad :1274 Employed:2204
## hispanic : 261 1st Qu.:1.0000 0Prim : 102 nilf : 662
## nh black : 356 Median :1.0000 1somehs : 277 retired :1185
## nh multirace: 223 Mean :0.8294 3somecol:1288 unable : 394
## nh other : 310 3rd Qu.:1.0000 4colgrad:1504 NA's : 24
## NA's : 33 Max. :1.0000 NA's : 24
## NA's :3
## marst inc
## married :2374 75k+ :1089
## cohab : 113 35 to 50k: 588
## divorced : 675 50 to 75k: 581
## nm : 588 25 to 35k: 491
## separated: 94 20 to 25k: 375
## widowed : 599 (Other) : 719
## NA's : 26 NA's : 626
This shows that, among these recoded variables, income (inc) is missing the most, with 626 people in our TX sample missing, or about 14% of the sample.
The fewest missing values are in the personal doctor variable (doc), which has only 3 missing cases, or about 0.07% of the sample.
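We can compute these percentages directly; a small sketch using the same columns as above, since colMeans() of is.na() gives the proportion missing for each variable:
#percent missing for each recoded variable
vars<-c("ins", "smoke", "bmi", "badhealth", "race_eth", "doc", "educ", "emp", "marst", "inc")
round(100*colMeans(is.na(brfss14[,vars])), 2)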
Now, I'm going to illustrate mean imputation of a continuous variable, BMI.
#I'm going to play with 3 outcomes, bmi, having a regular doctor and income category
summary(brfss14$bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.77 23.91 27.32 28.18 31.09 88.19 258
#what happens when we replace the missings with the mean?
brfss14$bmi.imp.mean<-ifelse(is.na(brfss14$bmi)==T, mean(brfss14$bmi, na.rm=T), brfss14$bmi)
mean(brfss14$bmi, na.rm=T)
## [1] 28.18241
mean(brfss14$bmi.imp.mean) #no difference!
## [1] 28.18241
median(brfss14$bmi, na.rm=T)
## [1] 27.32
median(brfss14$bmi.imp.mean) #slight difference
## [1] 27.48
var(brfss14$bmi, na.rm=T)
## [1] 40.16017
var(brfss14$bmi.imp.mean) # more noticeable difference!
## [1] 37.84117
So what we see here is that imputing with the mean does nothing to the central tendency when measured by the mean (though it shifts the median slightly), but it does reduce the variance of the outcome. This is because you're replacing every missing case with the single most likely value (the mean), so you're artificially deflating the variance. That's not good.
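In fact, the imputed-data variance follows directly from the observed one: the imputed cases sit exactly at the mean, so the sum of squared deviations is unchanged while the denominator grows from n_obs - 1 to n - 1. A quick check (a minimal sketch using the data above):
#variance after mean imputation = observed variance * (n_obs - 1)/(n - 1)
n<-length(brfss14$bmi)
n.obs<-sum(!is.na(brfss14$bmi))
var(brfss14$bmi, na.rm=T)*(n.obs-1)/(n-1)
## [1] 37.84117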
We can see this in a histogram:
#plot the histogram
hist(brfss14$bmi.imp.mean)
hist(brfss14$bmi, add=T, col=1) #overlay the original (non-missing) bmi values in black
Here you can see the extra imputed values piled up at the mean: the white area rising above the black bars near the mode.
If we have a categorical variable, an easy way to impute the values is modal imputation: impute missing cases with the mode, the most common value. It doesn't make sense to use the mean, because what would the mean of a categorical variable even be?
table(brfss14$emp)
##
## Employed nilf retired unable
## 2204 662 1185 394
#find the most common value
mcv.emp<-factor(names(which.max(table(brfss14$emp))), levels=levels(brfss14$emp))
mcv.emp
## [1] Employed
## Levels: Employed nilf retired unable
#impute the cases
brfss14$emp.imp<-as.factor(ifelse(is.na(brfss14$emp)==T, mcv.emp, brfss14$emp))
levels(brfss14$emp.imp)<-levels(brfss14$emp)
prop.table(table(brfss14$emp))
##
## Employed nilf retired unable
## 0.49583802 0.14893138 0.26659168 0.08863892
prop.table(table(brfss14$emp.imp))
##
## Employed nilf retired unable
## 0.4985455 0.1481316 0.2651600 0.0881629
barplot(prop.table(table(brfss14$emp)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss14$emp.imp)), main="Imputed Data",ylim=c(0, .6))
Which doesn’t look like much of a difference because only 24 people were missing. Now let’s try modal imputation on income group:
table(brfss14$inc)
##
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k 50 to 75k
## 206 220 293 375 491 588 581
## 75k+
## 1089
#find the most common value
mcv.inc<-factor(names(which.max(table(brfss14$inc))), levels=levels(brfss14$inc))
mcv.inc
## [1] 75k+
## 8 Levels: <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k ... 75k+
#impute the cases
brfss14$inc.imp<-as.factor(ifelse(is.na(brfss14$inc)==T, mcv.inc, brfss14$inc))
levels(brfss14$inc.imp)<-levels(brfss14$inc)
prop.table(table(brfss14$inc))
##
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k
## 0.05360396 0.05724694 0.07624252 0.09758002 0.12776477 0.15300546
## 50 to 75k 75k+
## 0.15118397 0.28337237
prop.table(table(brfss14$inc.imp))
##
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k
## 0.04609532 0.04922802 0.06556277 0.08391139 0.10986798 0.13157306
## 50 to 75k 75k+
## 0.13000671 0.38375475
barplot(prop.table(table(brfss14$inc)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss14$inc.imp)), main="Imputed Data", ylim=c(0, .6))
Which shows how dramatically we alter the distribution of the variable by imputing at the mode.
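In fact, the new 75k+ share is just the observed count plus all 626 imputed cases, divided by the full sample size; a quick check using the counts shown above:
#all 626 missing incomes land on the modal category
(1089+626)/nrow(brfss14) #0.3838, matching the imputed 75k+ share above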
These days, these types of imputation have been far surpassed by more complete methods based on regression models. These methods are generally referred to as multiple imputation, because we create multiple completed data sets, each with the missing values imputed (often for several variables simultaneously), and combine results across them. Instead of reviewing this perspective here, I suggest you have a look at Joe Schafer's site, which gives a nice treatment of the subject. Here, I will use the imputation techniques in the mice library in R, which you can read about here.
I have used these in practice in publications and generally like the framework the library uses. Another popular tool is the Amelia library by Gary King, which I haven't used much. If you are serious about doing multiple imputation, it would be wise to investigate multiple methodologies.
To begin, I explore the various patterns of missingness in the data. The md.pattern() function in mice does this nicely. Each row corresponds to a particular pattern of missingness (1 = observed, 0 = missing), the left-hand count is the number of cases with that pattern, and the right-hand column counts how many variables are missing in it.
#look at the patterns of missingness
md.pattern(brfss14[,c("bmi", "inc", "agec","educ","race_eth")])
## agec educ race_eth bmi inc
## 3688 1 1 1 1 1 0
## 135 1 1 1 0 1 1
## 497 1 1 1 1 0 1
## 4 1 0 1 1 1 1
## 15 1 1 0 1 1 1
## 95 1 1 1 0 0 2
## 1 1 1 0 0 1 2
## 7 1 1 0 1 0 2
## 17 1 0 1 0 0 3
## 7 1 1 0 0 0 3
## 3 1 0 0 0 0 4
## 0 24 33 258 626 941
This shows that 3688 rows of the data are complete (first row).
The second row shows that 135 people are missing only the bmi variable. I say only because 258 people are missing bmi in total; the rest are missing it in combination with other variables.
Sure enough, looking further down, we see that 95 people are missing bmi AND income, 17 are missing bmi AND income AND education, and so on.
The bottom row tells how many total people are missing each variable, in ANY combination with other variables.
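For example, adding up the rows of the pattern table in which bmi is marked 0 recovers its total:
135+95+1+17+7+3
## [1] 258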
If you want to see how pairs of variables are missing together, the md.pairs() function will show this. A pair of variables can have exactly four missingness patterns: both variables are observed (pattern rr), the first variable is observed and the second variable is missing (pattern rm), the first variable is missing and the second variable is observed (pattern mr), and both are missing (pattern mm).
md.pairs(brfss14[,c("bmi", "inc", "agec","educ","race_eth")])
## $rr
## bmi inc agec educ race_eth
## bmi 4211 3707 4211 4207 4189
## inc 3707 3843 3843 3839 3827
## agec 4211 3843 4469 4445 4436
## educ 4207 3839 4445 4445 4415
## race_eth 4189 3827 4436 4415 4436
##
## $rm
## bmi inc agec educ race_eth
## bmi 0 504 0 4 22
## inc 136 0 0 4 16
## agec 258 626 0 24 33
## educ 238 606 0 0 30
## race_eth 247 609 0 21 0
##
## $mr
## bmi inc agec educ race_eth
## bmi 0 136 258 238 247
## inc 504 0 626 606 609
## agec 0 0 0 0 0
## educ 4 4 24 0 21
## race_eth 22 16 33 30 0
##
## $mm
## bmi inc agec educ race_eth
## bmi 258 122 0 20 11
## inc 122 626 0 20 17
## agec 0 0 0 0 0
## educ 20 20 0 24 3
## race_eth 11 17 0 3 33
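As a quick sanity check on these counts, for any pair the four patterns partition the whole sample; for bmi and inc:
#rr + rm + mr + mm for the bmi/inc pair
3707+504+136+122
## [1] 4469
nrow(brfss14)
## [1] 4469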
We can perform a basic multiple imputation by simply doing the following. Note that this may take a very long time with big data sets.
imp<-mice(data = brfss14[,c("bmi", "inc", "agec","educ","race_eth")], seed = 22)
##
## iter imp variable
## 1 1 bmi inc educ race_eth
## 1 2 bmi inc educ race_eth
## 1 3 bmi inc educ race_eth
## 1 4 bmi inc educ race_eth
## 1 5 bmi inc educ race_eth
## 2 1 bmi inc educ race_eth
## 2 2 bmi inc educ race_eth
## 2 3 bmi inc educ race_eth
## 2 4 bmi inc educ race_eth
## 2 5 bmi inc educ race_eth
## 3 1 bmi inc educ race_eth
## 3 2 bmi inc educ race_eth
## 3 3 bmi inc educ race_eth
## 3 4 bmi inc educ race_eth
## 3 5 bmi inc educ race_eth
## 4 1 bmi inc educ race_eth
## 4 2 bmi inc educ race_eth
## 4 3 bmi inc educ race_eth
## 4 4 bmi inc educ race_eth
## 4 5 bmi inc educ race_eth
## 5 1 bmi inc educ race_eth
## 5 2 bmi inc educ race_eth
## 5 3 bmi inc educ race_eth
## 5 4 bmi inc educ race_eth
## 5 5 bmi inc educ race_eth
print(imp)
## Multiply imputed data set
## Call:
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
## Number of multiple imputations: 5
## Missing cells per column:
## bmi inc agec educ race_eth
## 258 626 0 24 33
## Imputation methods:
## bmi inc agec educ race_eth
## "pmm" "polyreg" "" "polyreg" "polyreg"
## VisitSequence:
## bmi inc educ race_eth
## 1 2 4 5
## PredictorMatrix:
## bmi inc agec educ race_eth
## bmi 0 1 1 1 1
## inc 1 0 1 1 1
## agec 0 0 0 0 0
## educ 1 1 1 0 1
## race_eth 1 1 1 1 0
## Random generator seed value: 22
This shows how many imputations were done, the total missingness per variable, and which imputation method was used for each variable (because you wouldn't want to use a normal model for a categorical variable!). It also shows the sequence in which the variables are visited (imputed); the default is left to right.
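If we want something other than the defaults, mice() also lets us choose the number of imputations and the method used for each variable. Here is a minimal sketch (not run above) that asks for 10 imputations and a Bayesian linear model ("norm") for bmi instead of pmm; the method vector follows the column order of the data, and agec is left blank because it has no missing values:
imp10<-mice(data = brfss14[,c("bmi", "inc", "agec","educ","race_eth")], m = 10,
            method = c("norm", "polyreg", "", "polyreg", "polyreg"),
            seed = 22, printFlag = FALSE)
imp10$method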
We may want to make sure imputed values are plausible by having a look:
head(imp$imp$bmi)
## 1 2 3 4 5
## 140290 25.10 36.91 34.33 14.53 40.24
## 140297 21.97 25.39 26.78 26.29 28.50
## 140342 37.23 34.72 25.85 19.84 31.57
## 140352 25.82 25.80 26.61 28.37 28.19
## 140379 19.67 21.46 21.77 21.95 20.31
## 140401 27.89 37.12 47.47 49.38 26.52
head(imp$imp$inc)
## 1 2 3 4 5
## 140283 50 to 75k 50 to 75k 35 to 50k 35 to 50k 25 to 35k
## 140290 10 to 15k 15 to 20k 35 to 50k 75k+ 15 to 20k
## 140292 25 to 35k 15 to 20k 35 to 50k 75k+ <10k
## 140297 75k+ 35 to 50k 35 to 50k 20 to 25k 10 to 15k
## 140300 20 to 25k 35 to 50k 25 to 35k 35 to 50k 50 to 75k
## 140304 35 to 50k 75k+ 20 to 25k 25 to 35k 50 to 75k
Which shows the imputed values for the first 6 cases across the 5 different imputations. We can see that there is variation across the imputations, because the imputed values are not the same.
We can also do some plotting. For instance if we want to see how the observed and imputed values of bmi look with respect to race, we can do:
library(lattice)
stripplot(imp,bmi~race_eth|.imp, pch=20)
Here we see the distribution of the original data (blue dots) and the imputed data (red dots) across the levels of race, for each of the five imputation runs (the number at the top shows which run, and the first panel is the original data).
This plot shows that the imputed bmi values correspond well with the observed data, so they are probably plausible values.
If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.
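For example, a quick check (a small sketch) that a completed data set really has no missing bmi values:
dat.imp3<-complete(imp, action = 3)
sum(is.na(dat.imp3$bmi))
## [1] 0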
dat.imp<-complete(imp)
head(dat.imp, n=10)
## bmi inc agec educ race_eth
## 140282 24.33 25 to 35k [0,24] 2hsgrad hispanic
## 140283 37.25 50 to 75k (39,59] 2hsgrad nhwhite
## 140284 22.24 75k+ [0,24] 3somecol nhwhite
## 140285 25.62 75k+ (39,59] 3somecol nh other
## 140286 21.63 75k+ (24,39] 4colgrad nhwhite
## 140287 25.84 75k+ (59,79] 4colgrad nhwhite
## 140288 24.41 25 to 35k [0,24] 3somecol nhwhite
## 140289 19.22 50 to 75k (24,39] 3somecol nhwhite
## 140290 25.10 10 to 15k (24,39] 3somecol nh black
## 140291 28.65 50 to 75k (24,39] 3somecol hispanic
#Compare to the original data
head(brfss14[,c("bmi", "inc", "agec","educ","black","white","hispanic")], n=10)
## bmi inc agec educ black white hispanic
## 140282 24.33 25 to 35k [0,24] 2hsgrad 0 0 1
## 140283 37.25 <NA> (39,59] 2hsgrad 0 1 0
## 140284 22.24 75k+ [0,24] 3somecol 0 1 0
## 140285 25.62 75k+ (39,59] 3somecol 0 0 0
## 140286 21.63 75k+ (24,39] 4colgrad 0 1 0
## 140287 25.84 75k+ (59,79] 4colgrad 0 1 0
## 140288 24.41 25 to 35k [0,24] 3somecol 0 1 0
## 140289 19.22 50 to 75k (24,39] 3somecol 0 1 0
## 140290 NA <NA> (24,39] <NA> 1 0 0
## 140291 28.65 50 to 75k (24,39] 3somecol 0 0 1
While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:
head(dat.imp[is.na(brfss14$bmi)==T,], n=10)
## bmi inc agec educ race_eth
## 140290 25.10 10 to 15k (24,39] 3somecol nh black
## 140297 21.97 75k+ [0,24] 4colgrad nhwhite
## 140342 37.23 35 to 50k (59,79] 4colgrad nhwhite
## 140352 25.82 15 to 20k [0,24] 1somehs hispanic
## 140379 19.67 20 to 25k (59,79] 0Prim hispanic
## 140401 27.89 35 to 50k (39,59] 2hsgrad nhwhite
## 140443 31.32 20 to 25k (39,59] 2hsgrad nhwhite
## 140451 34.70 20 to 25k (59,79] 3somecol nh other
## 140470 41.84 10 to 15k (79,99] 2hsgrad nh black
## 140486 22.15 15 to 20k (24,39] 0Prim hispanic
head(brfss14[is.na(brfss14$bmi)==T,c("bmi", "inc", "agec","educ","race_eth")], n=10)
## bmi inc agec educ race_eth
## 140290 NA <NA> (24,39] <NA> nh black
## 140297 NA <NA> [0,24] <NA> nhwhite
## 140342 NA <NA> (59,79] 4colgrad nhwhite
## 140352 NA 15 to 20k [0,24] 1somehs hispanic
## 140379 NA <NA> (59,79] 0Prim hispanic
## 140401 NA <NA> (39,59] 2hsgrad nhwhite
## 140443 NA 20 to 25k (39,59] 2hsgrad nhwhite
## 140451 NA 20 to 25k (59,79] 3somecol nh other
## 140470 NA 10 to 15k (79,99] 2hsgrad nh black
## 140486 NA <NA> (24,39] 0Prim hispanic
A key element of using imputed data is that the relationships we want to study should be preserved after imputation, and presumably the relationships within each imputed data set will be similar. So if we fit the same model to each of the (in this case 5) imputed data sets, we should see similar results across the five models.
Here I look at a linear model for bmi:
#Now, I will look at the variability across the 5 different imputations for the bmi model
fit.bmi<-with(data=imp, expr=lm(bmi~inc+agec+educ+race_eth))
fit.bmi
## call :
## with.mids(data = imp, expr = lm(bmi ~ inc + agec + educ + race_eth))
##
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 258 626 0 24 33
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 26.458327 0.335561 0.520844 -0.374462 -0.850894
## inc6 inc7 inc8 agec2 agec3
## -0.785629 -0.726448 -1.119064 3.023091 3.774867
## agec4 agec5 educ2 educ3 educ4
## 3.004520 0.024796 -1.360424 -1.025615 0.004437
## educ5 race_eth2 race_eth3 race_eth4 race_eth5
## -1.033684 -0.032551 0.425236 -0.169849 -0.347025
##
##
## [[2]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 26.63602 0.16638 -0.50579 -0.84253 -1.02144
## inc6 inc7 inc8 agec2 agec3
## -0.89876 -0.72390 -1.35295 2.73334 3.75046
## agec4 agec5 educ2 educ3 educ4
## 3.04012 0.06632 -1.90401 -0.91305 0.08618
## educ5 race_eth2 race_eth3 race_eth4 race_eth5
## -1.03414 -0.09571 0.43466 -0.09511 -0.34998
##
##
## [[3]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 26.2337 0.6805 0.4253 -0.5538 -0.5335
## inc6 inc7 inc8 agec2 agec3
## -0.5322 -0.1929 -0.8283 2.7203 3.7345
## agec4 agec5 educ2 educ3 educ4
## 2.9431 -0.0658 -0.8253 -0.8740 0.1829
## educ5 race_eth2 race_eth3 race_eth4 race_eth5
## -0.9673 -0.3177 0.2808 -0.2371 -0.4758
##
##
## [[4]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 26.34589 0.26108 0.36190 -0.55454 -0.97619
## inc6 inc7 inc8 agec2 agec3
## -0.74598 -0.48829 -1.18510 3.00405 3.89441
## agec4 agec5 educ2 educ3 educ4
## 3.12241 0.12303 -0.91581 -1.06391 0.03855
## educ5 race_eth2 race_eth3 race_eth4 race_eth5
## -1.05597 0.42373 0.36907 -0.17874 -0.20772
##
##
## [[5]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 26.18129 0.50754 0.56280 -0.33546 -0.55342
## inc6 inc7 inc8 agec2 agec3
## -0.47992 -0.16140 -0.90981 2.90280 3.59461
## agec4 agec5 educ2 educ3 educ4
## 2.96881 0.10866 -1.50723 -0.80422 0.14856
## educ5 race_eth2 race_eth3 race_eth4 race_eth5
## -0.99911 -0.04744 0.46538 -0.18594 -0.39986
with (data=imp, exp=(sd(bmi)))
## call :
## with.mids(data = imp, expr = (sd(bmi)))
##
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 258 626 0 24 33
##
## analyses :
## [[1]]
## [1] 6.424797
##
## [[2]]
## [1] 6.345079
##
## [[3]]
## [1] 6.392502
##
## [[4]]
## [1] 6.405062
##
## [[5]]
## [1] 6.319322
with (data=imp, exp=(prop.table(table(inc))))
## call :
## with.mids(data = imp, expr = (prop.table(table(inc))))
##
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 258 626 0 24 33
##
## analyses :
## [[1]]
## inc
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k
## 0.05840233 0.05862609 0.07854106 0.10002238 0.12530768 0.15730588
## 50 to 75k 75k+
## 0.15014545 0.27164914
##
## [[2]]
## inc
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k
## 0.05370329 0.05616469 0.07831730 0.10337883 0.12844037 0.15506825
## 50 to 75k 75k+
## 0.14902663 0.27590065
##
## [[3]]
## inc
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k
## 0.05728351 0.06019244 0.07585590 0.10114119 0.13582457 0.15417319
## 50 to 75k 75k+
## 0.14701275 0.26851645
##
## [[4]]
## inc
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k
## 0.05594093 0.05862609 0.07809353 0.09756098 0.13000671 0.15596330
## 50 to 75k 75k+
## 0.14835534 0.27545312
##
## [[5]]
## inc
## <10k 10 to 15k 15 to 20k 20 to 25k 25 to 35k 35 to 50k
## 0.05526964 0.05795480 0.07898859 0.09935109 0.13291564 0.15305437
## 50 to 75k 75k+
## 0.14813157 0.27433430
with (data=imp, exp=(prop.table(table(race_eth))))
## call :
## with.mids(data = imp, expr = (prop.table(table(race_eth))))
##
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 258 626 0 24 33
##
## analyses :
## [[1]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74110539 0.05862609 0.08077870 0.05012307 0.06936675
##
## [[2]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.73998657 0.05907362 0.08033117 0.05101812 0.06959051
##
## [[3]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74177668 0.05862609 0.07988364 0.04989931 0.06981428
##
## [[4]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74065787 0.05884985 0.08033117 0.05012307 0.07003804
##
## [[5]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74088163 0.05840233 0.08033117 0.05012307 0.07026180
with (data=imp, exp=(prop.table(table(agec))))
## call :
## with.mids(data = imp, expr = (prop.table(table(agec))))
##
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 258 626 0 24 33
##
## analyses :
## [[1]]
## agec
## [0,24] (24,39] (39,59] (59,79] (79,99]
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826
##
## [[2]]
## agec
## [0,24] (24,39] (39,59] (59,79] (79,99]
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826
##
## [[3]]
## agec
## [0,24] (24,39] (39,59] (59,79] (79,99]
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826
##
## [[4]]
## agec
## [0,24] (24,39] (39,59] (59,79] (79,99]
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826
##
## [[5]]
## agec
## [0,24] (24,39] (39,59] (59,79] (79,99]
## 0.05817856 0.16894160 0.31752070 0.38174088 0.07361826
with (data=imp, exp=(prop.table(table(educ))))
## call :
## with.mids(data = imp, expr = (prop.table(table(educ))))
##
## call1 :
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## seed = 22)
##
## nmis :
## bmi inc agec educ race_eth
## 258 626 0 24 33
##
## analyses :
## [[1]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.28686507 0.02282390 0.06220631 0.28910271 0.33900201
##
## [[2]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.28664131 0.02304766 0.06243007 0.29022153 0.33765943
##
## [[3]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.28597002 0.02282390 0.06220631 0.29022153 0.33877825
##
## [[4]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.28708883 0.02282390 0.06220631 0.29044529 0.33743567
##
## [[5]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.28708883 0.02282390 0.06198255 0.28999776 0.33810696
Which we can compare to the model fit on the original data, with missings eliminated:
summary(lm(bmi~inc+agec+educ+race_eth, data=brfss14))
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = brfss14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.883 -4.183 -0.940 2.957 57.537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.18592 0.66661 39.282 < 2e-16 ***
## inc10 to 15k 0.29581 0.63766 0.464 0.642751
## inc15 to 20k 0.21908 0.59562 0.368 0.713033
## inc20 to 25k -0.33909 0.57228 -0.593 0.553536
## inc25 to 35k -0.77314 0.54841 -1.410 0.158686
## inc35 to 50k -0.58545 0.54149 -1.081 0.279690
## inc50 to 75k -0.35103 0.54558 -0.643 0.519996
## inc75k+ -1.01161 0.52710 -1.919 0.055036 .
## agec(24,39] 3.14463 0.53954 5.828 6.08e-09 ***
## agec(39,59] 3.92339 0.51111 7.676 2.09e-14 ***
## agec(59,79] 3.17112 0.50819 6.240 4.87e-10 ***
## agec(79,99] 0.07633 0.62690 0.122 0.903104
## educ0Prim -1.33281 0.88433 -1.507 0.131863
## educ1somehs -0.80800 0.49356 -1.637 0.101698
## educ3somecol 0.24741 0.27806 0.890 0.373639
## educ4colgrad -1.07534 0.28372 -3.790 0.000153 ***
## race_ethhispanic 0.12929 0.52704 0.245 0.806233
## race_ethnh black 0.41594 0.38947 1.068 0.285616
## race_ethnh multirace -0.35715 0.48301 -0.739 0.459697
## race_ethnh other -0.36974 0.42531 -0.869 0.384718
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.302 on 3668 degrees of freedom
## (781 observations deleted due to missingness)
## Multiple R-squared: 0.04216, Adjusted R-squared: 0.0372
## F-statistic: 8.498 on 19 and 3668 DF, p-value: < 2.2e-16
And we see broadly the same relationships and similar effects.
Now we pool the separate models:
est.p<-pool(fit.bmi)
print(est.p)
## Call: pool(object = fit.bmi)
##
## Pooled coefficients:
## (Intercept) inc2 inc3 inc4 inc5 inc6
## 26.37103775 0.39020258 0.27300328 -0.53216328 -0.78709221 -0.68850194
## inc7 inc8 agec2 agec3 agec4 agec5
## -0.45859443 -1.07905432 2.87672264 3.74977204 3.01579518 0.05140060
## educ2 educ3 educ4 educ5 race_eth2 race_eth3
## -1.30256077 -0.93615786 0.09212783 -1.01803343 -0.01392793 0.39501903
## race_eth4 race_eth5
## -0.17333842 -0.35607302
##
## Fraction of information about the coefficients missing due to nonresponse:
## (Intercept) inc2 inc3 inc4 inc5 inc6
## 0.11600727 0.14738563 0.51177410 0.17230775 0.23299471 0.15089883
## inc7 inc8 agec2 agec3 agec4 agec5
## 0.30646265 0.21406968 0.11341483 0.07213157 0.03238494 0.02483647
## educ2 educ3 educ4 educ5 race_eth2 race_eth3
## 0.36347516 0.07484830 0.10049004 0.02219006 0.33613479 0.04949621
## race_eth4 race_eth5
## 0.01679925 0.07832378
summary(est.p)
## est se t df Pr(>|t|)
## (Intercept) 26.37103775 0.6024667 43.77177624 304.01316 0.000000e+00
## inc2 0.39020258 0.6022361 0.64792297 197.01825 5.177891e-01
## inc3 0.27300328 0.7130986 0.38284088 18.59852 7.061781e-01
## inc4 -0.53216328 0.5471652 -0.97258244 147.85483 3.323493e-01
## inc5 -0.78709221 0.5459563 -1.44167625 84.41333 1.530934e-01
## inc6 -0.68850194 0.5137380 -1.34018107 188.70394 1.817979e-01
## inc7 -0.45859443 0.5692809 -0.80556793 50.40505 4.242761e-01
## inc8 -1.07905432 0.5208763 -2.07161354 98.87026 4.090524e-02
## agec2 2.87672264 0.4815111 5.97436420 316.56209 6.211051e-09
## agec3 3.74977204 0.4450830 8.42488230 690.86706 2.220446e-16
## agec4 3.01579518 0.4320963 6.97945197 2085.22337 3.966605e-12
## agec5 0.05140060 0.5359170 0.09591149 2661.86671 9.235981e-01
## educ2 -1.30256077 0.8451104 -1.54129064 36.40563 1.318944e-01
## educ3 -0.93615786 0.4383870 -2.13546012 649.97019 3.309661e-02
## educ4 0.09212783 0.2624508 0.35102902 392.35361 7.257550e-01
## educ5 -1.01803343 0.2608126 -3.90331377 2894.07073 9.704868e-05
## race_eth2 -0.01392793 0.5354786 -0.02601024 42.28309 9.793715e-01
## race_eth3 0.39501903 0.3638634 1.08562444 1232.84787 2.778574e-01
## race_eth4 -0.17333842 0.4381532 -0.39561143 3397.25550 6.924165e-01
## race_eth5 -0.35607302 0.3910950 -0.91045147 602.57075 3.629485e-01
## lo 95 hi 95 nmis fmi lambda
## (Intercept) 25.1855051 27.55657040 NA 0.11600727 0.11021085
## inc2 -0.7974539 1.57785902 NA 0.14738563 0.13877415
## inc3 -1.2217128 1.76771935 NA 0.51177410 0.46195138
## inc4 -1.6134375 0.54911096 NA 0.17230775 0.16118696
## inc5 -1.8727085 0.29852407 NA 0.23299471 0.21503486
## inc6 -1.7019092 0.32490535 NA 0.15089883 0.14194697
## inc7 -1.6018011 0.68461227 NA 0.30646265 0.27947942
## inc8 -2.1126026 -0.04550603 NA 0.21406968 0.19833065
## agec2 1.9293563 3.82408902 NA 0.11341483 0.10783114
## agec3 2.8758944 4.62364966 NA 0.07213157 0.06944935
## agec4 2.1684102 3.86318017 NA 0.03238494 0.03145732
## agec5 -0.9994552 1.10225643 NA 0.02483647 0.02410406
## educ2 -3.0158613 0.41073974 NA 0.36347516 0.32944153
## educ3 -1.7969835 -0.07533225 NA 0.07484830 0.07200592
## educ4 -0.4238579 0.60811354 NA 0.10049004 0.09591650
## educ5 -1.5294306 -0.50663624 NA 0.02219006 0.02151456
## race_eth2 -1.0943533 1.06649742 NA 0.33613479 0.30545929
## race_eth3 -0.3188410 1.10887908 NA 0.04949621 0.04795550
## race_eth4 -1.0324090 0.68573216 NA 0.01679925 0.01622060
## race_eth5 -1.1241479 0.41200186 NA 0.07832378 0.07526970
We need to pay attention to the fmi and lambda columns. fmi is the fraction of information about each coefficient that is missing due to nonresponse, and lambda is the proportion of each coefficient's total variance that is attributable to the between-imputation (missing-data) variance. Together they tell us how much the missingness affects each model coefficient.
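To see where lambda comes from, here is a minimal sketch of Rubin's rules for a single coefficient (the intercept), using the five analyses stored in fit.bmi; the result should match the intercept's lambda reported above (about 0.110):
#within- and between-imputation variance for the intercept
m<-5
qs<-sapply(fit.bmi$analyses, function(x) coef(x)["(Intercept)"]) #5 point estimates
us<-sapply(fit.bmi$analyses, function(x) vcov(x)["(Intercept)", "(Intercept)"]) #5 sampling variances
ubar<-mean(us) #average within-imputation variance
b<-var(qs) #between-imputation variance
tot<-ubar+(1+1/m)*b #total variance
(1+1/m)*b/tot #lambda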
barplot(est.p$fmi)
barplot(est.p$lambda)
It appears that the inc3 and inc7 income dummies (15 to 20k and 50 to 75k), the educ2 dummy (0Prim, primary schooling) and the race_eth2 dummy (hispanic) have relatively large fractions of missing information. This suggests that there may be noticeable variation in those coefficients depending on which imputed data set we use.
Here, I compare the coefficients from the model where we eliminated all missing data to the one that we fit on the imputed data:
summary(lm(bmi~inc+agec+educ+race_eth, data=brfss14))
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + race_eth, data = brfss14)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.883 -4.183 -0.940 2.957 57.537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.18592 0.66661 39.282 < 2e-16 ***
## inc10 to 15k 0.29581 0.63766 0.464 0.642751
## inc15 to 20k 0.21908 0.59562 0.368 0.713033
## inc20 to 25k -0.33909 0.57228 -0.593 0.553536
## inc25 to 35k -0.77314 0.54841 -1.410 0.158686
## inc35 to 50k -0.58545 0.54149 -1.081 0.279690
## inc50 to 75k -0.35103 0.54558 -0.643 0.519996
## inc75k+ -1.01161 0.52710 -1.919 0.055036 .
## agec(24,39] 3.14463 0.53954 5.828 6.08e-09 ***
## agec(39,59] 3.92339 0.51111 7.676 2.09e-14 ***
## agec(59,79] 3.17112 0.50819 6.240 4.87e-10 ***
## agec(79,99] 0.07633 0.62690 0.122 0.903104
## educ0Prim -1.33281 0.88433 -1.507 0.131863
## educ1somehs -0.80800 0.49356 -1.637 0.101698
## educ3somecol 0.24741 0.27806 0.890 0.373639
## educ4colgrad -1.07534 0.28372 -3.790 0.000153 ***
## race_ethhispanic 0.12929 0.52704 0.245 0.806233
## race_ethnh black 0.41594 0.38947 1.068 0.285616
## race_ethnh multirace -0.35715 0.48301 -0.739 0.459697
## race_ethnh other -0.36974 0.42531 -0.869 0.384718
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.302 on 3668 degrees of freedom
## (781 observations deleted due to missingness)
## Multiple R-squared: 0.04216, Adjusted R-squared: 0.0372
## F-statistic: 8.498 on 19 and 3668 DF, p-value: < 2.2e-16
So, for instance, the pooled coefficient for college graduates is about -1.02, but is about -1.08 in the model where the data were limited to complete cases only.
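One way to line the two sets of estimates up side by side; a small sketch, using est.p$qbar, which holds the pooled point estimates in this version of mice:
#pooled estimates next to the complete-case estimates
cc.fit<-lm(bmi~inc+agec+educ+race_eth, data=brfss14)
round(cbind(pooled=est.p$qbar, complete.case=coef(cc.fit)), 3)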
If we wanted to see the ranges of the betas in the five imputed data models, we could do that:
#get the coefficients from each of the 5 imputations of bmi
coefs<-matrix(unlist(lapply(fit.bmi$analyses, coef)), nrow=5, ncol=20, byrow=T)
#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:19, labels=names(fit.bmi$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:(dim(coefs)[1]-1)){
points(coefs[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")
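The same comparison can be made numerically; a small sketch showing the range of each beta across the five imputed-data fits:
#spread of each coefficient across the 5 imputations
betas<-coefs[,-1]
colnames(betas)<-names(coef(fit.bmi$analyses[[1]]))[-1]
round(apply(betas, 2, function(x) diff(range(x))), 3)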
Now, what if I don't want to use all of the information in the data to impute the values? Let's say I only want to use income and age to predict BMI, and not education or race. I can customize which variables are used in the imputation process by supplying a predictor matrix:
#I have to provide a matrix saying which variables are used to impute which:
#each row is the variable being imputed, and a 1 in a column means that column's
#variable is used as a predictor. Here BMI is imputed using only income and age.
predictorMatrix<-matrix(
c(
0,1,1,0,0,
1,0,1,1,1,
1,1,0,1,1,
0,1,1,0,1,
0,1,1,1,0), nrow=5, ncol=5, byrow=T)
colnames(predictorMatrix)<-rownames(predictorMatrix)<-c("bmi", "inc", "agec","educ","race_eth")
predictorMatrix
## bmi inc agec educ race_eth
## bmi 0 1 1 0 0
## inc 1 0 1 1 1
## agec 1 1 0 1 1
## educ 0 1 1 0 1
## race_eth 0 1 1 1 0
newimp<-mice(data = brfss14[,c("bmi","inc", "agec","educ","race_eth")], seed = 1234, predictorMatrix = predictorMatrix)
##
## iter imp variable
## 1 1 bmi inc educ race_eth
## 1 2 bmi inc educ race_eth
## 1 3 bmi inc educ race_eth
## 1 4 bmi inc educ race_eth
## 1 5 bmi inc educ race_eth
## 2 1 bmi inc educ race_eth
## 2 2 bmi inc educ race_eth
## 2 3 bmi inc educ race_eth
## 2 4 bmi inc educ race_eth
## 2 5 bmi inc educ race_eth
## 3 1 bmi inc educ race_eth
## 3 2 bmi inc educ race_eth
## 3 3 bmi inc educ race_eth
## 3 4 bmi inc educ race_eth
## 3 5 bmi inc educ race_eth
## 4 1 bmi inc educ race_eth
## 4 2 bmi inc educ race_eth
## 4 3 bmi inc educ race_eth
## 4 4 bmi inc educ race_eth
## 4 5 bmi inc educ race_eth
## 5 1 bmi inc educ race_eth
## 5 2 bmi inc educ race_eth
## 5 3 bmi inc educ race_eth
## 5 4 bmi inc educ race_eth
## 5 5 bmi inc educ race_eth
print(newimp)
## Multiply imputed data set
## Call:
## mice(data = brfss14[, c("bmi", "inc", "agec", "educ", "race_eth")],
## predictorMatrix = predictorMatrix, seed = 1234)
## Number of multiple imputations: 5
## Missing cells per column:
## bmi inc agec educ race_eth
## 258 626 0 24 33
## Imputation methods:
## bmi inc agec educ race_eth
## "pmm" "polyreg" "" "polyreg" "polyreg"
## VisitSequence:
## bmi inc educ race_eth
## 1 2 4 5
## PredictorMatrix:
## bmi inc agec educ race_eth
## bmi 0 1 1 0 0
## inc 1 0 1 1 1
## agec 0 0 0 0 0
## educ 0 1 1 0 1
## race_eth 0 1 1 1 0
## Random generator seed value: 1234
fit.bmi2<-with(data=newimp, expr=lm(bmi~inc+agec+educ+race_eth))
est.p2<-pool(fit.bmi2)
summary(est.p2)
## est se t df Pr(>|t|)
## (Intercept) 26.37103775 0.6024667 43.77177624 304.01316 0.000000e+00
## inc2 0.39020258 0.6022361 0.64792297 197.01825 5.177891e-01
## inc3 0.27300328 0.7130986 0.38284088 18.59852 7.061781e-01
## inc4 -0.53216328 0.5471652 -0.97258244 147.85483 3.323493e-01
## inc5 -0.78709221 0.5459563 -1.44167625 84.41333 1.530934e-01
## inc6 -0.68850194 0.5137380 -1.34018107 188.70394 1.817979e-01
## inc7 -0.45859443 0.5692809 -0.80556793 50.40505 4.242761e-01
## inc8 -1.07905432 0.5208763 -2.07161354 98.87026 4.090524e-02
## agec2 2.87672264 0.4815111 5.97436420 316.56209 6.211051e-09
## agec3 3.74977204 0.4450830 8.42488230 690.86706 2.220446e-16
## agec4 3.01579518 0.4320963 6.97945197 2085.22337 3.966605e-12
## agec5 0.05140060 0.5359170 0.09591149 2661.86671 9.235981e-01
## educ2 -1.30256077 0.8451104 -1.54129064 36.40563 1.318944e-01
## educ3 -0.93615786 0.4383870 -2.13546012 649.97019 3.309661e-02
## educ4 0.09212783 0.2624508 0.35102902 392.35361 7.257550e-01
## educ5 -1.01803343 0.2608126 -3.90331377 2894.07073 9.704868e-05
## race_eth2 -0.01392793 0.5354786 -0.02601024 42.28309 9.793715e-01
## race_eth3 0.39501903 0.3638634 1.08562444 1232.84787 2.778574e-01
## race_eth4 -0.17333842 0.4381532 -0.39561143 3397.25550 6.924165e-01
## race_eth5 -0.35607302 0.3910950 -0.91045147 602.57075 3.629485e-01
## lo 95 hi 95 nmis fmi lambda
## (Intercept) 25.1855051 27.55657040 NA 0.11600727 0.11021085
## inc2 -0.7974539 1.57785902 NA 0.14738563 0.13877415
## inc3 -1.2217128 1.76771935 NA 0.51177410 0.46195138
## inc4 -1.6134375 0.54911096 NA 0.17230775 0.16118696
## inc5 -1.8727085 0.29852407 NA 0.23299471 0.21503486
## inc6 -1.7019092 0.32490535 NA 0.15089883 0.14194697
## inc7 -1.6018011 0.68461227 NA 0.30646265 0.27947942
## inc8 -2.1126026 -0.04550603 NA 0.21406968 0.19833065
## agec2 1.9293563 3.82408902 NA 0.11341483 0.10783114
## agec3 2.8758944 4.62364966 NA 0.07213157 0.06944935
## agec4 2.1684102 3.86318017 NA 0.03238494 0.03145732
## agec5 -0.9994552 1.10225643 NA 0.02483647 0.02410406
## educ2 -3.0158613 0.41073974 NA 0.36347516 0.32944153
## educ3 -1.7969835 -0.07533225 NA 0.07484830 0.07200592
## educ4 -0.4238579 0.60811354 NA 0.10049004 0.09591650
## educ5 -1.5294306 -0.50663624 NA 0.02219006 0.02151456
## race_eth2 -1.0943533 1.06649742 NA 0.33613479 0.30545929
## race_eth3 -0.3188410 1.10887908 NA 0.04949621 0.04795550
## race_eth4 -1.0324090 0.68573216 NA 0.01679925 0.01622060
## race_eth5 -1.1241479 0.41200186 NA 0.07832378 0.07526970
#get the coefficients from each of the 5 imputations of bmi
coefs2<-matrix(unlist(lapply(fit.bmi2$analyses, coef)), nrow=5, ncol=20, byrow=T)
#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs2[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:19, labels=names(fit.bmi2$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:(dim(coefs2)[1]-1)){
points(coefs2[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")