This example will illustrate typical aspects of dealing with missing data. Topics will include: Mean imputation, modal imputation for categorical data, and multiple imputation of complex patterns of missing data. For this example I am using 2011 CDC Behavioral Risk Factor Surveillance System (BRFSS) SMART county data. Link.
library(car)
library(survey)
## Loading required package: grid
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(mice) #Library for multiple imputation
## Loading required package: Rcpp
## Loading required package: lattice
## mice 2.22 2014-06-10
library(MASS)
#load brfss
load("~/Google Drive/dem7283/data/brfss_11.Rdata")
nams<-names(brfss_11)
newnames<-gsub("_", "", nams)
names(brfss_11)<-tolower(newnames)
#just for brevity, I just select TX respondents with non missing weights
brfss_11<-brfss_11[brfss_11$state==48&is.na(brfss_11$cntywt)==F,]
#insurance
brfss_11$ins<-recode(brfss_11$hlthpln1, recodes="1=1;2=0;else=NA")
#smoking currently
brfss_11$smoke<-recode(brfss_11$smoker3, recodes="1:2=1; 3:4=0; else=NA")
#low physical activity
brfss_11$lowact<-recode(brfss_11$pacat, recodes="1:2=0; 3:4=1; else=NA")
#high blood pressure
brfss_11$hbp<-recode(brfss_11$bphigh4, recodes="1=1; 2:4=0; else=NA")
#high cholesterol
brfss_11$hc<-recode(brfss_11$toldhi2, recodes="1=1; 2=0; else=NA")
#bmi
brfss_11$bmi<-ifelse(is.na(brfss_11$bmi5)==T, NA, brfss_11$bmi5/100)
#poor or fair health
brfss_11$badhealth<-recode(brfss_11$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race
brfss_11$black<-recode(brfss_11$racegr2, recodes="2=1; 9=NA; else=0", as.factor.result=T)
brfss_11$white<-recode(brfss_11$racegr2, recodes="1=1; 9=NA; else=0", as.factor.result=T)
brfss_11$other<-recode(brfss_11$racegr2, recodes="3:4=1; 9=NA; else=0", as.factor.result=T)
brfss_11$hispanic<-recode(brfss_11$racegr2, recodes="5=1; 9=NA; else=0", as.factor.result=T)
#have a personal doctor?
brfss_11$doc<-recode(brfss_11$persdoc2, recodes="1:2=1; 3=0; else=NA", as.factor.result=F)
#needed care in last year but couldn't get it because of cost
brfss_11$medacc<-recode(brfss_11$medcost, recodes="1=1;2=0;else=NA")
#education level
brfss_11$educ<-recode(brfss_11$educa, recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA", as.factor.result=T)
brfss_11$educ<-relevel(brfss_11$educ, ref='2hsgrad')
#employment
brfss_11$emp<-recode(brfss_11$employ, recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA", as.factor.result=T)
brfss_11$emp<-relevel(brfss_11$emp, ref='Employed')
#marital status
brfss_11$marst<-recode(brfss_11$marital, recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA", as.factor.result=T)
brfss_11$marst<-relevel(brfss_11$marst, ref='married')
#income
brfss_11$inc<-recode(brfss_11$incomg, recodes="1='<15k'; 2='15 to 25k'; 3='25 to 35k'; 4='35 to 50k'; 5='50k+';9=NA", as.factor.result = T)
#Age cut into intervals
brfss_11$agec<-cut(brfss_11$age, breaks=c(0,24,39,59,79,99))
Now, we can get a general idea of the missingness of these variables by just using summary(brfss_11)
summary(brfss_11[, c("ins", "smoke", "lowact", "hbp", "hc", "bmi", "badhealth", "black", "doc", "educ", "emp", "marst", "inc")])
## ins smoke lowact hbp
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.8591 Mean :0.131 Mean :0.4905 Mean :0.4228
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## NA's :28 NA's :54 NA's :676 NA's :27
## hc bmi badhealth black
## Min. :0.0000 Min. :12.55 Min. :0.0000 0 :6769
## 1st Qu.:0.0000 1st Qu.:23.44 1st Qu.:0.0000 1 : 681
## Median :0.0000 Median :26.57 Median :0.0000 NA's: 144
## Mean :0.4769 Mean :27.55 Mean :0.1908
## 3rd Qu.:1.0000 3rd Qu.:30.50 3rd Qu.:0.0000
## Max. :1.0000 Max. :82.28 Max. :1.0000
## NA's :1117 NA's :467 NA's :35
## doc educ emp marst
## Min. :0.000 2hsgrad :1649 Employed:3580 married :4250
## 1st Qu.:1.000 0Prim : 361 nilf :1369 cohab : 159
## Median :1.000 1somehs : 460 retired :2102 divorced :1041
## Mean :0.834 3somecol:1909 unable : 488 nm : 845
## 3rd Qu.:1.000 4colgrad:3189 NA's : 55 separated: 180
## Max. :1.000 NA's : 26 widowed :1062
## NA's :34 NA's : 57
## inc
## <15k : 753
## 15 to 25k:1092
## 25 to 35k: 650
## 35 to 50k: 857
## 50k+ :3181
## NA's :1061
##
Which shows that, among these recoded variables, hc or the indicator for high cholesterol is missing the most with 1117 people in our TX sample missing, or 14.7089808% of the sample. The lowest number of missings is in the education variable, which only has 0.3423756% missing.
Now, i’m going to illustrate mean imputation of a continuous variable, BMI.
#I'm going to play with 3 outcomes, bmi, having a regular doctor and income category
summary(brfss_11$bmi) #467 missing, 6%
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.55 23.44 26.57 27.55 30.51 82.28 467
#what happens when we replace the missings with the mean?
brfss_11$bmi.imp.mean<-ifelse(is.na(brfss_11$bmi)==T, mean(brfss_11$bmi, na.rm=T), brfss_11$bmi)
mean(brfss_11$bmi, na.rm=T)
## [1] 27.55206
mean(brfss_11$bmi.imp.mean) #no difference!
## [1] 27.55206
median(brfss_11$bmi, na.rm=T)
## [1] 26.57
median(brfss_11$bmi.imp.mean) #slight difference
## [1] 26.94
var(brfss_11$bmi, na.rm=T)
## [1] 36.38479
var(brfss_11$bmi.imp.mean) # more noticeable difference!
## [1] 34.14698
So what we see here, is that imputing with the mean does nothing to central tendency (when measured using the mean, but does affect the median), but it does reduce the variance in the outcome. This is because you’re replacing all missing cases with the most likely value (the mean), so you’re artificially deflating the variance. That’s not good.
We can see this in a histogram:
#plot the histogram
hist(brfss_11$bmi.imp.mean)
hist(brfss_11$bmi, add=T ,col=1) #you can see the extra values at the mean
If we have a categorical variable, an easy way to impute the values is to use modal imputation, or impute cases with the mode, or most common value. It doesn’t make sense to use the mean, because what would that mean for a categorical variable?
table(brfss_11$emp)
##
## Employed nilf retired unable
## 3580 1369 2102 488
#find the most common value
mcv.emp<-factor(names(which.max(table(brfss_11$emp))), levels=levels(brfss_11$emp))
mcv.emp
## [1] Employed
## Levels: Employed nilf retired unable
#impute the cases
brfss_11$emp.imp<-as.factor(ifelse(is.na(brfss_11$emp)==T, mcv.emp, brfss_11$emp))
levels(brfss_11$emp.imp)<-levels(brfss_11$emp)
prop.table(table(brfss_11$emp))
##
## Employed nilf retired unable
## 0.47486404 0.18158907 0.27881682 0.06473007
prop.table(table(brfss_11$emp.imp))
##
## Employed nilf retired unable
## 0.47866737 0.18027390 0.27679747 0.06426126
barplot(prop.table(table(brfss_11$emp)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss_11$emp.imp)), main="Imputed Data",ylim=c(0, .6))
Which doesn’t look like much of a difference because only 55 people were missing. Now let’s try modal imputation on income group:
table(brfss_11$inc)
##
## <15k 15 to 25k 25 to 35k 35 to 50k 50k+
## 753 1092 650 857 3181
#find the most common value
mcv.inc<-factor(names(which.max(table(brfss_11$inc))), levels=levels(brfss_11$inc))
mcv.inc
## [1] 50k+
## Levels: <15k 15 to 25k 25 to 35k 35 to 50k 50k+
#impute the cases
brfss_11$inc.imp<-as.factor(ifelse(is.na(brfss_11$inc)==T, mcv.inc, brfss_11$inc))
levels(brfss_11$inc.imp)<-levels(brfss_11$inc)
prop.table(table(brfss_11$inc))
##
## <15k 15 to 25k 25 to 35k 35 to 50k 50k+
## 0.11526098 0.16715139 0.09949487 0.13118016 0.48691260
prop.table(table(brfss_11$inc.imp))
##
## <15k 15 to 25k 25 to 35k 35 to 50k 50k+
## 0.09915723 0.14379774 0.08559389 0.11285225 0.55859889
barplot(prop.table(table(brfss_11$inc)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss_11$inc.imp)), main="Imputed Data", ylim=c(0, .6))
Which shows how dramatically we alter the distribution of the variable by imputing at the mode.
These days, these types of imputation have been far surpassed by more complete methods that are based upon regression methods. These methods are generally referred to as multiple imputation, because we are really interested in imputing multiple variables simultaneously. Instead of reviewing this perspective here, I suggest you have a look at Joe Schafer’s site that gives a nice treatment of the subject. Here, I will use the imputation techniques in the mice library in R, which you can read about here.
I have used these in practice in publications and generally like the framework the library uses. Another popular technique is in the Amelia library of Gary King, which I haven’t used much. If you are serious about doing multiple imputation it would be advised to investigate multiple methodologies.
To begin, I explore the various patterns of missingness in the data. The md.pattern function in mice does this nicely. Here, each row corresponds to a particular pattern of missingness (1 = observed, 0=missing)
#look at the patterns of missingness
md.pattern(brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")])
## agec educ black white hispanic bmi inc
## 6136 1 1 1 1 1 1 1 0
## 288 1 1 1 1 1 0 1 1
## 855 1 1 1 1 1 1 0 1
## 8 1 0 1 1 1 1 1 1
## 152 1 1 1 1 1 0 0 2
## 3 1 0 1 1 1 0 1 2
## 6 1 0 1 1 1 1 0 2
## 2 1 0 1 1 1 0 0 3
## 85 1 1 0 0 0 1 1 3
## 10 1 1 0 0 0 0 1 4
## 33 1 1 0 0 0 1 0 4
## 2 1 0 0 0 0 1 1 4
## 9 1 1 0 0 0 0 0 5
## 1 1 0 0 0 0 0 1 5
## 2 1 0 0 0 0 1 0 5
## 2 1 0 0 0 0 0 0 6
## 0 26 144 144 144 467 1061 1986
Shows that 6,116 rows of the data are complete (first row). The second row shows that 288 people are missing only the bmi variable. I say only because 467 people are missing the bmi variable in total. Apparently some folks are missing in combination with other variables. Sure enough, if we look down, we see that 3 people are missing bmi AND educ, 152 are missing bmi AND inc, and so on. The bottom row tells how many total people are missing each variable, in ANY combination with other variables.
If you want to see how pairs of variables are missing together, the md.pairs() function will show this. A pair of variables can have exactly four missingness patterns: both variables are observed (pattern rr), the first variable is observed and the second variable is missing (pattern rm), the first variable is missing and the second variable is observed (pattern mr), and both are missing (pattern mm).
md.pairs(brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")])
## $rr
## bmi inc agec educ black white hispanic
## bmi 7127 6231 7127 7109 7005 7005 7005
## inc 6231 6533 6533 6519 6435 6435 6435
## agec 7127 6533 7594 7568 7450 7450 7450
## educ 7109 6519 7568 7568 7431 7431 7431
## black 7005 6435 7450 7431 7450 7450 7450
## white 7005 6435 7450 7431 7450 7450 7450
## hispanic 7005 6435 7450 7431 7450 7450 7450
##
## $rm
## bmi inc agec educ black white hispanic
## bmi 0 896 0 18 122 122 122
## inc 302 0 0 14 98 98 98
## agec 467 1061 0 26 144 144 144
## educ 459 1049 0 0 137 137 137
## black 445 1015 0 19 0 0 0
## white 445 1015 0 19 0 0 0
## hispanic 445 1015 0 19 0 0 0
##
## $mr
## bmi inc agec educ black white hispanic
## bmi 0 302 467 459 445 445 445
## inc 896 0 1061 1049 1015 1015 1015
## agec 0 0 0 0 0 0 0
## educ 18 14 26 0 19 19 19
## black 122 98 144 137 0 0 0
## white 122 98 144 137 0 0 0
## hispanic 122 98 144 137 0 0 0
##
## $mm
## bmi inc agec educ black white hispanic
## bmi 467 165 0 8 22 22 22
## inc 165 1061 0 12 46 46 46
## agec 0 0 0 0 0 0 0
## educ 8 12 0 26 7 7 7
## black 22 46 0 7 144 144 144
## white 22 46 0 7 144 144 144
## hispanic 22 46 0 7 144 144 144
We can perform a basic multiple imputation by simply doing: Note this may take a long time with big data sets
imp<-mice(data = brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")], seed = 22)
##
## iter imp variable
## 1 1 bmi inc educ black white hispanic
## 1 2 bmi inc educ black white hispanic
## 1 3 bmi inc educ black white hispanic
## 1 4 bmi inc educ black white hispanic
## 1 5 bmi inc educ black white hispanic
## 2 1 bmi inc educ black white hispanic
## 2 2 bmi inc educ black white hispanic
## 2 3 bmi inc educ black white hispanic
## 2 4 bmi inc educ black white hispanic
## 2 5 bmi inc educ black white hispanic
## 3 1 bmi inc educ black white hispanic
## 3 2 bmi inc educ black white hispanic
## 3 3 bmi inc educ black white hispanic
## 3 4 bmi inc educ black white hispanic
## 3 5 bmi inc educ black white hispanic
## 4 1 bmi inc educ black white hispanic
## 4 2 bmi inc educ black white hispanic
## 4 3 bmi inc educ black white hispanic
## 4 4 bmi inc educ black white hispanic
## 4 5 bmi inc educ black white hispanic
## 5 1 bmi inc educ black white hispanic
## 5 2 bmi inc educ black white hispanic
## 5 3 bmi inc educ black white hispanic
## 5 4 bmi inc educ black white hispanic
## 5 5 bmi inc educ black white hispanic
print(imp)
## Multiply imputed data set
## Call:
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
## Number of multiple imputations: 5
## Missing cells per column:
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
## Imputation methods:
## bmi inc agec educ black white hispanic
## "pmm" "polyreg" "" "polyreg" "logreg" "logreg" "logreg"
## VisitSequence:
## bmi inc educ black white hispanic
## 1 2 4 5 6 7
## PredictorMatrix:
## bmi inc agec educ black white hispanic
## bmi 0 1 1 1 1 1 1
## inc 1 0 1 1 1 1 1
## agec 0 0 0 0 0 0 0
## educ 1 1 1 0 1 1 1
## black 1 1 1 1 0 1 1
## white 1 1 1 1 1 0 1
## hispanic 1 1 1 1 1 1 0
## Random generator seed value: 22
Shows how many imputations were done. It also shows total missingness, which imputation method was used for each variable (because you wouldn’t want to use a normal distribution for a categorical variable!!). It also shows the sequence of how each variable is visited (or imputed, the default is left to right).
We may want to make sure imputed values are plausible by having a look:
head(imp$imp$bmi)
## 1 2 3 4 5
## 112488 20.05 23.17 24.95 28.25 22.47
## 112505 33.47 41.57 23.87 40.60 29.97
## 112508 41.35 31.25 23.30 36.49 31.89
## 112528 26.57 31.79 24.07 37.12 35.28
## 112537 32.28 32.08 29.76 22.73 23.40
## 112591 34.33 20.54 27.32 29.85 38.32
head(imp$imp$inc)
## 1 2 3 4 5
## 112406 15 to 25k 50k+ 50k+ 15 to 25k 15 to 25k
## 112434 <15k 50k+ <15k 50k+ <15k
## 112436 50k+ 50k+ 50k+ 50k+ 50k+
## 112445 50k+ 50k+ 50k+ 50k+ 50k+
## 112450 15 to 25k 25 to 35k <15k 15 to 25k 15 to 25k
## 112470 50k+ 35 to 50k 50k+ 50k+ 50k+
We can also do some plotting. For instance if we want to see how the observed and imputed values of bmi look with respect to income, we can do:
stripplot(imp,bmi~inc|.imp, pch=20)
and we see the distribution of the original data (blue dots), the imputed data (red dots) across the levels of income, for each of the five different imputation runs(the number at the top shows which run, and the first plot is the original data).
This plot shows that the bmi values correspond well with the observed data, so they are probably plausible values.
If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.
dat.imp<-complete(imp)
head(dat.imp, n=10)
## bmi inc agec educ black white hispanic
## 112405 22.32 50k+ (59,79] 4colgrad 0 1 0
## 112406 22.15 15 to 25k (24,39] 3somecol 0 1 0
## 112407 21.81 50k+ (24,39] 4colgrad 0 1 0
## 112408 30.17 50k+ (24,39] 4colgrad 0 0 0
## 112409 23.03 25 to 35k (0,24] 2hsgrad 0 0 1
## 112410 24.41 50k+ (39,59] 4colgrad 0 0 1
## 112411 25.09 35 to 50k (39,59] 2hsgrad 0 1 0
## 112412 20.18 50k+ (0,24] 3somecol 0 1 0
## 112413 23.75 50k+ (24,39] 4colgrad 0 1 0
## 112414 30.27 50k+ (24,39] 4colgrad 0 0 0
#Compare to the original data
head(brfss_11[,c("bmi", "inc", "agec","educ","black","white","hispanic")], n=10)
## bmi inc agec educ black white hispanic
## 112405 22.32 50k+ (59,79] 4colgrad 0 1 0
## 112406 22.15 <NA> (24,39] 3somecol 0 1 0
## 112407 21.81 50k+ (24,39] 4colgrad 0 1 0
## 112408 30.17 50k+ (24,39] 4colgrad 0 0 0
## 112409 23.03 25 to 35k (0,24] 2hsgrad 0 0 1
## 112410 24.41 50k+ (39,59] 4colgrad 0 0 1
## 112411 25.09 35 to 50k (39,59] 2hsgrad 0 1 0
## 112412 20.18 50k+ (0,24] 3somecol 0 1 0
## 112413 23.75 50k+ (24,39] 4colgrad 0 1 0
## 112414 30.27 50k+ (24,39] 4colgrad 0 0 0
While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:
head(dat.imp[is.na(brfss_11$bmi)==T,], n=10)
## bmi inc agec educ black white hispanic
## 112488 20.05 50k+ (59,79] 4colgrad 0 0 0
## 112505 33.47 35 to 50k (24,39] 2hsgrad 0 1 0
## 112508 41.35 15 to 25k (39,59] 1somehs 0 0 1
## 112528 26.57 15 to 25k (59,79] 3somecol 1 0 0
## 112537 32.28 50k+ (39,59] 4colgrad 0 0 1
## 112591 34.33 15 to 25k (59,79] 2hsgrad 0 0 1
## 112605 24.69 35 to 50k (39,59] 2hsgrad 0 1 0
## 112606 20.05 50k+ (59,79] 4colgrad 0 1 0
## 112691 23.57 35 to 50k (59,79] 3somecol 0 1 0
## 112702 28.35 <15k (59,79] 4colgrad 0 1 0
head(brfss_11[is.na(brfss_11$bmi)==T,c("bmi", "inc", "agec","educ","black","white","hispanic")], n=10)
## bmi inc agec educ black white hispanic
## 112488 NA <NA> (59,79] 4colgrad <NA> <NA> <NA>
## 112505 NA 35 to 50k (24,39] 2hsgrad 0 1 0
## 112508 NA 15 to 25k (39,59] 1somehs 0 0 1
## 112528 NA <NA> (59,79] 3somecol 1 0 0
## 112537 NA 50k+ (39,59] 4colgrad 0 0 1
## 112591 NA <NA> (59,79] 2hsgrad 0 0 1
## 112605 NA 35 to 50k (39,59] 2hsgrad 0 1 0
## 112606 NA 50k+ (59,79] 4colgrad 0 1 0
## 112691 NA <NA> (59,79] 3somecol 0 1 0
## 112702 NA <15k (59,79] 4colgrad <NA> <NA> <NA>
A key element of using imputed data, is that the relationships we want to know about should be maintained after imputation, and presumably, the relationships within each imputed data set will be the same. So if we used each of the (5 in this case) imputed data sets in a model, then we should see similar results across the five different models.
Here I look at a linear model for bmi:
#Now, I will see the variability in the 5 different imputations for each outcom
fit.bmi<-with(data=imp ,exp=lm(bmi~inc+agec+educ+black+hispanic))
fit.bmi
## call :
## with.mids(data = imp, expr = lm(bmi ~ inc + agec + educ + black +
## hispanic))
##
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
##
## nmis :
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.2841 -0.4035 -0.7955 -0.8272 -1.1873
## agec2 agec3 agec4 agec5 educ2
## 2.9697 3.6971 3.2278 0.4934 1.4142
## educ3 educ4 educ5 black2 hispanic2
## 0.3235 0.1260 -0.6790 2.4349 0.9206
##
##
## [[2]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.39351 -0.58945 -0.74322 -0.69994 -1.14988
## agec2 agec3 agec4 agec5 educ2
## 2.96031 3.56140 3.10236 0.46484 0.73748
## educ3 educ4 educ5 black2 hispanic2
## 0.07692 0.11767 -0.83615 2.49769 0.92526
##
##
## [[3]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.4947 -0.5781 -0.8876 -0.7314 -1.2941
## agec2 agec3 agec4 agec5 educ2
## 2.8075 3.6386 3.1418 0.3749 1.1017
## educ3 educ4 educ5 black2 hispanic2
## 0.1276 0.1176 -0.7927 2.3336 0.8731
##
##
## [[4]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.46596 -0.61609 -0.78489 -0.52006 -1.06511
## agec2 agec3 agec4 agec5 educ2
## 2.60768 3.46996 3.03328 0.40306 0.97205
## educ3 educ4 educ5 black2 hispanic2
## 0.17251 0.06283 -0.78019 2.60211 0.82089
##
##
## [[5]]
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic)
##
## Coefficients:
## (Intercept) inc2 inc3 inc4 inc5
## 25.6508 -0.7195 -0.7867 -1.0157 -1.3082
## agec2 agec3 agec4 agec5 educ2
## 2.7382 3.4794 3.0711 0.3786 0.6830
## educ3 educ4 educ5 black2 hispanic2
## 0.1529 0.1309 -0.8705 2.5121 0.7874
with (data=imp, exp=(sd(bmi)))
## call :
## with.mids(data = imp, expr = (sd(bmi)))
##
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
##
## nmis :
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
##
## analyses :
## [[1]]
## [1] 6.106734
##
## [[2]]
## [1] 6.020854
##
## [[3]]
## [1] 6.087242
##
## [[4]]
## [1] 6.036923
##
## [[5]]
## [1] 6.01707
with (data=imp, exp=(sd(inc)))
## call :
## with.mids(data = imp, expr = (sd(inc)))
##
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
##
## nmis :
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
##
## analyses :
## [[1]]
## [1] 1.491672
##
## [[2]]
## [1] 1.499184
##
## [[3]]
## [1] 1.495172
##
## [[4]]
## [1] 1.49721
##
## [[5]]
## [1] 1.492826
with (data=imp, exp=(sd(black)))
## call :
## with.mids(data = imp, expr = (sd(black)))
##
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
##
## nmis :
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
##
## analyses :
## [[1]]
## [1] 0.2891104
##
## [[2]]
## [1] 0.2879921
##
## [[3]]
## [1] 0.2909599
##
## [[4]]
## [1] 0.2885521
##
## [[5]]
## [1] 0.2885521
with (data=imp, exp=(sd(hispanic)))
## call :
## with.mids(data = imp, expr = (sd(hispanic)))
##
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
##
## nmis :
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
##
## analyses :
## [[1]]
## [1] 0.3836722
##
## [[2]]
## [1] 0.3841118
##
## [[3]]
## [1] 0.3828998
##
## [[4]]
## [1] 0.3837822
##
## [[5]]
## [1] 0.3836722
with (data=imp, exp=(sd(agec)))
## call :
## with.mids(data = imp, expr = (sd(agec)))
##
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
##
## nmis :
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
##
## analyses :
## [[1]]
## [1] 0.9868412
##
## [[2]]
## [1] 0.9868412
##
## [[3]]
## [1] 0.9868412
##
## [[4]]
## [1] 0.9868412
##
## [[5]]
## [1] 0.9868412
with (data=imp, exp=(sd(educ)))
## call :
## with.mids(data = imp, expr = (sd(educ)))
##
## call1 :
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], seed = 22)
##
## nmis :
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
##
## analyses :
## [[1]]
## [1] 1.576475
##
## [[2]]
## [1] 1.574776
##
## [[3]]
## [1] 1.576993
##
## [[4]]
## [1] 1.576846
##
## [[5]]
## [1] 1.576354
Which we can compare to the model fit on the original data, with missings eliminated:
summary(lm(bmi~inc+agec+educ+black+hispanic, data=brfss_11))
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic, data = brfss_11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.500 -3.919 -0.919 2.912 51.609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.7836 0.4634 55.642 < 2e-16 ***
## inc15 to 25k -0.5571 0.2920 -1.908 0.056501 .
## inc25 to 35k -0.8082 0.3314 -2.439 0.014759 *
## inc35 to 50k -0.8099 0.3158 -2.564 0.010366 *
## inc50k+ -1.2512 0.2767 -4.523 6.23e-06 ***
## agec(24,39] 2.5238 0.4349 5.803 6.83e-09 ***
## agec(39,59] 3.3917 0.4102 8.268 < 2e-16 ***
## agec(59,79] 2.8778 0.4113 6.996 2.91e-12 ***
## agec(79,99] 0.4257 0.4716 0.903 0.366724
## educ0Prim 0.6516 0.4448 1.465 0.143006
## educ1somehs 0.3543 0.3599 0.984 0.324937
## educ3somecol 0.1935 0.2215 0.873 0.382438
## educ4colgrad -0.8018 0.2160 -3.713 0.000207 ***
## black1 2.6049 0.2656 9.810 < 2e-16 ***
## hispanic1 0.8436 0.2271 3.715 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.859 on 6121 degrees of freedom
## (1458 observations deleted due to missingness)
## Multiple R-squared: 0.06081, Adjusted R-squared: 0.05866
## F-statistic: 28.31 on 14 and 6121 DF, p-value: < 2.2e-16
And we see the same relationships, and the same significant effects.
Now we pool the separate models:
est.p<-pool(fit.bmi)
print(est.p)
## Call: pool(object = fit.bmi)
##
## Pooled coefficients:
## (Intercept) inc2 inc3 inc4 inc5 agec2
## 25.4578144 -0.5813415 -0.7995774 -0.7588615 -1.2008980 2.8166979
## agec3 agec4 agec5 educ2 educ3 educ4
## 3.5693006 3.1152752 0.4229708 0.9816962 0.1706885 0.1109902
## educ5 black2 hispanic2
## -0.7917089 2.4760851 0.8654392
##
## Fraction of information about the coefficients missing due to nonresponse:
## (Intercept) inc2 inc3 inc4 inc5 agec2
## 0.13828603 0.20812702 0.03889433 0.37107244 0.18346234 0.19573670
## agec3 agec4 agec5 educ2 educ3 educ4
## 0.10072339 0.05877721 0.02292614 0.49462411 0.09969444 0.02286819
## educ5 black2 hispanic2
## 0.15049545 0.18135604 0.10155164
summary(est.p)
## est se t df Pr(>|t|)
## (Intercept) 25.4578144 0.4092780 62.2017714 226.12589 0.000000e+00
## inc2 -0.5813415 0.2841284 -2.0460519 105.28848 4.324344e-02
## inc3 -0.7995774 0.2991165 -2.6731309 2011.94039 7.575412e-03
## inc4 -0.7588615 0.3430104 -2.2123571 35.12953 3.353602e-02
## inc5 -1.2008980 0.2693144 -4.4590939 133.40132 1.729905e-05
## agec2 2.8166979 0.3933157 7.1614170 118.14848 7.360157e-11
## agec3 3.5693006 0.3492304 10.2204753 405.75432 0.000000e+00
## agec4 3.1152752 0.3419729 9.1097127 1050.59986 0.000000e+00
## agec5 0.4229708 0.3905056 1.0831363 3836.78796 2.788160e-01
## educ2 0.9816962 0.4852687 2.0229952 19.97432 5.666501e-02
## educ3 0.1706885 0.3286549 0.5193547 413.43180 6.037913e-01
## educ4 0.1109902 0.2015461 0.5506938 3846.27487 5.818756e-01
## educ5 -0.7917089 0.2108900 -3.7541320 193.17256 2.300566e-04
## black2 2.4760851 0.2649433 9.3457152 136.31843 2.220446e-16
## hispanic2 0.8654392 0.2135866 4.0519356 399.72907 6.104608e-05
## lo 95 hi 95 nmis fmi lambda
## (Intercept) 24.65132798 26.26430091 NA 0.13828603 0.13069804
## inc2 -1.14469778 -0.01798532 NA 0.20812702 0.19322657
## inc3 -1.38618779 -0.21296705 NA 0.03889433 0.03793941
## inc4 -1.45511781 -0.06260512 NA 0.37107244 0.33625729
## inc5 -1.73357662 -0.66821934 NA 0.18346234 0.17131160
## agec2 2.03783586 3.59556001 NA 0.19573670 0.18223651
## agec3 2.88277385 4.25582745 NA 0.10072339 0.09630167
## agec4 2.44424751 3.78630290 NA 0.05877721 0.05698713
## agec5 -0.34264767 1.18858924 NA 0.02292614 0.02241695
## educ2 -0.03063993 1.99403236 NA 0.49462411 0.44643414
## educ3 -0.47535456 0.81673149 NA 0.09969444 0.09534967
## educ4 -0.28415727 0.50613763 NA 0.02286819 0.02236023
## educ5 -1.20765151 -0.37576620 NA 0.15049545 0.14174545
## black2 1.95215449 3.00001566 NA 0.18135604 0.16943275
## hispanic2 0.44554577 1.28533260 NA 0.10155164 0.09706757
#compare to original data
summary(lm(bmi~inc+agec+educ+black+hispanic, data=brfss_11))
##
## Call:
## lm(formula = bmi ~ inc + agec + educ + black + hispanic, data = brfss_11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.500 -3.919 -0.919 2.912 51.609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.7836 0.4634 55.642 < 2e-16 ***
## inc15 to 25k -0.5571 0.2920 -1.908 0.056501 .
## inc25 to 35k -0.8082 0.3314 -2.439 0.014759 *
## inc35 to 50k -0.8099 0.3158 -2.564 0.010366 *
## inc50k+ -1.2512 0.2767 -4.523 6.23e-06 ***
## agec(24,39] 2.5238 0.4349 5.803 6.83e-09 ***
## agec(39,59] 3.3917 0.4102 8.268 < 2e-16 ***
## agec(59,79] 2.8778 0.4113 6.996 2.91e-12 ***
## agec(79,99] 0.4257 0.4716 0.903 0.366724
## educ0Prim 0.6516 0.4448 1.465 0.143006
## educ1somehs 0.3543 0.3599 0.984 0.324937
## educ3somecol 0.1935 0.2215 0.873 0.382438
## educ4colgrad -0.8018 0.2160 -3.713 0.000207 ***
## black1 2.6049 0.2656 9.810 < 2e-16 ***
## hispanic1 0.8436 0.2271 3.715 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.859 on 6121 degrees of freedom
## (1458 observations deleted due to missingness)
## Multiple R-squared: 0.06081, Adjusted R-squared: 0.05866
## F-statistic: 28.31 on 14 and 6121 DF, p-value: < 2.2e-16
If we wanted to see the ranges of the betas in the five imputed data models, we could do that:
#get the coefficients from each of the 5 imputations of bmi
coefs<-matrix(unlist(lapply(fit.bmi$analyses, coef)), nrow=5, ncol=15, byrow=T)
#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:14, labels=names(fit.bmi$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:dim(coefs)[1]-1){
points(coefs[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")
Now, what if I don’t want to use all information within the data to impute the values, let’s say I only want to use age to predict BMI, and not race. I can customize which variables are used in the imputation process by generating a predictor matrix:
#I have to provide a matrix of which variables will be used for imputing the other variables,
#I basically don't want the bmi, doc or inc variables to be influenced by the key predictors I want to use
#to analyze them later. So I only use income and age to predict BMI
predictorMatrix<-matrix(
c(
0,1,1,0,0,0,0,
0,0,1,1,0,0,0,
1,1,0,0,0,0,0,
0,1,1,0,1,1,1,
0,1,1,1,0,1,1,
0,1,1,1,1,0,1,
0,1,1,1,1,1,0), nrow=7, ncol=7, byrow=T)
colnames(predictorMatrix)<-rownames(predictorMatrix)<-c("bmi", "inc", "agec","educ","black","white","hispanic")
predictorMatrix
## bmi inc agec educ black white hispanic
## bmi 0 1 1 0 0 0 0
## inc 0 0 1 1 0 0 0
## agec 1 1 0 0 0 0 0
## educ 0 1 1 0 1 1 1
## black 0 1 1 1 0 1 1
## white 0 1 1 1 1 0 1
## hispanic 0 1 1 1 1 1 0
newimp<-mice(data = brfss_11[,c("bmi","inc", "agec","educ","black","white","hispanic")], seed = 22, predictorMatrix = predictorMatrix)
##
## iter imp variable
## 1 1 bmi inc educ black white hispanic
## 1 2 bmi inc educ black white hispanic
## 1 3 bmi inc educ black white hispanic
## 1 4 bmi inc educ black white hispanic
## 1 5 bmi inc educ black white hispanic
## 2 1 bmi inc educ black white hispanic
## 2 2 bmi inc educ black white hispanic
## 2 3 bmi inc educ black white hispanic
## 2 4 bmi inc educ black white hispanic
## 2 5 bmi inc educ black white hispanic
## 3 1 bmi inc educ black white hispanic
## 3 2 bmi inc educ black white hispanic
## 3 3 bmi inc educ black white hispanic
## 3 4 bmi inc educ black white hispanic
## 3 5 bmi inc educ black white hispanic
## 4 1 bmi inc educ black white hispanic
## 4 2 bmi inc educ black white hispanic
## 4 3 bmi inc educ black white hispanic
## 4 4 bmi inc educ black white hispanic
## 4 5 bmi inc educ black white hispanic
## 5 1 bmi inc educ black white hispanic
## 5 2 bmi inc educ black white hispanic
## 5 3 bmi inc educ black white hispanic
## 5 4 bmi inc educ black white hispanic
## 5 5 bmi inc educ black white hispanic
print(newimp)
## Multiply imputed data set
## Call:
## mice(data = brfss_11[, c("bmi", "inc", "agec", "educ", "black",
## "white", "hispanic")], predictorMatrix = predictorMatrix,
## seed = 22)
## Number of multiple imputations: 5
## Missing cells per column:
## bmi inc agec educ black white hispanic
## 467 1061 0 26 144 144 144
## Imputation methods:
## bmi inc agec educ black white hispanic
## "pmm" "polyreg" "" "polyreg" "logreg" "logreg" "logreg"
## VisitSequence:
## bmi inc educ black white hispanic
## 1 2 4 5 6 7
## PredictorMatrix:
## bmi inc agec educ black white hispanic
## bmi 0 1 1 0 0 0 0
## inc 0 0 1 1 0 0 0
## agec 0 0 0 0 0 0 0
## educ 0 1 1 0 1 1 1
## black 0 1 1 1 0 1 1
## white 0 1 1 1 1 0 1
## hispanic 0 1 1 1 1 1 0
## Random generator seed value: 22
fit.bmi2<-with(data=newimp ,exp=lm(bmi~inc+agec+educ+black+hispanic))
est.p2<-pool(fit.bmi)
summary(est.p2)
## est se t df Pr(>|t|)
## (Intercept) 25.4578144 0.4092780 62.2017714 226.12589 0.000000e+00
## inc2 -0.5813415 0.2841284 -2.0460519 105.28848 4.324344e-02
## inc3 -0.7995774 0.2991165 -2.6731309 2011.94039 7.575412e-03
## inc4 -0.7588615 0.3430104 -2.2123571 35.12953 3.353602e-02
## inc5 -1.2008980 0.2693144 -4.4590939 133.40132 1.729905e-05
## agec2 2.8166979 0.3933157 7.1614170 118.14848 7.360157e-11
## agec3 3.5693006 0.3492304 10.2204753 405.75432 0.000000e+00
## agec4 3.1152752 0.3419729 9.1097127 1050.59986 0.000000e+00
## agec5 0.4229708 0.3905056 1.0831363 3836.78796 2.788160e-01
## educ2 0.9816962 0.4852687 2.0229952 19.97432 5.666501e-02
## educ3 0.1706885 0.3286549 0.5193547 413.43180 6.037913e-01
## educ4 0.1109902 0.2015461 0.5506938 3846.27487 5.818756e-01
## educ5 -0.7917089 0.2108900 -3.7541320 193.17256 2.300566e-04
## black2 2.4760851 0.2649433 9.3457152 136.31843 2.220446e-16
## hispanic2 0.8654392 0.2135866 4.0519356 399.72907 6.104608e-05
## lo 95 hi 95 nmis fmi lambda
## (Intercept) 24.65132798 26.26430091 NA 0.13828603 0.13069804
## inc2 -1.14469778 -0.01798532 NA 0.20812702 0.19322657
## inc3 -1.38618779 -0.21296705 NA 0.03889433 0.03793941
## inc4 -1.45511781 -0.06260512 NA 0.37107244 0.33625729
## inc5 -1.73357662 -0.66821934 NA 0.18346234 0.17131160
## agec2 2.03783586 3.59556001 NA 0.19573670 0.18223651
## agec3 2.88277385 4.25582745 NA 0.10072339 0.09630167
## agec4 2.44424751 3.78630290 NA 0.05877721 0.05698713
## agec5 -0.34264767 1.18858924 NA 0.02292614 0.02241695
## educ2 -0.03063993 1.99403236 NA 0.49462411 0.44643414
## educ3 -0.47535456 0.81673149 NA 0.09969444 0.09534967
## educ4 -0.28415727 0.50613763 NA 0.02286819 0.02236023
## educ5 -1.20765151 -0.37576620 NA 0.15049545 0.14174545
## black2 1.95215449 3.00001566 NA 0.18135604 0.16943275
## hispanic2 0.44554577 1.28533260 NA 0.10155164 0.09706757
#get the coefficients from each of the 5 imputations of bmi
coefs2<-matrix(unlist(lapply(fit.bmi2$analyses, coef)), nrow=5, ncol=15, byrow=T)
#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
plot(coefs2[1,-1], ylim=c(-2, 4), xaxt="n",cex=1.5, pch=20, ylab="beta")
axis(1, at=1:14, labels=names(fit.bmi2$analyses[[1]]$coef[-1]))
cols=2:5
for(i in 1:dim(coefs2)[1]-1){
points(coefs2[i+1,-1], col=cols[i], cex=1.5, pch=20)
}
title(main="Estimated Betas from Each Imputed Regression Model for BMI Outcome")