In this example, we will use Bayesian hierarchical models to impute missing data. I will first work through a simple case where we use the posterior predictive distribution to impute a single missing value.
The second case imputes county-level poverty rates in Texas.
The final example imputes student math scores from a longitudinal data set (the ECLS-K).
We will use OpenBUGS to illustrate this type of imputation.
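Throughout, the mechanism doing the work is simple: when a data value passed to BUGS is NA, BUGS treats that node as an unknown quantity and samples it from its posterior predictive distribution at each iteration. A minimal sketch of the idea, with hypothetical variable names:
model{
 for (i in 1:N){
  #any y[i] supplied as NA is sampled from dnorm(mu, tau) at each iteration
  y[i]~dnorm(mu, tau)
 }
 mu~dnorm(0, .001)
 tau<-pow(sd,-2)
 sd~dunif(0,100)
}
The county-level model below exploits exactly this behavior, just with richer structure on the means.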
This case involves estimating missing county-level poverty rates for Texas from the American Community Survey. Here we have missing values both in the outcome (the poverty rate) and in a predictor (the unemployment rate). We build a model that jointly estimates the missing unemployment values, then uses those estimates to help impute the missing poverty values. Since all of the data are actually observed, we artificially knock out a sample of counties and then compare our imputed estimates to the observed values we removed. This is a cross-validation experiment.
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
library(car)
library(spdep)
## Loading required package: Matrix
library(R2OpenBUGS)
library(coda)
## Loading required package: lattice
dat<-readShapePoly("/Users/ozd504/Google Drive/dem7903_App_Hier/Bayes/TXsda.shp")
## Field name: '_FREQ_' changed to: 'X_FREQ_'
#recode poverty and unemployment rates of exactly 0 to missing
dat$ppoor2k8<-ifelse(dat$ppoor2k8==0,NA, dat$ppoor2k8)
dat$punem2k8<-ifelse(dat$punem2k8==0,NA, dat$punem2k8)
dat2<-dat@data
#make copies of the observed 2000 rates; the artificial missing values go into these copies
dat2$mispov<-dat2$ppoor2k
dat2$misunem<-dat2$punemp2k
set.seed(1234)
#Here we artificially set the rates to missing for 75 counties; the two samples are
#drawn independently, so poverty and unemployment are missing in (mostly) different counties.
dat2$mispov[sample(1:length(dat2$STATE), size=75, replace=F)]<-NA
dat2$misunem[sample(1:length(dat2$STATE), size=75, replace=F)]<-NA
summary(dat2$ppoor2k)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 13.3 16.5 17.3 19.5 50.9
summary(dat2$mispov)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 13.2 16.5 17.3 19.4 50.9 75
#indices of the counties whose poverty rates were knocked out
missings<-which(is.na(dat2$mispov))
N<-length(dat$STATE)
#queen-contiguity neighbor list, converted to the format GeoBUGS expects
nb<-poly2nb(dat, queen = T)
wts<-nb2WB(nb)
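The nb2WB() function converts the neighbor list into the vectors of neighbor indices, neighbor counts, and weights that GeoBUGS's car.normal() distribution expects; a quick check of its structure (output not shown):
str(wts) #a list with elements adj, weights, and num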
model1<-"
model{
for(i in 1:N)
{
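#likelihoods on the logit scale; any pov[i] or unem[i] supplied as NA is
#sampled from its posterior predictive distribution, which is the imputation step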
pov[i]~dnorm(mu[i], taup)
unem[i]~dnorm(muun[i],tauu)
mu[i] <- alpha + beta*muun[i]+v[i]+u[i]
muun[i]<-alpha2+vu[i]+uu[i]
v[i]~dnorm(0, tau1)
vu[i]~dnorm(0, tau2)
}
#spatially correlated (intrinsic CAR) random effects for each equation
u[1:N]~car.normal(adj[], weight[], num[], tau3)
uu[1:N]~car.normal(adj[], weight[], num[], tau4)
for (i in 1:sumNumNeigh){weight[i]<-1}
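#improper flat priors on the intercepts; GeoBUGS requires dflat() for intercepts
#used with car.normal(), since the CAR effects are constrained to sum to zero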
alpha~dflat()
alpha2~dflat()
beta~dnorm(0,.01)
taup<-pow(sdp,-2)
sdp~dunif(0,100)
tauu<-pow(sdu,-2)
sdu~dunif(0,100)
tau1<-pow(sd1,-2)
sd1~dunif(0,100)
tau2<-pow(sd2,-2)
sd2~dunif(0,100)
tau3<-pow(sd3,-2)
sd3~dunif(0,100)
tau4<-pow(sd4,-2)
sd4~dunif(0,100)
}
"
write(model1, "/Users/ozd504/mod1.txt")
#generate the data
#logit-transform the rates so predictions cannot fall outside the (0,1) interval;
#adjust=.01 keeps rates of exactly 0 or 1 finite on the logit scale
mdat<-list(pov=car::logit(dat2$mispov/100, adjust=.01),
           unem=car::logit(dat2$misunem/100, adjust=.01),
           N=N, adj=wts$adj, num=wts$num, sumNumNeigh=sum(wts$num))
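As a quick illustration of the adjust argument (output not shown), proportions of exactly 0 or 1 remain finite on the logit scale:
#without adjust, logit(0) and logit(1) would be -Inf and Inf
car::logit(c(0, .5, 1), adjust=.01)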
#crude OLS fits to supply starting values for the intercepts and the slope
b<-coef(lm(mdat$pov~mdat$unem))
bu<-coef(lm(mdat$unem~1))
#initial values for the two chains; the missing pov and unem values are left
#uninitialized, so OpenBUGS generates starting values for them
in1<-list(alpha=b[1], alpha2=bu[1], u=rep(0, N), uu=rep(0, N), v=rep(0, N),
          vu=rep(0, N), sdp=1, sdu=1, sd1=1, sd2=1, sd3=1, sd4=1, beta=b[2])
in2<-list(alpha=b[1], alpha2=bu[1], u=rep(0, N), uu=rep(0, N), v=rep(0, N),
          vu=rep(0, N), sdp=.1, sdu=.1, sd1=.1, sd2=.1, sd3=.1, sd4=1, beta=b[2])
#write out the data so OpenBUGS can read it
bugs.data(data=mdat, dir="/Users/ozd504/", data.file="data.txt")
## [1] "data.txt"
bugs.data(data=in1, dir="/Users/ozd504/", data.file="inits1.txt")
## [1] "inits1.txt"
bugs.data(data=in2, dir="/Users/ozd504/", data.file="inits2.txt")
## [1] "inits2.txt"
#Set where we want to work
setwd("/Users/ozd504/")
WINE="/opt/local/bin/wine"
WINEPATH="/opt/local/bin/winepath"
OpenBUGS.pgm="/Users/ozd504/.wine/drive_c/Program\ Files/OpenBUGS/OpenBUGS323/OpenBUGS.exe"
#Run OpenBUGS on our model, monitoring the intercepts, the slope, and the hyperparameters
#for the random effects. If these converge, the random effects themselves will most
#likely have converged as well.
#Burn in for 50,000 iterations and run 70,000 in total; the remaining 20,000
#(thinned by 5) are kept for inference.
#Since this is run on a Mac, I feed bugs() the information for my wine installation.
res1 <- bugs(data = "data.txt",
inits = list( in1, in2),
parameters.to.save = c("alpha", "alpha2", "beta","sdp", "sdu", "sd1", "sd2", "sd3", "sd4"),
model.file = "mod1.txt",
n.chains = 2,
n.iter=70000,
n.burnin=50000,
n.thin=5,
bugs.seed=11,
working.directory="/Users/ozd504/", debug=F,
OpenBUGS.pgm=OpenBUGS.pgm, WINE=WINE, WINEPATH=WINEPATH,useWINE=T)
#convert the output to a coda object so we can run convergence diagnostics
res1mc<-as.mcmc.list(res1)
gelman.diag(res1mc)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## alpha 1.16 1.51
## alpha2 1.00 1.01
## beta 1.16 1.50
## deviance 1.03 1.09
## sd1 1.05 1.05
## sd2 1.02 1.07
## sd3 1.02 1.05
## sd4 1.07 1.10
## sdp 1.09 1.30
## sdu 1.04 1.05
##
## Multivariate psrf
##
## 1.12
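The upper confidence limits for alpha, beta, and sdp are on the high side (1.3 to 1.5), so it is worth eyeballing the chains before trusting the estimates:
#trace and density plots of the monitored parameters (output not shown)
plot(res1mc)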
#a second, shorter run monitoring the fitted means mu and muun, whose posterior
#means (inverse-logit transformed) are our imputed rates
res2 <- bugs(data = "data.txt",
inits = list( in1, in2),
parameters.to.save = c("mu", "muun"),
model.file = "mod1.txt",
n.chains = 2,
n.iter=30000,
n.burnin=25000,
n.thin=5,
bugs.seed=11,
working.directory="/Users/ozd504/", debug=F,
OpenBUGS.pgm=OpenBUGS.pgm, WINE=WINE, WINEPATH=WINEPATH,useWINE=T)
res2mc<-as.mcmc.list(res2)
#Numerical summary of the fitted means, for observed and imputed counties alike
sums<-summary(res2mc)
library(boot)
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:lattice':
##
## melanoma
##
## The following object is masked from 'package:car':
##
## logit
#back-transform the posterior means from the logit scale to proportions
dat$imppov<-inv.logit(res2$mean$mu)
dat$impunemp<-inv.logit(res2$mean$muun)
#compare the first 15 (possibly missing) observed rates to the model estimates
data.frame(observed.pov=(head(inv.logit(mdat$pov), n=15)), est.pov=head(dat$imppov, n=15))
## observed.pov est.pov
## 1 0.17141 0.17209
## 2 0.17093 0.17164
## 3 NA 0.17725
## 4 0.20515 0.20524
## 5 0.09779 0.09924
## 6 0.11393 0.11504
## 7 0.20797 0.20764
## 8 0.12813 0.12840
## 9 NA 0.20050
## 10 NA 0.16887
## 11 NA 0.12516
## 12 0.16800 0.16768
## 13 0.24478 0.24338
## 14 0.12838 0.12996
## 15 0.16554 0.16529
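Point estimates are only part of the story: the coda summary computed above also holds credible intervals for each county's mean. A short sketch of extracting the 95% intervals for the poverty means and back-transforming them, assuming coda's usual "mu[1]", "mu[2]", ... row naming:
#rows of the quantile matrix that belong to the poverty means
mu.rows<-grep("^mu\\[", rownames(sums$quantiles))
#95% credible intervals on the rate scale (output not shown)
head(inv.logit(sums$quantiles[mu.rows, c("2.5%", "97.5%")]))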
So we have our imputed values. Let's compare them to the actual known values and see how the estimation did.
dat$BayesianEst<-dat$imppov
dat$unem.est<-dat$impunemp
dat$MissingCounties<-dat2$mispov
dat$misunem<-dat2$misunem
dat$PovertyRate<-dat$ppoor2k/100
#use the model estimate where the rate was knocked out, the observed rate otherwise
dat$Imputed<-ifelse(is.na(dat$MissingCounties), dat$BayesianEst, dat$PovertyRate)
#percentage errors of the imputed estimates on the withheld counties
errs1<-(dat$BayesianEst[missings]-dat$PovertyRate[missings])/dat$PovertyRate[missings]
#mean percentage error; note the sum is divided by the total number of counties N,
#not by the 75 withheld counties
mpe<-sum(errs1[is.finite(errs1)])*1/N
mpe
## [1] 0.01572
#mean absolute percentage error, on the same denominator
errs3<-abs((dat$BayesianEst[missings]-dat$PovertyRate[missings])/dat$PovertyRate[missings])
mape<-100*sum(errs3[is.finite(errs3)])*1/N
mape
## [1] 6.553
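Another simple accuracy check is the root mean squared error over the withheld counties (a quick sketch, output not shown):
#RMSE of the imputed poverty rates for the knocked-out counties
sqrt(mean((dat$BayesianEst[missings]-dat$PovertyRate[missings])^2, na.rm=TRUE))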
#map the observed poverty rates next to the Bayesian estimates
spplot(dat, c("PovertyRate", "BayesianEst"))
So the model recovers the withheld rates quite well. Keep in mind, though, that the MAPE above spreads the error sum over all N counties; averaged over only the 75 withheld counties, the typical absolute error is closer to 22%.
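A scatterplot of the withheld observed rates against their imputed values makes the same point graphically:
#observed vs. imputed poverty rates for the knocked-out counties
plot(dat$PovertyRate[missings], dat$BayesianEst[missings],
     xlab="Observed poverty rate", ylab="Imputed poverty rate")
abline(0, 1) #45-degree line marks perfect agreement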