I’ll be using the BRFSS 2020 data set since previous data I used did not have very many missing values to do imputation on.
library(car)
library(stargazer)
library(survey)
library(ggplot2)
library(pander)
library(dplyr)
library(knitr)
library(factoextra)
library(FactoMineR)
library(mice)
brfss20<- readRDS(url("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))
names(brfss20) <- tolower(gsub(pattern = "_",replacement = "",x = names(brfss20)))
#smaller sample
samps<- sample(1:dim(brfss20)[1], size = 10000, replace = F)
brfss20<- brfss20[samps,]
brfss20$drink <- Recode(brfss20$drnkwk1, recodes = "99900=NA")
#sex
brfss20$male<-as.factor(ifelse(brfss20$sex==1,
"Male",
"Female"))
#binge drinking
brfss20$bingedrink <- Recode(brfss20$drnk3ge5, recodes = "88=0; 77=NA; 99=NA")
#smoking
brfss20$smoke <- Recode(brfss20$smoke100, recodes = "2=0; 7=NA; 9=NA", as.factor=T)
#drink and drive
brfss20$drinkdrive <- Recode(brfss20$drnkdri2, recodes = "88=0; 77=NA; 99=NA")
#sealt belt useage
brfss20$seatbeltdrive <- Recode(brfss20$seatbelt, recodes = "1:2 = 1; 3:5 = 0; 7=NA; 8 = NA; 9=NA")
#sleep per night
brfss20$sleep <- Recode(brfss20$sleptim1, recodes = "77=NA; 99=NA")
#exercise
brfss20$exercise <- Recode(brfss20$exerany2, recodes = "2=0; 7=NA; 9=NA")
#healthy days
brfss20$healthdays<-Recode(brfss20$physhlth, recodes = "88=0; 77=NA; 99=NA", as.factor=T)
#bmi
brfss20$bmi<-brfss20$bmi5/100
#income
brfss20$income <- Recode(brfss20$income2, recodes = "77=NA; 99=NA", as.factor=T)
#marital
brfss20$marst<-Recode(brfss20$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
#employ
brfss20$employ<-Recode(brfss20$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss20a <- brfss20 %>% select(healthdays, smoke, bmi, income, marst, employ)
summary(brfss20a)
## healthdays smoke bmi income marst
## 0 :7002 0 :5762 Min. :12.88 8 :3331 cohab : 412
## 30 : 526 1 :3677 1st Qu.:23.81 7 :1251 divorced :1230
## 2 : 440 NA's: 561 Median :27.12 6 :1054 married :4986
## 1 : 342 Mean :28.13 5 : 685 nm :2069
## 3 : 266 3rd Qu.:31.19 4 : 602 separated: 230
## (Other):1195 Max. :76.52 (Other): 983 widowed : 950
## NA's : 229 NA's :1118 NA's :2094 NA's : 123
## employ
## Employed:5209
## nilf :1373
## retired :2668
## unable : 533
## NA's : 217
##
##
mcv.marriage <- factor(names(which.max(table(brfss20$marst))),
levels = levels(brfss20$marst))
mcv.marriage
## [1] married
## Levels: cohab divorced married nm separated widowed
brfss20$marst.imp<-as.factor(ifelse(is.na(brfss20$marst)==T, mcv.marriage, brfss20$marst))
levels(brfss20$marst.imp)<-levels(brfss20$marst)
prop.table(table(brfss20$marst))
##
## cohab divorced married nm separated widowed
## 0.04171307 0.12453174 0.50480915 0.20947656 0.02328642 0.09618305
prop.table(table(brfss20$marst.imp))
##
## cohab divorced married nm separated widowed
## 0.0412 0.1230 0.5109 0.2069 0.0230 0.0950
mcv.employ <- factor(names(which.max(table(brfss20$employ))),
levels = levels(brfss20$employ))
mcv.employ
## [1] Employed
## Levels: Employed nilf retired unable
brfss20$employ.imp<-as.factor(ifelse(is.na(brfss20$employ)==T, mcv.employ, brfss20$employ))
levels(brfss20$employ.imp)<-levels(brfss20$employ)
prop.table(table(brfss20$employ))
##
## Employed nilf retired unable
## 0.53245426 0.14034550 0.27271798 0.05448227
prop.table(table(brfss20$employ.imp))
##
## Employed nilf retired unable
## 0.5426 0.1373 0.2668 0.0533
mcv.healthdays <- factor(names(which.max(table(brfss20$healthdays))),
levels = levels(brfss20$healthdays))
mcv.healthdays
## [1] 0
## 31 Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 30
brfss20$healthdays.imp<-as.factor(ifelse(is.na(brfss20$healthdays)==T, mcv.healthdays, brfss20$healthdays))
levels(brfss20$healthdays.imp)<-levels(brfss20$healthdays)
prop.table(table(brfss20$healthdays))
##
## 0 1 2 3 4 5
## 0.7166103776 0.0350015352 0.0450312148 0.0272234162 0.0122812404 0.0251765428
## 6 7 8 9 10 11
## 0.0037867158 0.0152492068 0.0022515607 0.0008187494 0.0151468632 0.0002046873
## 12 13 14 15 16 17
## 0.0022515607 0.0006140620 0.0079828063 0.0150445195 0.0004093747 0.0007164057
## 18 19 20 21 22 23
## 0.0006140620 0.0003070310 0.0106437417 0.0020468734 0.0002046873 0.0001023437
## 24 25 26 27 28 29
## 0.0004093747 0.0034796848 0.0001023437 0.0004093747 0.0015351551 0.0005117184
## 30
## 0.0538327704
prop.table(table(brfss20$healthdays.imp))
##
## 0 1 2 3 4 5 6 7 8 9 10
## 0.7231 0.0342 0.0440 0.0266 0.0120 0.0246 0.0037 0.0149 0.0022 0.0008 0.0148
## 11 12 13 14 15 16 17 18 19 20 21
## 0.0002 0.0022 0.0006 0.0078 0.0147 0.0004 0.0007 0.0006 0.0003 0.0104 0.0020
## 22 23 24 25 26 27 28 29 30
## 0.0002 0.0001 0.0004 0.0034 0.0001 0.0004 0.0015 0.0005 0.0526
mcv.smoke <- factor(names(which.max(table(brfss20$smoke))),
levels = levels(brfss20$smoke))
mcv.smoke
## [1] 0
## Levels: 0 1
brfss20$smoke.imp<-as.factor(ifelse(is.na(brfss20$smoke)==T, mcv.smoke, brfss20$smoke))
levels(brfss20$smoke.imp)<-levels(brfss20$smoke)
prop.table(table(brfss20$smoke))
##
## 0 1
## 0.610446 0.389554
prop.table(table(brfss20$smoke.imp))
##
## 0 1
## 0.6323 0.3677
mcv.income <- factor(names(which.max(table(brfss20$income))),
levels = levels(brfss20$income))
mcv.income
## [1] 8
## Levels: 1 2 3 4 5 6 7 8
brfss20$income.imp<-as.factor(ifelse(is.na(brfss20$income)==T, mcv.income, brfss20$income))
levels(brfss20$income.imp)<-levels(brfss20$income)
prop.table(table(brfss20$income))
##
## 1 2 3 4 5 6 7
## 0.03389831 0.03706046 0.05337718 0.07614470 0.08664306 0.13331647 0.15823425
## 8
## 0.42132558
prop.table(table(brfss20$income.imp))
##
## 1 2 3 4 5 6 7 8
## 0.0268 0.0293 0.0422 0.0602 0.0685 0.1054 0.1251 0.5425
barplot(prop.table(table(brfss20$income)), main="Original Data", ylim=c(0, .6))
barplot(prop.table(table(brfss20$income.imp)), main="Imputed Data",ylim=c(0, .6))
brfss20$bmi.imp.mean<-ifelse(is.na(brfss20$bmi)==T, mean(brfss20$bmi, na.rm=T), brfss20$bmi)
var(brfss20$bmi, na.rm=T)
## [1] 39.35813
var(brfss20$bmi.imp.mean)
## [1] 34.95745
modalfit <- lm(bmi.imp.mean ~ marst.imp + employ.imp + income.imp + healthdays.imp + smoke.imp, brfss20)
modalfit
##
## Call:
## lm(formula = bmi.imp.mean ~ marst.imp + employ.imp + income.imp +
## healthdays.imp + smoke.imp, data = brfss20)
##
## Coefficients:
## (Intercept) marst.impdivorced marst.impmarried marst.impnm
## 28.50971 0.58690 0.55605 -0.15798
## marst.impseparated marst.impwidowed employ.impnilf employ.impretired
## 0.40149 -0.03181 -0.94370 -0.60174
## employ.impunable income.imp2 income.imp3 income.imp4
## 1.31099 -0.26913 0.24923 -0.01683
## income.imp5 income.imp6 income.imp7 income.imp8
## -0.10722 -0.46312 -0.72192 -1.31292
## healthdays.imp1 healthdays.imp2 healthdays.imp3 healthdays.imp4
## -0.26947 0.74340 1.43539 0.46237
## healthdays.imp5 healthdays.imp6 healthdays.imp7 healthdays.imp8
## 1.31948 1.16212 0.97237 1.57721
## healthdays.imp9 healthdays.imp10 healthdays.imp11 healthdays.imp12
## 5.94323 1.47129 18.68219 0.70291
## healthdays.imp13 healthdays.imp14 healthdays.imp15 healthdays.imp16
## 3.47764 2.13197 1.42013 0.15513
## healthdays.imp17 healthdays.imp18 healthdays.imp19 healthdays.imp20
## 5.04144 1.78923 4.47878 2.53205
## healthdays.imp21 healthdays.imp22 healthdays.imp23 healthdays.imp24
## -1.20322 9.33809 2.72118 8.34987
## healthdays.imp25 healthdays.imp26 healthdays.imp27 healthdays.imp28
## 1.89663 -11.45502 3.96500 3.32930
## healthdays.imp29 healthdays.imp30 smoke.imp1
## -0.97005 2.05718 0.05575
I used a mix of mean and modal imputation for the first attempt. The tables show how the modal imputation can make noticeable differences.
md.pattern(brfss20[,c("bmi", "healthdays", "smoke","income","marst", "employ")])
## marst employ healthdays smoke bmi income
## 7076 1 1 1 1 1 1 0
## 1314 1 1 1 1 1 0 1
## 414 1 1 1 1 0 1 1
## 255 1 1 1 1 0 0 2
## 127 1 1 1 0 1 1 1
## 56 1 1 1 0 1 0 2
## 59 1 1 1 0 0 1 2
## 182 1 1 1 0 0 0 3
## 124 1 1 0 1 1 1 1
## 49 1 1 0 1 1 0 2
## 10 1 1 0 1 0 1 2
## 11 1 1 0 1 0 0 3
## 3 1 1 0 0 1 1 2
## 3 1 1 0 0 1 0 3
## 1 1 1 0 0 0 1 3
## 7 1 1 0 0 0 0 4
## 39 1 0 1 1 1 1 1
## 18 1 0 1 1 1 0 2
## 5 1 0 1 1 0 1 2
## 14 1 0 1 1 0 0 3
## 1 1 0 1 0 1 1 2
## 2 1 0 1 0 1 0 3
## 1 1 0 1 0 0 1 3
## 92 1 0 1 0 0 0 4
## 4 1 0 0 1 1 1 2
## 3 1 0 0 1 1 0 3
## 2 1 0 0 1 0 1 3
## 2 1 0 0 1 0 0 4
## 3 1 0 0 0 0 0 5
## 29 0 1 1 1 1 1 1
## 26 0 1 1 1 1 0 2
## 7 0 1 1 1 0 1 2
## 19 0 1 1 1 0 0 3
## 2 0 1 1 0 1 0 3
## 1 0 1 1 0 0 1 3
## 4 0 1 1 0 0 0 4
## 1 0 1 0 1 1 0 3
## 2 0 1 0 1 0 0 4
## 1 0 1 0 0 0 1 4
## 1 0 0 1 1 1 1 2
## 3 0 0 1 1 1 0 3
## 9 0 0 1 1 0 0 4
## 15 0 0 1 0 0 0 5
## 1 0 0 0 1 1 0 4
## 1 0 0 0 1 0 0 5
## 1 0 0 0 0 0 1 5
## 123 217 229 561 1118 2094 4342
dat2<-brfss20
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$bmiknock<-dat2$bmi
dat2$bmiknock[samp2]<-NA
head(dat2[, c("bmiknock","bmi")])
## # A tibble: 6 × 2
## bmiknock bmi
## <dbl> <dbl>
## 1 NA NA
## 2 26.5 26.5
## 3 NA NA
## 4 NA 27.8
## 5 25.7 25.7
## 6 41.2 41.2
imp<-mice(data = dat2[,c("bmi", "healthdays", "smoke","income","marst", "employ")], seed = 7, m = 4)
##
## iter imp variable
## 1 1 bmi healthdays smoke income marst employ
## 1 2 bmi healthdays smoke income marst employ
## 1 3 bmi healthdays smoke income marst employ
## 1 4 bmi healthdays smoke income marst employ
## 2 1 bmi healthdays smoke income marst employ
## 2 2 bmi healthdays smoke income marst employ
## 2 3 bmi healthdays smoke income marst employ
## 2 4 bmi healthdays smoke income marst employ
## 3 1 bmi healthdays smoke income marst employ
## 3 2 bmi healthdays smoke income marst employ
## 3 3 bmi healthdays smoke income marst employ
## 3 4 bmi healthdays smoke income marst employ
## 4 1 bmi healthdays smoke income marst employ
## 4 2 bmi healthdays smoke income marst employ
## 4 3 bmi healthdays smoke income marst employ
## 4 4 bmi healthdays smoke income marst employ
## 5 1 bmi healthdays smoke income marst employ
## 5 2 bmi healthdays smoke income marst employ
## 5 3 bmi healthdays smoke income marst employ
## 5 4 bmi healthdays smoke income marst employ
## Warning: Number of logged events: 16
print(imp)
## Class: mids
## Number of multiple imputations: 4
## Imputation methods:
## bmi healthdays smoke income marst employ
## "pmm" "polyreg" "logreg" "polyreg" "polyreg" "polyreg"
## PredictorMatrix:
## bmi healthdays smoke income marst employ
## bmi 0 1 1 1 1 1
## healthdays 1 0 1 1 1 1
## smoke 1 1 0 1 1 1
## income 1 1 1 0 1 1
## marst 1 1 1 1 0 1
## employ 1 1 1 1 1 0
## Number of logged events: 16
## it im dep meth out
## 1 1 1 income polyreg healthdays26
## 2 1 2 income polyreg healthdays26
## 3 1 3 income polyreg healthdays26
## 4 1 4 income polyreg healthdays26
## 5 2 1 income polyreg healthdays26
## 6 2 3 income polyreg healthdays26
plot(imp)
fit.bmi<-with(data=imp ,expr=lm(bmi~healthdays+smoke+income+marst+employ))
fit.bmi
## call :
## with.mids(data = imp, expr = lm(bmi ~ healthdays + smoke + income +
## marst + employ))
##
## call1 :
## mice(data = dat2[, c("bmi", "healthdays", "smoke", "income",
## "marst", "employ")], m = 4, seed = 7)
##
## nmis :
## bmi healthdays smoke income marst employ
## 1118 229 561 2094 123 217
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
##
## Coefficients:
## (Intercept) healthdays1 healthdays2 healthdays3 healthdays4
## 28.19726 -0.25091 0.85013 1.64730 0.84283
## healthdays5 healthdays6 healthdays7 healthdays8 healthdays9
## 1.48941 1.50840 1.13981 1.60669 6.07123
## healthdays10 healthdays11 healthdays12 healthdays13 healthdays14
## 1.63952 18.80190 1.27275 6.37808 2.80542
## healthdays15 healthdays16 healthdays17 healthdays18 healthdays19
## 1.65587 0.34986 5.28614 1.93008 7.09659
## healthdays20 healthdays21 healthdays22 healthdays23 healthdays24
## 2.67213 -1.70368 9.16811 3.15394 6.94956
## healthdays25 healthdays26 healthdays27 healthdays28 healthdays29
## 2.46149 -12.17997 2.69344 4.49536 -1.17494
## healthdays30 smoke1 income2 income3 income4
## 2.35745 0.03910 -0.30122 0.21496 0.11233
## income5 income6 income7 income8 marstdivorced
## 0.28607 -0.38674 -0.60542 -1.27028 0.57068
## marstmarried marstnm marstseparated marstwidowed employnilf
## 0.74910 -0.32093 0.34471 -0.08591 -0.98734
## employretired employunable
## -0.68229 1.41011
##
##
## [[2]]
##
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
##
## Coefficients:
## (Intercept) healthdays1 healthdays2 healthdays3 healthdays4
## 28.303075 -0.340933 0.683127 1.519277 0.515168
## healthdays5 healthdays6 healthdays7 healthdays8 healthdays9
## 1.093098 0.925047 1.279870 1.580463 5.712779
## healthdays10 healthdays11 healthdays12 healthdays13 healthdays14
## 1.539549 18.696583 0.430299 6.256804 2.829955
## healthdays15 healthdays16 healthdays17 healthdays18 healthdays19
## 1.698378 0.215151 4.648968 1.842550 4.823718
## healthdays20 healthdays21 healthdays22 healthdays23 healthdays24
## 2.959699 -0.718991 9.042631 2.628872 8.351185
## healthdays25 healthdays26 healthdays27 healthdays28 healthdays29
## 1.945206 -12.995989 3.833236 5.022940 -1.278370
## healthdays30 smoke1 income2 income3 income4
## 2.048114 -0.031526 -0.175398 -0.087313 -0.163713
## income5 income6 income7 income8 marstdivorced
## -0.217119 -0.433789 -0.685998 -1.212250 0.849371
## marstmarried marstnm marstseparated marstwidowed employnilf
## 0.898106 0.040303 0.726821 0.004742 -1.147596
## employretired employunable
## -0.748357 1.507619
##
##
## [[3]]
##
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
##
## Coefficients:
## (Intercept) healthdays1 healthdays2 healthdays3 healthdays4
## 28.877425 -0.008054 0.843664 1.534663 0.491331
## healthdays5 healthdays6 healthdays7 healthdays8 healthdays9
## 1.412764 1.474607 1.150435 1.612564 5.672960
## healthdays10 healthdays11 healthdays12 healthdays13 healthdays14
## 1.544565 15.022443 0.311428 4.151501 2.501833
## healthdays15 healthdays16 healthdays17 healthdays18 healthdays19
## 1.484717 0.251446 5.179148 2.163598 3.312584
## healthdays20 healthdays21 healthdays22 healthdays23 healthdays24
## 2.897171 -1.670294 9.503182 2.681803 8.389422
## healthdays25 healthdays26 healthdays27 healthdays28 healthdays29
## 1.857376 -12.993680 4.237103 5.798033 -0.897972
## healthdays30 smoke1 income2 income3 income4
## 2.171577 -0.012529 -0.268680 -0.063604 -0.255409
## income5 income6 income7 income8 marstdivorced
## -0.652926 -0.945623 -1.220824 -1.666822 0.580138
## marstmarried marstnm marstseparated marstwidowed employnilf
## 0.432289 -0.132406 0.473893 -0.375350 -1.214328
## employretired employunable
## -0.677602 1.263626
##
##
## [[4]]
##
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
##
## Coefficients:
## (Intercept) healthdays1 healthdays2 healthdays3 healthdays4
## 28.037218 -0.312995 0.720361 1.360711 0.606892
## healthdays5 healthdays6 healthdays7 healthdays8 healthdays9
## 1.289100 2.216713 1.183092 1.195000 5.989801
## healthdays10 healthdays11 healthdays12 healthdays13 healthdays14
## 1.693272 18.662799 0.501123 5.381076 2.333498
## healthdays15 healthdays16 healthdays17 healthdays18 healthdays19
## 1.750627 0.261726 5.326209 2.028709 3.717946
## healthdays20 healthdays21 healthdays22 healthdays23 healthdays24
## 2.425132 -2.175929 9.300465 3.019207 8.494965
## healthdays25 healthdays26 healthdays27 healthdays28 healthdays29
## 2.208981 -12.780378 3.828195 3.184567 -1.158696
## healthdays30 smoke1 income2 income3 income4
## 2.075422 -0.003936 0.301801 0.379315 0.111738
## income5 income6 income7 income8 marstdivorced
## 0.556141 -0.141529 -0.421603 -1.175965 0.763116
## marstmarried marstnm marstseparated marstwidowed employnilf
## 0.810793 -0.120460 0.667039 -0.012388 -1.076503
## employretired employunable
## -0.710545 1.033915
Overall, the results are somewhat similar to the modal/mean imputation to the multiply imputated data.