library(car)
library(mice)
## Warning: package 'mice' was built under R version 4.0.5
library(ggplot2)
library(dplyr)
library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(tidyverse)
library(haven)
library(readr)
library(lattice)
load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))
set.seed(1234)
samp<-sample(1:dim(brfss_17)[1], size = 25000, replace = F) #smaller sample for brevity
brfss_17<-brfss_17[samp,]
#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")
# sex
brfss_17$male<-ifelse(brfss_17$sex==1,1,0)
brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")
#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")
brfss_17$race_eth<-Recode(brfss_17$racegr3,
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")
#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")
#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)
#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')
#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')
#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')
#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))
#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's
brfss_17$bmi<-brfss_17$bmi5/100
#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3,
recodes="1:2=1; 3:4=0; else=NA")
brfss_17$skcancer<-Recode(brfss_17$chcscncr, recodes = "1= 1; 2= 0; else=NA")
brfss_17$kidney<-Recode(brfss_17$chckidny, recodes = "1= 1; 2= 0; else=NA")
brfss_17$chrobpuld<-Recode(brfss_17$chccopd1, recodes = "1= 1; 2= 0; else=NA")
brfss_17$arthritis<-Recode(brfss_17$havarth3, recodes = "1= 1; 2= 0; else=NA")
brfss_17$depdisorder<-Recode(brfss_17$addepev2, recodes = "1= 1; 2:4=0; else=NA")
summary(brfss_17)summary(brfss_17[, c("ins", "race_eth","agec", "educ","badhealth", "skcancer", "marst", "smoke")])
## ins race_eth agec educ
## Min. :0.0000 nhwhite :18106 (0,24] :1544 2hsgrad : 6046
## 1st Qu.:1.0000 hispanic : 2326 (24,39]:4398 0Prim : 564
## Median :1.0000 nh black : 2594 (39,59]:7885 1somehs : 968
## Mean :0.9229 nh multirace: 443 (59,79]:9238 3somecol: 6861
## 3rd Qu.:1.0000 nh other : 1008 (79,99]:1935 4colgrad:10435
## Max. :1.0000 NA's : 523 NA's : 126
## NA's :111
## badhealth skcancer marst smoke
## Min. :0.0000 Min. :0.00000 married :12600 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 cohab : 823 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 divorced : 3412 Median :0.0000
## Mean :0.1756 Mean :0.09452 nm : 4526 Mean :0.1363
## 3rd Qu.:0.0000 3rd Qu.:0.00000 separated: 557 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 widowed : 2860 Max. :1.0000
## NA's :67 NA's :85 NA's : 222 NA's :1025
The result shows that, the smoke variable has the highest missing number with 1025 people missing in the data, or 4.1% of the sample. Bad health has the lowest number of missing with 67 missing or 0.268% missing. Age has no missing values
#I'm using skin cancer (skcancer) as my outcome variable
summary(brfss_17$skcancer)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00000 0.00000 0.00000 0.09452 0.00000 1.00000 85
#what happens when we replace the missing with the mean?
brfss_17$skcancer.imp.mean<-ifelse(is.na(brfss_17$skcancer)==T, mean(brfss_17$skcancer, na.rm=T), brfss_17$skcancer)
mean(brfss_17$skcancer, na.rm=T)
## [1] 0.09452137
mean(brfss_17$skcancer.imp.mean) #no difference!
## [1] 0.09452137
fit<-lm(skcancer~ins+educ+smoke+race_eth+marst, brfss_17)
median(brfss_17$skcancer, na.rm=T)
## [1] 0
median(brfss_17$skcancer.imp.mean) # No difference
## [1] 0
var(brfss_17$skcancer, na.rm=T)
## [1] 0.08559052
var(brfss_17$skcancer.imp.mean) # slight noticeable difference!
## [1] 0.0852995
The result shows that imputing with the mean does nothing to central tendency (when measured using the mean and the median), but it does reduce the variance in the outcome. This is because am replacing all missing cases with the most likely value (the mean), so am artificially deflating the variance. That’s not good.
#look at the patterns of missingness
md.pattern(brfss_17[,c("skcancer", "ins","smoke","agec","educ","race_eth")])
## agec skcancer ins educ race_eth smoke
## 23276 1 1 1 1 1 1 0
## 945 1 1 1 1 1 0 1
## 429 1 1 1 1 0 1 1
## 37 1 1 1 1 0 0 2
## 66 1 1 1 0 1 1 1
## 12 1 1 1 0 1 0 2
## 25 1 1 1 0 0 1 2
## 15 1 1 1 0 0 0 3
## 86 1 1 0 1 1 1 1
## 9 1 1 0 1 1 0 2
## 8 1 1 0 1 0 1 2
## 1 1 1 0 1 0 0 3
## 4 1 1 0 0 1 1 2
## 1 1 1 0 0 1 0 3
## 1 1 1 0 0 0 0 4
## 76 1 0 1 1 1 1 1
## 1 1 0 1 1 1 0 2
## 4 1 0 1 1 0 1 2
## 1 1 0 1 1 0 0 3
## 1 1 0 1 0 1 0 3
## 1 1 0 1 0 0 0 4
## 1 1 0 0 1 0 1 3
## 0 85 111 126 523 1025 1870
The first row shows the number of observations in the data that are complete (first row). The second row shows the number of people who are missing only the smoke variable. Rows that have multiple 0’s in the columns indicate missing data patterns where multiple variables are missing. The bottom row tells how many total people are missing each variable, in ANY combination with other variables.
We can perform a basic multiple imputation by simply doing: Note this may take a very long time with big data sets
dat2<-brfss_17
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$skcancerknock<-dat2$skcancer
dat2$skcancerknock[samp2]<-NA
head(dat2[, c("skcancerknock","skcancer")])
## # A tibble: 6 x 2
## skcancerknock skcancer
## <dbl> <dbl>
## 1 0 0
## 2 1 1
## 3 0 0
## 4 0 0
## 5 0 0
## 6 1 1
imp<-mice(data = dat2[,c("skcancer", "ins", "smoke", "agec","educ","race_eth")], seed = 22, m = 5)
##
## iter imp variable
## 1 1 skcancer ins smoke educ race_eth
## 1 2 skcancer ins smoke educ race_eth
## 1 3 skcancer ins smoke educ race_eth
## 1 4 skcancer ins smoke educ race_eth
## 1 5 skcancer ins smoke educ race_eth
## 2 1 skcancer ins smoke educ race_eth
## 2 2 skcancer ins smoke educ race_eth
## 2 3 skcancer ins smoke educ race_eth
## 2 4 skcancer ins smoke educ race_eth
## 2 5 skcancer ins smoke educ race_eth
## 3 1 skcancer ins smoke educ race_eth
## 3 2 skcancer ins smoke educ race_eth
## 3 3 skcancer ins smoke educ race_eth
## 3 4 skcancer ins smoke educ race_eth
## 3 5 skcancer ins smoke educ race_eth
## 4 1 skcancer ins smoke educ race_eth
## 4 2 skcancer ins smoke educ race_eth
## 4 3 skcancer ins smoke educ race_eth
## 4 4 skcancer ins smoke educ race_eth
## 4 5 skcancer ins smoke educ race_eth
## 5 1 skcancer ins smoke educ race_eth
## 5 2 skcancer ins smoke educ race_eth
## 5 3 skcancer ins smoke educ race_eth
## 5 4 skcancer ins smoke educ race_eth
## 5 5 skcancer ins smoke educ race_eth
print(imp)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## skcancer ins smoke agec educ race_eth
## "pmm" "pmm" "pmm" "" "polyreg" "polyreg"
## PredictorMatrix:
## skcancer ins smoke agec educ race_eth
## skcancer 0 1 1 1 1 1
## ins 1 0 1 1 1 1
## smoke 1 1 0 1 1 1
## agec 1 1 1 0 1 1
## educ 1 1 1 1 0 1
## race_eth 1 1 1 1 1 0
The results here shows how many imputations were done. It also shows total missingness, which imputation method was used for each variable (because you wouldn’t want to use a normal distribution for a categorical variable!!).
It also shows the sequence of how each variable is visited (or imputed, the default is left to right).
We may want to make sure imputed values are plausible by having a look. For instance, are the skcancer values outside of the range of the data.
head(imp$imp$skcancer)
## 1 2 3 4 5
## 446 0 0 0 0 0
## 697 0 0 0 0 0
## 876 0 0 0 0 1
## 939 0 0 0 0 1
## 1150 0 0 0 0 0
## 1344 0 0 1 0 0
summary(imp$imp$skcancer)
## 1 2 3 4
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.03529 Mean :0.04706 Mean :0.07059 Mean :0.04706
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## 5
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1294
## 3rd Qu.:0.0000
## Max. :1.0000
summary(brfss_17$skcancer)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00000 0.00000 0.00000 0.09452 0.00000 1.00000 85
head(imp$imp$ins)
## 1 2 3 4 5
## 400 1 0 1 1 1
## 514 1 1 1 1 1
## 705 1 1 1 1 1
## 1371 1 1 0 1 1
## 1452 1 0 1 1 1
## 2256 0 0 1 1 1
summary(imp$imp$ins)
## 1 2 3 4
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.9189 Mean :0.8108 Mean :0.8559 Mean :0.8468
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## 5
## Min. :0.0000
## 1st Qu.:1.0000
## Median :1.0000
## Mean :0.8829
## 3rd Qu.:1.0000
## Max. :1.0000
This shows the imputed values for the first 6 cases across the 5 different imputations, as well as the numeric summary of the imputed values. We can see that there is variation across the imputations, because the imputed values are not the same.
library(lattice)
stripplot(imp,skcancer~race_eth|.imp, pch=20)
and we see the distribution of the original data (blue dots), the imputed data (red dots) across the levels of race, for each of the five different imputation runs(the number at the top shows which run, and the first plot is the original data).
This plot shows that the skcancer values correspond well with the observed data, so they are probably plausible values.
If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.
dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)
## skcancer ins smoke agec educ race_eth
## 1 0 1 0 (59,79] 4colgrad nhwhite
## 2 1 1 0 (59,79] 4colgrad nhwhite
## 3 0 1 0 (59,79] 3somecol nhwhite
## 4 0 1 0 (39,59] 4colgrad nhwhite
## 5 0 1 0 (39,59] 4colgrad nhwhite
## 6 1 1 0 (39,59] 4colgrad nh multirace
## 7 0 1 1 (24,39] 3somecol nh black
## 8 0 1 0 (59,79] 1somehs nh black
## 9 1 1 0 (59,79] 4colgrad nhwhite
## 10 0 1 0 (59,79] 2hsgrad nhwhite
#Compare to the original data
head(brfss_17[,c("skcancer", "ins","smoke", "agec","educ","race_eth")], n=10)
## # A tibble: 10 x 6
## skcancer ins smoke agec educ race_eth
## <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 0 1 0 (59,79] 4colgrad nhwhite
## 2 1 1 0 (59,79] 4colgrad nhwhite
## 3 0 1 0 (59,79] 3somecol nhwhite
## 4 0 1 0 (39,59] 4colgrad nhwhite
## 5 0 1 0 (39,59] 4colgrad nhwhite
## 6 1 1 0 (39,59] 4colgrad nh multirace
## 7 0 1 1 (24,39] 3somecol nh black
## 8 0 1 0 (59,79] 1somehs nh black
## 9 1 1 0 (59,79] 4colgrad nhwhite
## 10 0 1 0 (59,79] 2hsgrad nhwhite
While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:
head(dat.imp[is.na(brfss_17$skcancer)==T,], n=10)
## skcancer ins smoke agec educ race_eth
## 446 0 0 0 (39,59] 1somehs nh other
## 697 0 0 1 (59,79] 3somecol nhwhite
## 876 0 1 0 (59,79] 2hsgrad nhwhite
## 939 0 1 1 (39,59] 4colgrad nhwhite
## 1150 0 1 0 (59,79] 4colgrad nhwhite
## 1344 0 1 0 (39,59] 2hsgrad nhwhite
## 1684 0 0 0 (39,59] 2hsgrad nhwhite
## 2276 0 1 0 (59,79] 0Prim hispanic
## 2416 0 1 0 (59,79] 2hsgrad nhwhite
## 2714 0 1 0 (59,79] 3somecol nhwhite
head(brfss_17[is.na(brfss_17$skcancer)==T,c("skcancer", "ins","smoke", "agec","educ","race_eth")], n=10)
## # A tibble: 10 x 6
## skcancer ins smoke agec educ race_eth
## <dbl> <dbl> <dbl> <fct> <fct> <fct>
## 1 NA 0 NA (39,59] <NA> <NA>
## 2 NA 0 1 (59,79] 3somecol nhwhite
## 3 NA 1 0 (59,79] 2hsgrad nhwhite
## 4 NA 1 1 (39,59] 4colgrad nhwhite
## 5 NA 1 0 (59,79] 4colgrad nhwhite
## 6 NA 1 0 (39,59] 2hsgrad nhwhite
## 7 NA 0 0 (39,59] 2hsgrad nhwhite
## 8 NA 1 0 (59,79] 0Prim hispanic
## 9 NA 1 0 (59,79] 2hsgrad nhwhite
## 10 NA 1 0 (59,79] 3somecol nhwhite
#Now, I will see the variability in the 5 different imputations for each outcome
fit.skcancer<-with(data=imp ,expr=lm(skcancer~ins+smoke+agec+educ+race_eth))
fit.skcancer
## call :
## with.mids(data = imp, expr = lm(skcancer ~ ins + smoke + agec +
## educ + race_eth))
##
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ",
## "race_eth")], m = 5, seed = 22)
##
## nmis :
## skcancer ins smoke agec educ race_eth
## 85 111 1025 0 126 523
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) ins smoke
## 0.0108905 0.0130592 -0.0093091
## agec(24,39] agec(39,59] agec(59,79]
## -0.0006711 0.0380528 0.1230458
## agec(79,99] educ0Prim educ1somehs
## 0.2240718 -0.0099725 0.0073718
## educ3somecol educ4colgrad race_ethhispanic
## 0.0169110 0.0293885 -0.0596209
## race_ethnh black race_ethnh multirace race_ethnh other
## -0.0962605 -0.0381190 -0.0611399
##
##
## [[2]]
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) ins smoke
## 0.0108739 0.0127803 -0.0074638
## agec(24,39] agec(39,59] agec(59,79]
## -0.0008746 0.0380194 0.1230578
## agec(79,99] educ0Prim educ1somehs
## 0.2242136 -0.0113975 0.0087554
## educ3somecol educ4colgrad race_ethhispanic
## 0.0163805 0.0301674 -0.0600819
## race_ethnh black race_ethnh multirace race_ethnh other
## -0.0964353 -0.0349008 -0.0625263
##
##
## [[3]]
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) ins smoke
## 0.0124175 0.0127267 -0.0102366
## agec(24,39] agec(39,59] agec(59,79]
## -0.0007254 0.0381002 0.1225768
## agec(79,99] educ0Prim educ1somehs
## 0.2249217 -0.0122620 0.0068816
## educ3somecol educ4colgrad race_ethhispanic
## 0.0155579 0.0288025 -0.0612024
## race_ethnh black race_ethnh multirace race_ethnh other
## -0.0966236 -0.0394479 -0.0604393
##
##
## [[4]]
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) ins smoke
## 0.0118630 0.0117453 -0.0087067
## agec(24,39] agec(39,59] agec(59,79]
## -0.0007397 0.0380015 0.1233566
## agec(79,99] educ0Prim educ1somehs
## 0.2254732 -0.0137309 0.0058850
## educ3somecol educ4colgrad race_ethhispanic
## 0.0161813 0.0296258 -0.0576263
## race_ethnh black race_ethnh multirace race_ethnh other
## -0.0958059 -0.0388413 -0.0609205
##
##
## [[5]]
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
##
## Coefficients:
## (Intercept) ins smoke
## 0.0118449 0.0128120 -0.0081914
## agec(24,39] agec(39,59] agec(59,79]
## -0.0002143 0.0382301 0.1238165
## agec(79,99] educ0Prim educ1somehs
## 0.2240568 -0.0127781 0.0060310
## educ3somecol educ4colgrad race_ethhispanic
## 0.0156664 0.0284863 -0.0604061
## race_ethnh black race_ethnh multirace race_ethnh other
## -0.0964529 -0.0392673 -0.0618257
with (data=imp, exp=(sd(skcancer)))
## call :
## with.mids(data = imp, expr = (sd(skcancer)))
##
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ",
## "race_eth")], m = 5, seed = 22)
##
## nmis :
## skcancer ins smoke agec educ race_eth
## 85 111 1025 0 126 523
##
## analyses :
## [[1]]
## [1] 0.2922792
##
## [[2]]
## [1] 0.2923348
##
## [[3]]
## [1] 0.2924457
##
## [[4]]
## [1] 0.2923348
##
## [[5]]
## [1] 0.2927229
with (data=imp, exp=(prop.table(table(ins))))
## call :
## with.mids(data = imp, expr = (prop.table(table(ins))))
##
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ",
## "race_eth")], m = 5, seed = 22)
##
## nmis :
## skcancer ins smoke agec educ race_eth
## 85 111 1025 0 126 523
##
## analyses :
## [[1]]
## ins
## 0 1
## 0.07708 0.92292
##
## [[2]]
## ins
## 0 1
## 0.07756 0.92244
##
## [[3]]
## ins
## 0 1
## 0.07736 0.92264
##
## [[4]]
## ins
## 0 1
## 0.0774 0.9226
##
## [[5]]
## ins
## 0 1
## 0.07724 0.92276
with (data=imp, exp=(prop.table(table(race_eth))))
## call :
## with.mids(data = imp, expr = (prop.table(table(race_eth))))
##
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ",
## "race_eth")], m = 5, seed = 22)
##
## nmis :
## skcancer ins smoke agec educ race_eth
## 85 111 1025 0 126 523
##
## analyses :
## [[1]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.73956 0.09528 0.10584 0.01800 0.04132
##
## [[2]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.73908 0.09516 0.10644 0.01808 0.04124
##
## [[3]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.73932 0.09528 0.10612 0.01820 0.04108
##
## [[4]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.74000 0.09508 0.10572 0.01804 0.04116
##
## [[5]]
## race_eth
## nhwhite hispanic nh black nh multirace nh other
## 0.73952 0.09500 0.10612 0.01812 0.04124
with (data=imp, exp=(prop.table(table(educ))))
## call :
## with.mids(data = imp, expr = (prop.table(table(educ))))
##
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ",
## "race_eth")], m = 5, seed = 22)
##
## nmis :
## skcancer ins smoke agec educ race_eth
## 85 111 1025 0 126 523
##
## analyses :
## [[1]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.24284 0.02276 0.03916 0.27568 0.41956
##
## [[2]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.24352 0.02272 0.03888 0.27556 0.41932
##
## [[3]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.24292 0.02276 0.03896 0.27588 0.41948
##
## [[4]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.24344 0.02276 0.03884 0.27560 0.41936
##
## [[5]]
## educ
## 2hsgrad 0Prim 1somehs 3somecol 4colgrad
## 0.24292 0.02276 0.03908 0.27560 0.41964
Now we pool the separate models from each imputed data set:
est.p<-pool(fit.skcancer)
print(est.p)
## Class: mipo m = 5
## term m estimate ubar b t
## 1 (Intercept) 5 0.0115779543 9.833231e-05 4.563839e-07 9.887997e-05
## 2 ins 5 0.0126247165 4.844477e-05 2.579594e-07 4.875432e-05
## 3 smoke 5 -0.0087815295 2.848660e-05 1.121451e-06 2.983234e-05
## 4 agec(24,39] 5 -0.0006450166 7.049146e-05 6.357648e-08 7.056775e-05
## 5 agec(39,59] 5 0.0380808069 6.281770e-05 8.377785e-09 6.282775e-05
## 6 agec(59,79] 5 0.1231706867 6.190020e-05 2.081743e-07 6.215001e-05
## 7 agec(79,99] 5 0.2245474338 9.498081e-05 3.938617e-07 9.545344e-05
## 8 educ0Prim 5 -0.0120281864 1.649866e-04 2.034961e-06 1.674285e-04
## 9 educ1somehs 5 0.0069849618 9.594727e-05 1.353638e-06 9.757164e-05
## 10 educ3somecol 5 0.0161394260 2.468687e-05 3.042716e-07 2.505200e-05
## 11 educ4colgrad 5 0.0292941235 2.208308e-05 4.439632e-07 2.261584e-05
## 12 race_ethhispanic 5 -0.0597875332 4.355802e-05 1.792395e-06 4.570889e-05
## 13 race_ethnh black 5 -0.0963156456 3.471102e-05 9.771774e-08 3.482828e-05
## 14 race_ethnh multirace 5 -0.0381152507 1.800679e-04 3.490821e-06 1.842569e-04
## 15 race_ethnh other 5 -0.0613703194 8.226185e-05 6.664614e-07 8.306161e-05
## dfcom df riv lambda fmi
## 1 24985 20868.425 0.005569488 0.0055386408 0.005633935
## 2 24985 19856.600 0.006389777 0.0063492068 0.006449274
## 3 24985 1816.039 0.047241184 0.0451101283 0.046160012
## 4 24985 24775.324 0.001082284 0.0010811140 0.001161742
## 5 24985 24975.009 0.000160040 0.0001600144 0.000240072
## 6 24985 22610.234 0.004035676 0.0040194551 0.004107543
## 7 24985 21572.348 0.004976100 0.0049514615 0.005043701
## 8 24985 10660.913 0.014800924 0.0145850520 0.014769865
## 9 24985 9091.469 0.016929770 0.0166479249 0.016864178
## 10 24985 10669.529 0.014790285 0.0145747207 0.014759386
## 11 24985 5564.115 0.024125067 0.0235567587 0.023907548
## 12 24985 1679.068 0.049379505 0.0470559074 0.048188970
## 13 24985 23257.792 0.003378215 0.0033668413 0.003452533
## 14 24985 5876.387 0.023263365 0.0227344848 0.023066923
## 15 24985 15724.981 0.009722049 0.0096284403 0.009754378
summary(est.p)
## term estimate std.error statistic df
## 1 (Intercept) 0.0115779543 0.009943841 1.16433424 20868.425
## 2 ins 0.0126247165 0.006982429 1.80806937 19856.600
## 3 smoke -0.0087815295 0.005461899 -1.60777960 1816.039
## 4 agec(24,39] -0.0006450166 0.008400461 -0.07678348 24775.324
## 5 agec(39,59] 0.0380808069 0.007926396 4.80430304 24975.009
## 6 agec(59,79] 0.1231706867 0.007883528 15.62380374 22610.234
## 7 agec(79,99] 0.2245474338 0.009770028 22.98329544 21572.348
## 8 educ0Prim -0.0120281864 0.012939417 -0.92957712 10660.913
## 9 educ1somehs 0.0069849618 0.009877836 0.70713484 9091.469
## 10 educ3somecol 0.0161394260 0.005005197 3.22453355 10669.529
## 11 educ4colgrad 0.0292941235 0.004755611 6.15990682 5564.115
## 12 race_ethhispanic -0.0597875332 0.006760835 -8.84321698 1679.068
## 13 race_ethnh black -0.0963156456 0.005901549 -16.32040124 23257.792
## 14 race_ethnh multirace -0.0381152507 0.013574125 -2.80793424 5876.387
## 15 race_ethnh other -0.0613703194 0.009113814 -6.73376911 15724.981
## p.value
## 1 2.443019e-01
## 2 7.061083e-02
## 3 1.080573e-01
## 4 9.387964e-01
## 5 1.561923e-06
## 6 0.000000e+00
## 7 0.000000e+00
## 8 3.526111e-01
## 9 4.795008e-01
## 10 1.265579e-03
## 11 7.789072e-10
## 12 0.000000e+00
## 13 0.000000e+00
## 14 5.002486e-03
## 15 1.710498e-11
We need to pay attention to the fmi column and the lambda column. These convey information about how much the missingness of each particular variable affects the model coefficients.
lam<-data.frame(lam=est.p$pooled$lambda, param=row.names(est.p$pooled))
ggplot(data=lam,aes(x=param, y=lam))+geom_col()+theme(axis.text.x = element_text(angle = 45, hjust = 1))
It appears that a couple of the education variables and the income variables have large variances to them. This suggests that there may be noticeable variation in the resulting coefficient of the model, depending on which imputed data set we use.
We can also compare to the model fit on the original data, with missings eliminated:
library(dplyr)
bnm<-brfss_17%>%
select(skcancer, ins, smoke, agec, educ, race_eth)%>%
filter(complete.cases(.))%>%
as.data.frame()
summary(lm(skcancer~ins+smoke+agec+educ+race_eth, bnm))
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth,
## data = bnm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27961 -0.14919 -0.07166 -0.01201 1.05804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0120603 0.0104914 1.150 0.250343
## ins 0.0115089 0.0073683 1.562 0.118312
## smoke -0.0071162 0.0055936 -1.272 0.203311
## agec(24,39] -0.0005541 0.0088397 -0.063 0.950024
## agec(39,59] 0.0380715 0.0083507 4.559 5.16e-06 ***
## agec(59,79] 0.1256237 0.0082870 15.159 < 2e-16 ***
## agec(79,99] 0.2254183 0.0102525 21.987 < 2e-16 ***
## educ0Prim -0.0177314 0.0136736 -1.297 0.194725
## educ1somehs 0.0068544 0.0102505 0.669 0.503700
## educ3somecol 0.0171326 0.0052042 3.292 0.000996 ***
## educ4colgrad 0.0306204 0.0049114 6.235 4.61e-10 ***
## race_ethhispanic -0.0597337 0.0069275 -8.623 < 2e-16 ***
## race_ethnh black -0.0981906 0.0062106 -15.810 < 2e-16 ***
## race_ethnh multirace -0.0416276 0.0139650 -2.981 0.002878 **
## race_ethnh other -0.0645533 0.0095961 -6.727 1.77e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2831 on 23261 degrees of freedom
## Multiple R-squared: 0.07758, Adjusted R-squared: 0.07702
## F-statistic: 139.7 on 14 and 23261 DF, p-value: < 2.2e-16
Here, I compare the coefficients from the model where we eliminated all missing data to the one that we fit on the imputed data:
fit1<-lm(skcancer~ins+smoke+agec+educ+race_eth, data=brfss_17)
summary(fit1)
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth,
## data = brfss_17)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27961 -0.14919 -0.07166 -0.01201 1.05804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0120603 0.0104914 1.150 0.250343
## ins 0.0115089 0.0073683 1.562 0.118312
## smoke -0.0071162 0.0055936 -1.272 0.203311
## agec(24,39] -0.0005541 0.0088397 -0.063 0.950024
## agec(39,59] 0.0380715 0.0083507 4.559 5.16e-06 ***
## agec(59,79] 0.1256237 0.0082870 15.159 < 2e-16 ***
## agec(79,99] 0.2254183 0.0102525 21.987 < 2e-16 ***
## educ0Prim -0.0177314 0.0136736 -1.297 0.194725
## educ1somehs 0.0068544 0.0102505 0.669 0.503700
## educ3somecol 0.0171326 0.0052042 3.292 0.000996 ***
## educ4colgrad 0.0306204 0.0049114 6.235 4.61e-10 ***
## race_ethhispanic -0.0597337 0.0069275 -8.623 < 2e-16 ***
## race_ethnh black -0.0981906 0.0062106 -15.810 < 2e-16 ***
## race_ethnh multirace -0.0416276 0.0139650 -2.981 0.002878 **
## race_ethnh other -0.0645533 0.0095961 -6.727 1.77e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2831 on 23261 degrees of freedom
## (1724 observations deleted due to missingness)
## Multiple R-squared: 0.07758, Adjusted R-squared: 0.07702
## F-statistic: 139.7 on 14 and 23261 DF, p-value: < 2.2e-16
fit.imp<-lm(skcancer~ins+smoke+agec+educ+race_eth, data=dat.imp)
summary(fit.imp)
##
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth,
## data = dat.imp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27741 -0.14700 -0.06937 -0.01022 1.05607
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0108905 0.0099258 1.097 0.272564
## ins 0.0130592 0.0069626 1.876 0.060718 .
## smoke -0.0093091 0.0053105 -1.753 0.079622 .
## agec(24,39] -0.0006711 0.0083917 -0.080 0.936263
## agec(39,59] 0.0380528 0.0079203 4.804 1.56e-06 ***
## agec(59,79] 0.1230458 0.0078637 15.647 < 2e-16 ***
## agec(79,99] 0.2240718 0.0097402 23.005 < 2e-16 ***
## educ0Prim -0.0099725 0.0128370 -0.777 0.437250
## educ1somehs 0.0073718 0.0097749 0.754 0.450760
## educ3somecol 0.0169110 0.0049668 3.405 0.000663 ***
## educ4colgrad 0.0293885 0.0046950 6.260 3.92e-10 ***
## race_ethhispanic -0.0596209 0.0065934 -9.042 < 2e-16 ***
## race_ethnh black -0.0962605 0.0058940 -16.332 < 2e-16 ***
## race_ethnh multirace -0.0381190 0.0134458 -2.835 0.004586 **
## race_ethnh other -0.0611399 0.0090538 -6.753 1.48e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2809 on 24985 degrees of freedom
## Multiple R-squared: 0.07718, Adjusted R-squared: 0.07667
## F-statistic: 149.3 on 14 and 24985 DF, p-value: < 2.2e-16
In the analysis that only uses complete cases, we see a non significant insurance (ins) effect on skin cancer (skcancer), and when i impute the missing values. This suggests a non significant selection effect of the insurance variable.