Packages

library(car)
library(mice)

## Warning: package 'mice' was built under R version 4.0.5

library(ggplot2)
library(dplyr)
library(car)
library(stargazer)
library(survey)
library(questionr)
library(dplyr)
library(tidyverse)
library(haven)
library(readr)
library(lattice)

load(url("https://github.com/coreysparks/data/blob/master/brfss_2017.Rdata?raw=true"))

set.seed(1234)
samp<-sample(1:dim(brfss_17)[1], size = 25000, replace = F) #smaller sample for brevity
brfss_17<-brfss_17[samp,]

Recode variables

#Healthy days
brfss_17$healthdays<-Recode(brfss_17$physhlth, recodes = "88=0; 77=NA; 99=NA")

# sex
brfss_17$male<-ifelse(brfss_17$sex==1,1,0)

brfss_17$badhealth<-Recode(brfss_17$genhlth, recodes="4:5=1; 1:3=0; else=NA")

#race/ethnicity
brfss_17$black<-Recode(brfss_17$racegr3, recodes="2=1; 9=NA; else=0")
brfss_17$white<-Recode(brfss_17$racegr3, recodes="1=1; 9=NA; else=0")
brfss_17$other<-Recode(brfss_17$racegr3, recodes="3:4=1; 9=NA; else=0")
brfss_17$hispanic<-Recode(brfss_17$racegr3, recodes="5=1; 9=NA; else=0")

brfss_17$race_eth<-Recode(brfss_17$racegr3, 
recodes="1='nhwhite'; 2='nh black'; 3='nh other';4='nh multirace'; 5='hispanic'; else=NA",
as.factor = T)
brfss_17$race_eth<-relevel(brfss_17$race_eth, ref = "nhwhite")

#insurance
brfss_17$ins<-Recode(brfss_17$hlthpln1, recodes ="7:9=NA; 1=1;2=0")

#income grouping
brfss_17$inc<-ifelse(brfss_17$incomg==9, NA, brfss_17$incomg)

#education level
brfss_17$educ<-Recode(brfss_17$educa,
recodes="1:2='0Prim'; 3='1somehs'; 4='2hsgrad'; 5='3somecol'; 6='4colgrad';9=NA",
as.factor=T)
brfss_17$educ<-relevel(brfss_17$educ, ref='2hsgrad')

#employment
brfss_17$employ<-Recode(brfss_17$employ1,
recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
as.factor=T)
brfss_17$employ<-relevel(brfss_17$employ, ref='Employed')

#marital status
brfss_17$marst<-Recode(brfss_17$marital,
recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
as.factor=T)
brfss_17$marst<-relevel(brfss_17$marst, ref='married')

#Age cut into intervals
brfss_17$agec<-cut(brfss_17$age80, breaks=c(0,24,39,59,79,99))

#BMI, in the brfss_17a the bmi variable has 2 implied decimal places,
#so we must divide by 100 to get real bmi's

brfss_17$bmi<-brfss_17$bmi5/100

#smoking currently
brfss_17$smoke<-Recode(brfss_17$smoker3, 
recodes="1:2=1; 3:4=0; else=NA")


brfss_17$skcancer<-Recode(brfss_17$chcscncr, recodes = "1= 1; 2= 0; else=NA")
brfss_17$kidney<-Recode(brfss_17$chckidny, recodes = "1= 1; 2= 0; else=NA")
brfss_17$chrobpuld<-Recode(brfss_17$chccopd1, recodes = "1= 1; 2= 0; else=NA")
brfss_17$arthritis<-Recode(brfss_17$havarth3, recodes = "1= 1; 2= 0; else=NA")
brfss_17$depdisorder<-Recode(brfss_17$addepev2, recodes = "1= 1; 2:4=0; else=NA")

Measure an outcome variable and at least 5 predictors.The predictor variables i am using are: insurance, race, education, age, and smoking for skin cancer.

Check for missingness of these variables by just using `summary(brfss_17)`

summary(brfss_17[, c("ins", "race_eth","agec", "educ","badhealth", "skcancer", "marst", "smoke")])

##       ins                 race_eth          agec            educ      
##  Min.   :0.0000   nhwhite     :18106   (0,24] :1544   2hsgrad : 6046  
##  1st Qu.:1.0000   hispanic    : 2326   (24,39]:4398   0Prim   :  564  
##  Median :1.0000   nh black    : 2594   (39,59]:7885   1somehs :  968  
##  Mean   :0.9229   nh multirace:  443   (59,79]:9238   3somecol: 6861  
##  3rd Qu.:1.0000   nh other    : 1008   (79,99]:1935   4colgrad:10435  
##  Max.   :1.0000   NA's        :  523                  NA's    :  126  
##  NA's   :111                                                          
##    badhealth         skcancer             marst           smoke       
##  Min.   :0.0000   Min.   :0.00000   married  :12600   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   cohab    :  823   1st Qu.:0.0000  
##  Median :0.0000   Median :0.00000   divorced : 3412   Median :0.0000  
##  Mean   :0.1756   Mean   :0.09452   nm       : 4526   Mean   :0.1363  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   separated:  557   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   widowed  : 2860   Max.   :1.0000  
##  NA's   :67       NA's   :85        NA's     :  222   NA's   :1025

Report the pattern of missingness among all of these variables

The result shows that, the smoke variable has the highest missing number with 1025 people missing in the data, or 4.1% of the sample. Bad health has the lowest number of missing with 67 missing or 0.268% missing. Age has no missing values

Mean Imputation

#I'm using skin cancer (skcancer) as my outcome variable
summary(brfss_17$skcancer)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.00000 0.00000 0.09452 0.00000 1.00000      85

#what happens when we replace the missing with the mean?
brfss_17$skcancer.imp.mean<-ifelse(is.na(brfss_17$skcancer)==T, mean(brfss_17$skcancer, na.rm=T), brfss_17$skcancer)

mean(brfss_17$skcancer, na.rm=T)

## [1] 0.09452137

mean(brfss_17$skcancer.imp.mean) #no difference!

## [1] 0.09452137

fit<-lm(skcancer~ins+educ+smoke+race_eth+marst, brfss_17)

median(brfss_17$skcancer, na.rm=T)

## [1] 0

median(brfss_17$skcancer.imp.mean) # No difference

## [1] 0

var(brfss_17$skcancer, na.rm=T)

## [1] 0.08559052

var(brfss_17$skcancer.imp.mean) # slight noticeable difference!

## [1] 0.0852995

The result shows that imputing with the mean does nothing to central tendency (when measured using the mean and the median), but it does reduce the variance in the outcome. This is because am replacing all missing cases with the most likely value (the mean), so am artificially deflating the variance. That’s not good.

Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?

Multiple Imputation

#look at the patterns of missingness
md.pattern(brfss_17[,c("skcancer", "ins","smoke","agec","educ","race_eth")])

##       agec skcancer ins educ race_eth smoke     
## 23276    1        1   1    1        1     1    0
## 945      1        1   1    1        1     0    1
## 429      1        1   1    1        0     1    1
## 37       1        1   1    1        0     0    2
## 66       1        1   1    0        1     1    1
## 12       1        1   1    0        1     0    2
## 25       1        1   1    0        0     1    2
## 15       1        1   1    0        0     0    3
## 86       1        1   0    1        1     1    1
## 9        1        1   0    1        1     0    2
## 8        1        1   0    1        0     1    2
## 1        1        1   0    1        0     0    3
## 4        1        1   0    0        1     1    2
## 1        1        1   0    0        1     0    3
## 1        1        1   0    0        0     0    4
## 76       1        0   1    1        1     1    1
## 1        1        0   1    1        1     0    2
## 4        1        0   1    1        0     1    2
## 1        1        0   1    1        0     0    3
## 1        1        0   1    0        1     0    3
## 1        1        0   1    0        0     0    4
## 1        1        0   0    1        0     1    3
##          0       85 111  126      523  1025 1870

The first row shows the number of observations in the data that are complete (first row). The second row shows the number of people who are missing only the smoke variable. Rows that have multiple 0’s in the columns indicate missing data patterns where multiple variables are missing. The bottom row tells how many total people are missing each variable, in ANY combination with other variables.

Were the results similar between the mean/modal and multiple imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?

Basic imputation:

We can perform a basic multiple imputation by simply doing: Note this may take a very long time with big data sets

dat2<-brfss_17
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$skcancerknock<-dat2$skcancer
dat2$skcancerknock[samp2]<-NA

head(dat2[, c("skcancerknock","skcancer")])

## # A tibble: 6 x 2
##   skcancerknock skcancer
##           <dbl>    <dbl>
## 1             0        0
## 2             1        1
## 3             0        0
## 4             0        0
## 5             0        0
## 6             1        1

imp<-mice(data = dat2[,c("skcancer", "ins", "smoke", "agec","educ","race_eth")], seed = 22, m = 5)

## 
##  iter imp variable
##   1   1  skcancer  ins  smoke  educ  race_eth
##   1   2  skcancer  ins  smoke  educ  race_eth
##   1   3  skcancer  ins  smoke  educ  race_eth
##   1   4  skcancer  ins  smoke  educ  race_eth
##   1   5  skcancer  ins  smoke  educ  race_eth
##   2   1  skcancer  ins  smoke  educ  race_eth
##   2   2  skcancer  ins  smoke  educ  race_eth
##   2   3  skcancer  ins  smoke  educ  race_eth
##   2   4  skcancer  ins  smoke  educ  race_eth
##   2   5  skcancer  ins  smoke  educ  race_eth
##   3   1  skcancer  ins  smoke  educ  race_eth
##   3   2  skcancer  ins  smoke  educ  race_eth
##   3   3  skcancer  ins  smoke  educ  race_eth
##   3   4  skcancer  ins  smoke  educ  race_eth
##   3   5  skcancer  ins  smoke  educ  race_eth
##   4   1  skcancer  ins  smoke  educ  race_eth
##   4   2  skcancer  ins  smoke  educ  race_eth
##   4   3  skcancer  ins  smoke  educ  race_eth
##   4   4  skcancer  ins  smoke  educ  race_eth
##   4   5  skcancer  ins  smoke  educ  race_eth
##   5   1  skcancer  ins  smoke  educ  race_eth
##   5   2  skcancer  ins  smoke  educ  race_eth
##   5   3  skcancer  ins  smoke  educ  race_eth
##   5   4  skcancer  ins  smoke  educ  race_eth
##   5   5  skcancer  ins  smoke  educ  race_eth

print(imp)

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##  skcancer       ins     smoke      agec      educ  race_eth 
##     "pmm"     "pmm"     "pmm"        "" "polyreg" "polyreg" 
## PredictorMatrix:
##          skcancer ins smoke agec educ race_eth
## skcancer        0   1     1    1    1        1
## ins             1   0     1    1    1        1
## smoke           1   1     0    1    1        1
## agec            1   1     1    0    1        1
## educ            1   1     1    1    0        1
## race_eth        1   1     1    1    1        0

The results here shows how many imputations were done. It also shows total missingness, which imputation method was used for each variable (because you wouldn’t want to use a normal distribution for a categorical variable!!).

It also shows the sequence of how each variable is visited (or imputed, the default is left to right).

We may want to make sure imputed values are plausible by having a look. For instance, are the skcancer values outside of the range of the data.

head(imp$imp$skcancer)

##      1 2 3 4 5
## 446  0 0 0 0 0
## 697  0 0 0 0 0
## 876  0 0 0 0 1
## 939  0 0 0 0 1
## 1150 0 0 0 0 0
## 1344 0 0 1 0 0

summary(imp$imp$skcancer)

##        1                 2                 3                 4          
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.03529   Mean   :0.04706   Mean   :0.07059   Mean   :0.04706  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##        5         
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1294  
##  3rd Qu.:0.0000  
##  Max.   :1.0000

summary(brfss_17$skcancer)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## 0.00000 0.00000 0.00000 0.09452 0.00000 1.00000      85

head(imp$imp$ins)

##      1 2 3 4 5
## 400  1 0 1 1 1
## 514  1 1 1 1 1
## 705  1 1 1 1 1
## 1371 1 1 0 1 1
## 1452 1 0 1 1 1
## 2256 0 0 1 1 1

summary(imp$imp$ins)

##        1                2                3                4         
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.:1.0000  
##  Median :1.0000   Median :1.0000   Median :1.0000   Median :1.0000  
##  Mean   :0.9189   Mean   :0.8108   Mean   :0.8559   Mean   :0.8468  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##        5         
##  Min.   :0.0000  
##  1st Qu.:1.0000  
##  Median :1.0000  
##  Mean   :0.8829  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

This shows the imputed values for the first 6 cases across the 5 different imputations, as well as the numeric summary of the imputed values. We can see that there is variation across the imputations, because the imputed values are not the same.

Plotting of skin cancer (skcancer) and race:

library(lattice)
stripplot(imp,skcancer~race_eth|.imp, pch=20)

and we see the distribution of the original data (blue dots), the imputed data (red dots) across the levels of race, for each of the five different imputation runs(the number at the top shows which run, and the first plot is the original data).

This plot shows that the skcancer values correspond well with the observed data, so they are probably plausible values.

If we want to get our new, imputed data, we can use the complete() function, which by default extracts the first imputed data set. If we want a different one, we can do complete(imp, action=3) for example, to get the third imputed data set.

dat.imp<-complete(imp, action = 1)
head(dat.imp, n=10)

##    skcancer ins smoke    agec     educ     race_eth
## 1         0   1     0 (59,79] 4colgrad      nhwhite
## 2         1   1     0 (59,79] 4colgrad      nhwhite
## 3         0   1     0 (59,79] 3somecol      nhwhite
## 4         0   1     0 (39,59] 4colgrad      nhwhite
## 5         0   1     0 (39,59] 4colgrad      nhwhite
## 6         1   1     0 (39,59] 4colgrad nh multirace
## 7         0   1     1 (24,39] 3somecol     nh black
## 8         0   1     0 (59,79]  1somehs     nh black
## 9         1   1     0 (59,79] 4colgrad      nhwhite
## 10        0   1     0 (59,79]  2hsgrad      nhwhite

#Compare to the original data
head(brfss_17[,c("skcancer", "ins","smoke", "agec","educ","race_eth")], n=10)

## # A tibble: 10 x 6
##    skcancer   ins smoke agec    educ     race_eth    
##       <dbl> <dbl> <dbl> <fct>   <fct>    <fct>       
##  1        0     1     0 (59,79] 4colgrad nhwhite     
##  2        1     1     0 (59,79] 4colgrad nhwhite     
##  3        0     1     0 (59,79] 3somecol nhwhite     
##  4        0     1     0 (39,59] 4colgrad nhwhite     
##  5        0     1     0 (39,59] 4colgrad nhwhite     
##  6        1     1     0 (39,59] 4colgrad nh multirace
##  7        0     1     1 (24,39] 3somecol nh black    
##  8        0     1     0 (59,79] 1somehs  nh black    
##  9        1     1     0 (59,79] 4colgrad nhwhite     
## 10        0     1     0 (59,79] 2hsgrad  nhwhite

While the first few cases don’t show much missingness, we can coax some more interesting cases out and compare the original data to the imputed:

head(dat.imp[is.na(brfss_17$skcancer)==T,], n=10)

##      skcancer ins smoke    agec     educ race_eth
## 446         0   0     0 (39,59]  1somehs nh other
## 697         0   0     1 (59,79] 3somecol  nhwhite
## 876         0   1     0 (59,79]  2hsgrad  nhwhite
## 939         0   1     1 (39,59] 4colgrad  nhwhite
## 1150        0   1     0 (59,79] 4colgrad  nhwhite
## 1344        0   1     0 (39,59]  2hsgrad  nhwhite
## 1684        0   0     0 (39,59]  2hsgrad  nhwhite
## 2276        0   1     0 (59,79]    0Prim hispanic
## 2416        0   1     0 (59,79]  2hsgrad  nhwhite
## 2714        0   1     0 (59,79] 3somecol  nhwhite

head(brfss_17[is.na(brfss_17$skcancer)==T,c("skcancer", "ins","smoke", "agec","educ","race_eth")], n=10)

## # A tibble: 10 x 6
##    skcancer   ins smoke agec    educ     race_eth
##       <dbl> <dbl> <dbl> <fct>   <fct>    <fct>   
##  1       NA     0    NA (39,59] <NA>     <NA>    
##  2       NA     0     1 (59,79] 3somecol nhwhite 
##  3       NA     1     0 (59,79] 2hsgrad  nhwhite 
##  4       NA     1     1 (39,59] 4colgrad nhwhite 
##  5       NA     1     0 (59,79] 4colgrad nhwhite 
##  6       NA     1     0 (39,59] 2hsgrad  nhwhite 
##  7       NA     0     0 (39,59] 2hsgrad  nhwhite 
##  8       NA     1     0 (59,79] 0Prim    hispanic
##  9       NA     1     0 (59,79] 2hsgrad  nhwhite 
## 10       NA     1     0 (59,79] 3somecol nhwhite

Analyzing the imputed data

#Now, I will see the variability in the 5 different imputations for each outcome
fit.skcancer<-with(data=imp ,expr=lm(skcancer~ins+smoke+agec+educ+race_eth))
fit.skcancer

## call :
## with.mids(data = imp, expr = lm(skcancer ~ ins + smoke + agec + 
##     educ + race_eth))
## 
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ", 
##     "race_eth")], m = 5, seed = 22)
## 
## nmis :
## skcancer      ins    smoke     agec     educ race_eth 
##       85      111     1025        0      126      523 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
## 
## Coefficients:
##          (Intercept)                   ins                 smoke  
##            0.0108905             0.0130592            -0.0093091  
##          agec(24,39]           agec(39,59]           agec(59,79]  
##           -0.0006711             0.0380528             0.1230458  
##          agec(79,99]             educ0Prim           educ1somehs  
##            0.2240718            -0.0099725             0.0073718  
##         educ3somecol          educ4colgrad      race_ethhispanic  
##            0.0169110             0.0293885            -0.0596209  
##     race_ethnh black  race_ethnh multirace      race_ethnh other  
##           -0.0962605            -0.0381190            -0.0611399  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
## 
## Coefficients:
##          (Intercept)                   ins                 smoke  
##            0.0108739             0.0127803            -0.0074638  
##          agec(24,39]           agec(39,59]           agec(59,79]  
##           -0.0008746             0.0380194             0.1230578  
##          agec(79,99]             educ0Prim           educ1somehs  
##            0.2242136            -0.0113975             0.0087554  
##         educ3somecol          educ4colgrad      race_ethhispanic  
##            0.0163805             0.0301674            -0.0600819  
##     race_ethnh black  race_ethnh multirace      race_ethnh other  
##           -0.0964353            -0.0349008            -0.0625263  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
## 
## Coefficients:
##          (Intercept)                   ins                 smoke  
##            0.0124175             0.0127267            -0.0102366  
##          agec(24,39]           agec(39,59]           agec(59,79]  
##           -0.0007254             0.0381002             0.1225768  
##          agec(79,99]             educ0Prim           educ1somehs  
##            0.2249217            -0.0122620             0.0068816  
##         educ3somecol          educ4colgrad      race_ethhispanic  
##            0.0155579             0.0288025            -0.0612024  
##     race_ethnh black  race_ethnh multirace      race_ethnh other  
##           -0.0966236            -0.0394479            -0.0604393  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
## 
## Coefficients:
##          (Intercept)                   ins                 smoke  
##            0.0118630             0.0117453            -0.0087067  
##          agec(24,39]           agec(39,59]           agec(59,79]  
##           -0.0007397             0.0380015             0.1233566  
##          agec(79,99]             educ0Prim           educ1somehs  
##            0.2254732            -0.0137309             0.0058850  
##         educ3somecol          educ4colgrad      race_ethhispanic  
##            0.0161813             0.0296258            -0.0576263  
##     race_ethnh black  race_ethnh multirace      race_ethnh other  
##           -0.0958059            -0.0388413            -0.0609205  
## 
## 
## [[5]]
## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth)
## 
## Coefficients:
##          (Intercept)                   ins                 smoke  
##            0.0118449             0.0128120            -0.0081914  
##          agec(24,39]           agec(39,59]           agec(59,79]  
##           -0.0002143             0.0382301             0.1238165  
##          agec(79,99]             educ0Prim           educ1somehs  
##            0.2240568            -0.0127781             0.0060310  
##         educ3somecol          educ4colgrad      race_ethhispanic  
##            0.0156664             0.0284863            -0.0604061  
##     race_ethnh black  race_ethnh multirace      race_ethnh other  
##           -0.0964529            -0.0392673            -0.0618257

variation in skin cancer (skcancer)

with (data=imp, exp=(sd(skcancer)))

## call :
## with.mids(data = imp, expr = (sd(skcancer)))
## 
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ", 
##     "race_eth")], m = 5, seed = 22)
## 
## nmis :
## skcancer      ins    smoke     agec     educ race_eth 
##       85      111     1025        0      126      523 
## 
## analyses :
## [[1]]
## [1] 0.2922792
## 
## [[2]]
## [1] 0.2923348
## 
## [[3]]
## [1] 0.2924457
## 
## [[4]]
## [1] 0.2923348
## 
## [[5]]
## [1] 0.2927229

Frequency table for ins (insurance)

with (data=imp, exp=(prop.table(table(ins))))

## call :
## with.mids(data = imp, expr = (prop.table(table(ins))))
## 
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ", 
##     "race_eth")], m = 5, seed = 22)
## 
## nmis :
## skcancer      ins    smoke     agec     educ race_eth 
##       85      111     1025        0      126      523 
## 
## analyses :
## [[1]]
## ins
##       0       1 
## 0.07708 0.92292 
## 
## [[2]]
## ins
##       0       1 
## 0.07756 0.92244 
## 
## [[3]]
## ins
##       0       1 
## 0.07736 0.92264 
## 
## [[4]]
## ins
##      0      1 
## 0.0774 0.9226 
## 
## [[5]]
## ins
##       0       1 
## 0.07724 0.92276

Frequency table for race/ethnicty

with (data=imp, exp=(prop.table(table(race_eth))))

## call :
## with.mids(data = imp, expr = (prop.table(table(race_eth))))
## 
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ", 
##     "race_eth")], m = 5, seed = 22)
## 
## nmis :
## skcancer      ins    smoke     agec     educ race_eth 
##       85      111     1025        0      126      523 
## 
## analyses :
## [[1]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.73956      0.09528      0.10584      0.01800      0.04132 
## 
## [[2]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.73908      0.09516      0.10644      0.01808      0.04124 
## 
## [[3]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.73932      0.09528      0.10612      0.01820      0.04108 
## 
## [[4]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.74000      0.09508      0.10572      0.01804      0.04116 
## 
## [[5]]
## race_eth
##      nhwhite     hispanic     nh black nh multirace     nh other 
##      0.73952      0.09500      0.10612      0.01812      0.04124

Frequency table for education

with (data=imp, exp=(prop.table(table(educ))))

## call :
## with.mids(data = imp, expr = (prop.table(table(educ))))
## 
## call1 :
## mice(data = dat2[, c("skcancer", "ins", "smoke", "agec", "educ", 
##     "race_eth")], m = 5, seed = 22)
## 
## nmis :
## skcancer      ins    smoke     agec     educ race_eth 
##       85      111     1025        0      126      523 
## 
## analyses :
## [[1]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.24284  0.02276  0.03916  0.27568  0.41956 
## 
## [[2]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.24352  0.02272  0.03888  0.27556  0.41932 
## 
## [[3]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.24292  0.02276  0.03896  0.27588  0.41948 
## 
## [[4]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.24344  0.02276  0.03884  0.27560  0.41936 
## 
## [[5]]
## educ
##  2hsgrad    0Prim  1somehs 3somecol 4colgrad 
##  0.24292  0.02276  0.03908  0.27560  0.41964

Now we pool the separate models from each imputed data set:

est.p<-pool(fit.skcancer)
print(est.p)

## Class: mipo    m = 5 
##                    term m      estimate         ubar            b            t
## 1           (Intercept) 5  0.0115779543 9.833231e-05 4.563839e-07 9.887997e-05
## 2                   ins 5  0.0126247165 4.844477e-05 2.579594e-07 4.875432e-05
## 3                 smoke 5 -0.0087815295 2.848660e-05 1.121451e-06 2.983234e-05
## 4           agec(24,39] 5 -0.0006450166 7.049146e-05 6.357648e-08 7.056775e-05
## 5           agec(39,59] 5  0.0380808069 6.281770e-05 8.377785e-09 6.282775e-05
## 6           agec(59,79] 5  0.1231706867 6.190020e-05 2.081743e-07 6.215001e-05
## 7           agec(79,99] 5  0.2245474338 9.498081e-05 3.938617e-07 9.545344e-05
## 8             educ0Prim 5 -0.0120281864 1.649866e-04 2.034961e-06 1.674285e-04
## 9           educ1somehs 5  0.0069849618 9.594727e-05 1.353638e-06 9.757164e-05
## 10         educ3somecol 5  0.0161394260 2.468687e-05 3.042716e-07 2.505200e-05
## 11         educ4colgrad 5  0.0292941235 2.208308e-05 4.439632e-07 2.261584e-05
## 12     race_ethhispanic 5 -0.0597875332 4.355802e-05 1.792395e-06 4.570889e-05
## 13     race_ethnh black 5 -0.0963156456 3.471102e-05 9.771774e-08 3.482828e-05
## 14 race_ethnh multirace 5 -0.0381152507 1.800679e-04 3.490821e-06 1.842569e-04
## 15     race_ethnh other 5 -0.0613703194 8.226185e-05 6.664614e-07 8.306161e-05
##    dfcom        df         riv       lambda         fmi
## 1  24985 20868.425 0.005569488 0.0055386408 0.005633935
## 2  24985 19856.600 0.006389777 0.0063492068 0.006449274
## 3  24985  1816.039 0.047241184 0.0451101283 0.046160012
## 4  24985 24775.324 0.001082284 0.0010811140 0.001161742
## 5  24985 24975.009 0.000160040 0.0001600144 0.000240072
## 6  24985 22610.234 0.004035676 0.0040194551 0.004107543
## 7  24985 21572.348 0.004976100 0.0049514615 0.005043701
## 8  24985 10660.913 0.014800924 0.0145850520 0.014769865
## 9  24985  9091.469 0.016929770 0.0166479249 0.016864178
## 10 24985 10669.529 0.014790285 0.0145747207 0.014759386
## 11 24985  5564.115 0.024125067 0.0235567587 0.023907548
## 12 24985  1679.068 0.049379505 0.0470559074 0.048188970
## 13 24985 23257.792 0.003378215 0.0033668413 0.003452533
## 14 24985  5876.387 0.023263365 0.0227344848 0.023066923
## 15 24985 15724.981 0.009722049 0.0096284403 0.009754378

summary(est.p)

##                    term      estimate   std.error    statistic        df
## 1           (Intercept)  0.0115779543 0.009943841   1.16433424 20868.425
## 2                   ins  0.0126247165 0.006982429   1.80806937 19856.600
## 3                 smoke -0.0087815295 0.005461899  -1.60777960  1816.039
## 4           agec(24,39] -0.0006450166 0.008400461  -0.07678348 24775.324
## 5           agec(39,59]  0.0380808069 0.007926396   4.80430304 24975.009
## 6           agec(59,79]  0.1231706867 0.007883528  15.62380374 22610.234
## 7           agec(79,99]  0.2245474338 0.009770028  22.98329544 21572.348
## 8             educ0Prim -0.0120281864 0.012939417  -0.92957712 10660.913
## 9           educ1somehs  0.0069849618 0.009877836   0.70713484  9091.469
## 10         educ3somecol  0.0161394260 0.005005197   3.22453355 10669.529
## 11         educ4colgrad  0.0292941235 0.004755611   6.15990682  5564.115
## 12     race_ethhispanic -0.0597875332 0.006760835  -8.84321698  1679.068
## 13     race_ethnh black -0.0963156456 0.005901549 -16.32040124 23257.792
## 14 race_ethnh multirace -0.0381152507 0.013574125  -2.80793424  5876.387
## 15     race_ethnh other -0.0613703194 0.009113814  -6.73376911 15724.981
##         p.value
## 1  2.443019e-01
## 2  7.061083e-02
## 3  1.080573e-01
## 4  9.387964e-01
## 5  1.561923e-06
## 6  0.000000e+00
## 7  0.000000e+00
## 8  3.526111e-01
## 9  4.795008e-01
## 10 1.265579e-03
## 11 7.789072e-10
## 12 0.000000e+00
## 13 0.000000e+00
## 14 5.002486e-03
## 15 1.710498e-11

We need to pay attention to the fmi column and the lambda column. These convey information about how much the missingness of each particular variable affects the model coefficients.

lam<-data.frame(lam=est.p$pooled$lambda, param=row.names(est.p$pooled))

ggplot(data=lam,aes(x=param, y=lam))+geom_col()+theme(axis.text.x = element_text(angle = 45, hjust = 1))

It appears that a couple of the education variables and the income variables have large variances to them. This suggests that there may be noticeable variation in the resulting coefficient of the model, depending on which imputed data set we use.

We can also compare to the model fit on the original data, with missings eliminated:

library(dplyr)
bnm<-brfss_17%>%
  select(skcancer, ins, smoke, agec, educ, race_eth)%>%
  filter(complete.cases(.))%>%
  as.data.frame()

summary(lm(skcancer~ins+smoke+agec+educ+race_eth, bnm))

## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth, 
##     data = bnm)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27961 -0.14919 -0.07166 -0.01201  1.05804 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0120603  0.0104914   1.150 0.250343    
## ins                   0.0115089  0.0073683   1.562 0.118312    
## smoke                -0.0071162  0.0055936  -1.272 0.203311    
## agec(24,39]          -0.0005541  0.0088397  -0.063 0.950024    
## agec(39,59]           0.0380715  0.0083507   4.559 5.16e-06 ***
## agec(59,79]           0.1256237  0.0082870  15.159  < 2e-16 ***
## agec(79,99]           0.2254183  0.0102525  21.987  < 2e-16 ***
## educ0Prim            -0.0177314  0.0136736  -1.297 0.194725    
## educ1somehs           0.0068544  0.0102505   0.669 0.503700    
## educ3somecol          0.0171326  0.0052042   3.292 0.000996 ***
## educ4colgrad          0.0306204  0.0049114   6.235 4.61e-10 ***
## race_ethhispanic     -0.0597337  0.0069275  -8.623  < 2e-16 ***
## race_ethnh black     -0.0981906  0.0062106 -15.810  < 2e-16 ***
## race_ethnh multirace -0.0416276  0.0139650  -2.981 0.002878 ** 
## race_ethnh other     -0.0645533  0.0095961  -6.727 1.77e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2831 on 23261 degrees of freedom
## Multiple R-squared:  0.07758,    Adjusted R-squared:  0.07702 
## F-statistic: 139.7 on 14 and 23261 DF,  p-value: < 2.2e-16

Compare imputed model to original data

Here, I compare the coefficients from the model where we eliminated all missing data to the one that we fit on the imputed data:

fit1<-lm(skcancer~ins+smoke+agec+educ+race_eth, data=brfss_17)
summary(fit1)

## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth, 
##     data = brfss_17)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27961 -0.14919 -0.07166 -0.01201  1.05804 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0120603  0.0104914   1.150 0.250343    
## ins                   0.0115089  0.0073683   1.562 0.118312    
## smoke                -0.0071162  0.0055936  -1.272 0.203311    
## agec(24,39]          -0.0005541  0.0088397  -0.063 0.950024    
## agec(39,59]           0.0380715  0.0083507   4.559 5.16e-06 ***
## agec(59,79]           0.1256237  0.0082870  15.159  < 2e-16 ***
## agec(79,99]           0.2254183  0.0102525  21.987  < 2e-16 ***
## educ0Prim            -0.0177314  0.0136736  -1.297 0.194725    
## educ1somehs           0.0068544  0.0102505   0.669 0.503700    
## educ3somecol          0.0171326  0.0052042   3.292 0.000996 ***
## educ4colgrad          0.0306204  0.0049114   6.235 4.61e-10 ***
## race_ethhispanic     -0.0597337  0.0069275  -8.623  < 2e-16 ***
## race_ethnh black     -0.0981906  0.0062106 -15.810  < 2e-16 ***
## race_ethnh multirace -0.0416276  0.0139650  -2.981 0.002878 ** 
## race_ethnh other     -0.0645533  0.0095961  -6.727 1.77e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2831 on 23261 degrees of freedom
##   (1724 observations deleted due to missingness)
## Multiple R-squared:  0.07758,    Adjusted R-squared:  0.07702 
## F-statistic: 139.7 on 14 and 23261 DF,  p-value: < 2.2e-16

fit.imp<-lm(skcancer~ins+smoke+agec+educ+race_eth, data=dat.imp)
summary(fit.imp)

## 
## Call:
## lm(formula = skcancer ~ ins + smoke + agec + educ + race_eth, 
##     data = dat.imp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27741 -0.14700 -0.06937 -0.01022  1.05607 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0108905  0.0099258   1.097 0.272564    
## ins                   0.0130592  0.0069626   1.876 0.060718 .  
## smoke                -0.0093091  0.0053105  -1.753 0.079622 .  
## agec(24,39]          -0.0006711  0.0083917  -0.080 0.936263    
## agec(39,59]           0.0380528  0.0079203   4.804 1.56e-06 ***
## agec(59,79]           0.1230458  0.0078637  15.647  < 2e-16 ***
## agec(79,99]           0.2240718  0.0097402  23.005  < 2e-16 ***
## educ0Prim            -0.0099725  0.0128370  -0.777 0.437250    
## educ1somehs           0.0073718  0.0097749   0.754 0.450760    
## educ3somecol          0.0169110  0.0049668   3.405 0.000663 ***
## educ4colgrad          0.0293885  0.0046950   6.260 3.92e-10 ***
## race_ethhispanic     -0.0596209  0.0065934  -9.042  < 2e-16 ***
## race_ethnh black     -0.0962605  0.0058940 -16.332  < 2e-16 ***
## race_ethnh multirace -0.0381190  0.0134458  -2.835 0.004586 ** 
## race_ethnh other     -0.0611399  0.0090538  -6.753 1.48e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2809 on 24985 degrees of freedom
## Multiple R-squared:  0.07718,    Adjusted R-squared:  0.07667 
## F-statistic: 149.3 on 14 and 24985 DF,  p-value: < 2.2e-16

In the analysis that only uses complete cases, we see a non significant insurance (ins) effect on skin cancer (skcancer), and when i impute the missing values. This suggests a non significant selection effect of the insurance variable.

DEM 7283 - Multiple Imputation & Missing Data

Assogba Albert

4/10/2021

Packages

Recode variables

Measure an outcome variable and at least 5 predictors.The predictor variables i am using are: insurance, race, education, age, and smoking for skin cancer.

Check for missingness of these variables by just using `summary(brfss_17)`

Report the pattern of missingness among all of these variables

Mean Imputation

Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?

Multiple Imputation

Were the results similar between the mean/modal and multiple imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?

Basic imputation:

Plotting of skin cancer (skcancer) and race:

Analyzing the imputed data

variation in skin cancer (skcancer)

Frequency table for ins (insurance)

Frequency table for race/ethnicty

Frequency table for education

Compare imputed model to original data

DEM 7283 - Multiple Imputation & Missing Data

Assogba Albert

4/10/2021

Packages

Recode variables

Measure an outcome variable and at least 5 predictors.The predictor variables i am using are: insurance, race, education, age, and smoking for skin cancer.

Check for missingness of these variables by just using summary(brfss_17)

Report the pattern of missingness among all of these variables

Perform a mean (a mean for numeric data) or a modal imputation (for categorical data) of all values. Perform the analysis using this imputed data. What are your results?

Mean Imputation

Perform a multiple imputation of all values. Perform the analysis using this imputed data set. What are your results?

Multiple Imputation

Were the results similar between the mean/modal and multiple imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?

Basic imputation:

Plotting of skin cancer (skcancer) and race:

Analyzing the imputed data

variation in skin cancer (skcancer)

Frequency table for ins (insurance)

Frequency table for race/ethnicty

Frequency table for education

Compare imputed model to original data

Check for missingness of these variables by just using `summary(brfss_17)`