HW8

I’ll be using the BRFSS 2020 data set since previous data I used did not have very many missing values to do imputation on.

library(car)
library(stargazer)
library(survey)
library(ggplot2)
library(pander)
library(dplyr)
library(knitr)
library(factoextra)
library(FactoMineR)
library(mice)

brfss20<- readRDS(url("https://github.com/coreysparks/DEM7283/blob/master/data/brfss20sm.rds?raw=true"))
names(brfss20) <- tolower(gsub(pattern = "_",replacement =  "",x =  names(brfss20)))

#smaller sample
samps<- sample(1:dim(brfss20)[1], size = 10000, replace = F)
brfss20<- brfss20[samps,]

brfss20$drink <- Recode(brfss20$drnkwk1, recodes = "99900=NA")

#sex
brfss20$male<-as.factor(ifelse(brfss20$sex==1,
                                "Male",
                                "Female"))

#binge drinking
brfss20$bingedrink <- Recode(brfss20$drnk3ge5, recodes = "88=0; 77=NA; 99=NA")

#smoking
brfss20$smoke <- Recode(brfss20$smoke100, recodes = "2=0; 7=NA; 9=NA", as.factor=T)

#drink and drive
brfss20$drinkdrive <- Recode(brfss20$drnkdri2, recodes = "88=0; 77=NA; 99=NA")

#sealt belt useage
brfss20$seatbeltdrive <- Recode(brfss20$seatbelt, recodes = "1:2 = 1; 3:5 = 0; 7=NA; 8  = NA; 9=NA")

#sleep per night
brfss20$sleep <- Recode(brfss20$sleptim1, recodes = "77=NA; 99=NA")

#exercise
brfss20$exercise <- Recode(brfss20$exerany2, recodes = "2=0; 7=NA; 9=NA")

#healthy days
brfss20$healthdays<-Recode(brfss20$physhlth, recodes = "88=0; 77=NA; 99=NA", as.factor=T)

#bmi
brfss20$bmi<-brfss20$bmi5/100

#income
brfss20$income <- Recode(brfss20$income2, recodes = "77=NA; 99=NA", as.factor=T)

#marital
brfss20$marst<-Recode(brfss20$marital,
                       recodes="1='married'; 2='divorced'; 3='widowed'; 4='separated'; 5='nm';6='cohab'; else=NA",
                       as.factor=T)
#employ
brfss20$employ<-Recode(brfss20$employ1,
                        recodes="1:2='Employed'; 2:6='nilf'; 7='retired'; 8='unable'; else=NA",
                        as.factor=T)


brfss20a <- brfss20 %>% select(healthdays, smoke, bmi, income, marst, employ)


summary(brfss20a)

##    healthdays    smoke           bmi            income           marst     
##  0      :7002   0   :5762   Min.   :12.88   8      :3331   cohab    : 412  
##  30     : 526   1   :3677   1st Qu.:23.81   7      :1251   divorced :1230  
##  2      : 440   NA's: 561   Median :27.12   6      :1054   married  :4986  
##  1      : 342               Mean   :28.13   5      : 685   nm       :2069  
##  3      : 266               3rd Qu.:31.19   4      : 602   separated: 230  
##  (Other):1195               Max.   :76.52   (Other): 983   widowed  : 950  
##  NA's   : 229               NA's   :1118    NA's   :2094   NA's     : 123  
##       employ    
##  Employed:5209  
##  nilf    :1373  
##  retired :2668  
##  unable  : 533  
##  NA's    : 217  
##                 
##

mcv.marriage <- factor(names(which.max(table(brfss20$marst))),
                       levels = levels(brfss20$marst))
mcv.marriage

## [1] married
## Levels: cohab divorced married nm separated widowed

brfss20$marst.imp<-as.factor(ifelse(is.na(brfss20$marst)==T, mcv.marriage, brfss20$marst))
levels(brfss20$marst.imp)<-levels(brfss20$marst)

prop.table(table(brfss20$marst))

## 
##      cohab   divorced    married         nm  separated    widowed 
## 0.04171307 0.12453174 0.50480915 0.20947656 0.02328642 0.09618305

prop.table(table(brfss20$marst.imp))

## 
##     cohab  divorced   married        nm separated   widowed 
##    0.0412    0.1230    0.5109    0.2069    0.0230    0.0950

mcv.employ <- factor(names(which.max(table(brfss20$employ))),
                       levels = levels(brfss20$employ))
mcv.employ

## [1] Employed
## Levels: Employed nilf retired unable

brfss20$employ.imp<-as.factor(ifelse(is.na(brfss20$employ)==T, mcv.employ, brfss20$employ))
levels(brfss20$employ.imp)<-levels(brfss20$employ)

prop.table(table(brfss20$employ))

## 
##   Employed       nilf    retired     unable 
## 0.53245426 0.14034550 0.27271798 0.05448227

prop.table(table(brfss20$employ.imp))

## 
## Employed     nilf  retired   unable 
##   0.5426   0.1373   0.2668   0.0533

mcv.healthdays <- factor(names(which.max(table(brfss20$healthdays))),
                     levels = levels(brfss20$healthdays))
mcv.healthdays

## [1] 0
## 31 Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 30

brfss20$healthdays.imp<-as.factor(ifelse(is.na(brfss20$healthdays)==T, mcv.healthdays, brfss20$healthdays))
levels(brfss20$healthdays.imp)<-levels(brfss20$healthdays)

prop.table(table(brfss20$healthdays))

## 
##            0            1            2            3            4            5 
## 0.7166103776 0.0350015352 0.0450312148 0.0272234162 0.0122812404 0.0251765428 
##            6            7            8            9           10           11 
## 0.0037867158 0.0152492068 0.0022515607 0.0008187494 0.0151468632 0.0002046873 
##           12           13           14           15           16           17 
## 0.0022515607 0.0006140620 0.0079828063 0.0150445195 0.0004093747 0.0007164057 
##           18           19           20           21           22           23 
## 0.0006140620 0.0003070310 0.0106437417 0.0020468734 0.0002046873 0.0001023437 
##           24           25           26           27           28           29 
## 0.0004093747 0.0034796848 0.0001023437 0.0004093747 0.0015351551 0.0005117184 
##           30 
## 0.0538327704

prop.table(table(brfss20$healthdays.imp))

## 
##      0      1      2      3      4      5      6      7      8      9     10 
## 0.7231 0.0342 0.0440 0.0266 0.0120 0.0246 0.0037 0.0149 0.0022 0.0008 0.0148 
##     11     12     13     14     15     16     17     18     19     20     21 
## 0.0002 0.0022 0.0006 0.0078 0.0147 0.0004 0.0007 0.0006 0.0003 0.0104 0.0020 
##     22     23     24     25     26     27     28     29     30 
## 0.0002 0.0001 0.0004 0.0034 0.0001 0.0004 0.0015 0.0005 0.0526

mcv.smoke <- factor(names(which.max(table(brfss20$smoke))),
                     levels = levels(brfss20$smoke))
mcv.smoke

## [1] 0
## Levels: 0 1

brfss20$smoke.imp<-as.factor(ifelse(is.na(brfss20$smoke)==T, mcv.smoke, brfss20$smoke))
levels(brfss20$smoke.imp)<-levels(brfss20$smoke)

prop.table(table(brfss20$smoke))

## 
##        0        1 
## 0.610446 0.389554

prop.table(table(brfss20$smoke.imp))

## 
##      0      1 
## 0.6323 0.3677

mcv.income <- factor(names(which.max(table(brfss20$income))),
                     levels = levels(brfss20$income))
mcv.income

## [1] 8
## Levels: 1 2 3 4 5 6 7 8

brfss20$income.imp<-as.factor(ifelse(is.na(brfss20$income)==T, mcv.income, brfss20$income))
levels(brfss20$income.imp)<-levels(brfss20$income)

prop.table(table(brfss20$income))

## 
##          1          2          3          4          5          6          7 
## 0.03389831 0.03706046 0.05337718 0.07614470 0.08664306 0.13331647 0.15823425 
##          8 
## 0.42132558

prop.table(table(brfss20$income.imp))

## 
##      1      2      3      4      5      6      7      8 
## 0.0268 0.0293 0.0422 0.0602 0.0685 0.1054 0.1251 0.5425

barplot(prop.table(table(brfss20$income)), main="Original Data", ylim=c(0, .6))

barplot(prop.table(table(brfss20$income.imp)), main="Imputed Data",ylim=c(0, .6))

brfss20$bmi.imp.mean<-ifelse(is.na(brfss20$bmi)==T, mean(brfss20$bmi, na.rm=T), brfss20$bmi)
var(brfss20$bmi, na.rm=T)

## [1] 39.35813

var(brfss20$bmi.imp.mean)

## [1] 34.95745

modalfit <- lm(bmi.imp.mean ~ marst.imp + employ.imp + income.imp + healthdays.imp + smoke.imp, brfss20)
modalfit

## 
## Call:
## lm(formula = bmi.imp.mean ~ marst.imp + employ.imp + income.imp + 
##     healthdays.imp + smoke.imp, data = brfss20)
## 
## Coefficients:
##        (Intercept)   marst.impdivorced    marst.impmarried         marst.impnm  
##           28.50971             0.58690             0.55605            -0.15798  
## marst.impseparated    marst.impwidowed      employ.impnilf   employ.impretired  
##            0.40149            -0.03181            -0.94370            -0.60174  
##   employ.impunable         income.imp2         income.imp3         income.imp4  
##            1.31099            -0.26913             0.24923            -0.01683  
##        income.imp5         income.imp6         income.imp7         income.imp8  
##           -0.10722            -0.46312            -0.72192            -1.31292  
##    healthdays.imp1     healthdays.imp2     healthdays.imp3     healthdays.imp4  
##           -0.26947             0.74340             1.43539             0.46237  
##    healthdays.imp5     healthdays.imp6     healthdays.imp7     healthdays.imp8  
##            1.31948             1.16212             0.97237             1.57721  
##    healthdays.imp9    healthdays.imp10    healthdays.imp11    healthdays.imp12  
##            5.94323             1.47129            18.68219             0.70291  
##   healthdays.imp13    healthdays.imp14    healthdays.imp15    healthdays.imp16  
##            3.47764             2.13197             1.42013             0.15513  
##   healthdays.imp17    healthdays.imp18    healthdays.imp19    healthdays.imp20  
##            5.04144             1.78923             4.47878             2.53205  
##   healthdays.imp21    healthdays.imp22    healthdays.imp23    healthdays.imp24  
##           -1.20322             9.33809             2.72118             8.34987  
##   healthdays.imp25    healthdays.imp26    healthdays.imp27    healthdays.imp28  
##            1.89663           -11.45502             3.96500             3.32930  
##   healthdays.imp29    healthdays.imp30          smoke.imp1  
##           -0.97005             2.05718             0.05575

I used a mix of mean and modal imputation for the first attempt. The tables show how the modal imputation can make noticeable differences.

md.pattern(brfss20[,c("bmi", "healthdays", "smoke","income","marst", "employ")])

##      marst employ healthdays smoke  bmi income     
## 7076     1      1          1     1    1      1    0
## 1314     1      1          1     1    1      0    1
## 414      1      1          1     1    0      1    1
## 255      1      1          1     1    0      0    2
## 127      1      1          1     0    1      1    1
## 56       1      1          1     0    1      0    2
## 59       1      1          1     0    0      1    2
## 182      1      1          1     0    0      0    3
## 124      1      1          0     1    1      1    1
## 49       1      1          0     1    1      0    2
## 10       1      1          0     1    0      1    2
## 11       1      1          0     1    0      0    3
## 3        1      1          0     0    1      1    2
## 3        1      1          0     0    1      0    3
## 1        1      1          0     0    0      1    3
## 7        1      1          0     0    0      0    4
## 39       1      0          1     1    1      1    1
## 18       1      0          1     1    1      0    2
## 5        1      0          1     1    0      1    2
## 14       1      0          1     1    0      0    3
## 1        1      0          1     0    1      1    2
## 2        1      0          1     0    1      0    3
## 1        1      0          1     0    0      1    3
## 92       1      0          1     0    0      0    4
## 4        1      0          0     1    1      1    2
## 3        1      0          0     1    1      0    3
## 2        1      0          0     1    0      1    3
## 2        1      0          0     1    0      0    4
## 3        1      0          0     0    0      0    5
## 29       0      1          1     1    1      1    1
## 26       0      1          1     1    1      0    2
## 7        0      1          1     1    0      1    2
## 19       0      1          1     1    0      0    3
## 2        0      1          1     0    1      0    3
## 1        0      1          1     0    0      1    3
## 4        0      1          1     0    0      0    4
## 1        0      1          0     1    1      0    3
## 2        0      1          0     1    0      0    4
## 1        0      1          0     0    0      1    4
## 1        0      0          1     1    1      1    2
## 3        0      0          1     1    1      0    3
## 9        0      0          1     1    0      0    4
## 15       0      0          1     0    0      0    5
## 1        0      0          0     1    1      0    4
## 1        0      0          0     1    0      0    5
## 1        0      0          0     0    0      1    5
##        123    217        229   561 1118   2094 4342

dat2<-brfss20
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$bmiknock<-dat2$bmi
dat2$bmiknock[samp2]<-NA

head(dat2[, c("bmiknock","bmi")])

## # A tibble: 6 × 2
##   bmiknock   bmi
##      <dbl> <dbl>
## 1     NA    NA  
## 2     26.5  26.5
## 3     NA    NA  
## 4     NA    27.8
## 5     25.7  25.7
## 6     41.2  41.2

imp<-mice(data = dat2[,c("bmi", "healthdays", "smoke","income","marst", "employ")], seed = 7, m = 4)

## 
##  iter imp variable
##   1   1  bmi  healthdays  smoke  income  marst  employ
##   1   2  bmi  healthdays  smoke  income  marst  employ
##   1   3  bmi  healthdays  smoke  income  marst  employ
##   1   4  bmi  healthdays  smoke  income  marst  employ
##   2   1  bmi  healthdays  smoke  income  marst  employ
##   2   2  bmi  healthdays  smoke  income  marst  employ
##   2   3  bmi  healthdays  smoke  income  marst  employ
##   2   4  bmi  healthdays  smoke  income  marst  employ
##   3   1  bmi  healthdays  smoke  income  marst  employ
##   3   2  bmi  healthdays  smoke  income  marst  employ
##   3   3  bmi  healthdays  smoke  income  marst  employ
##   3   4  bmi  healthdays  smoke  income  marst  employ
##   4   1  bmi  healthdays  smoke  income  marst  employ
##   4   2  bmi  healthdays  smoke  income  marst  employ
##   4   3  bmi  healthdays  smoke  income  marst  employ
##   4   4  bmi  healthdays  smoke  income  marst  employ
##   5   1  bmi  healthdays  smoke  income  marst  employ
##   5   2  bmi  healthdays  smoke  income  marst  employ
##   5   3  bmi  healthdays  smoke  income  marst  employ
##   5   4  bmi  healthdays  smoke  income  marst  employ

## Warning: Number of logged events: 16

print(imp)

## Class: mids
## Number of multiple imputations:  4 
## Imputation methods:
##        bmi healthdays      smoke     income      marst     employ 
##      "pmm"  "polyreg"   "logreg"  "polyreg"  "polyreg"  "polyreg" 
## PredictorMatrix:
##            bmi healthdays smoke income marst employ
## bmi          0          1     1      1     1      1
## healthdays   1          0     1      1     1      1
## smoke        1          1     0      1     1      1
## income       1          1     1      0     1      1
## marst        1          1     1      1     0      1
## employ       1          1     1      1     1      0
## Number of logged events:  16 
##   it im    dep    meth          out
## 1  1  1 income polyreg healthdays26
## 2  1  2 income polyreg healthdays26
## 3  1  3 income polyreg healthdays26
## 4  1  4 income polyreg healthdays26
## 5  2  1 income polyreg healthdays26
## 6  2  3 income polyreg healthdays26

plot(imp)

fit.bmi<-with(data=imp ,expr=lm(bmi~healthdays+smoke+income+marst+employ))
fit.bmi

## call :
## with.mids(data = imp, expr = lm(bmi ~ healthdays + smoke + income + 
##     marst + employ))
## 
## call1 :
## mice(data = dat2[, c("bmi", "healthdays", "smoke", "income", 
##     "marst", "employ")], m = 4, seed = 7)
## 
## nmis :
##        bmi healthdays      smoke     income      marst     employ 
##       1118        229        561       2094        123        217 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
## 
## Coefficients:
##    (Intercept)     healthdays1     healthdays2     healthdays3     healthdays4  
##       28.19726        -0.25091         0.85013         1.64730         0.84283  
##    healthdays5     healthdays6     healthdays7     healthdays8     healthdays9  
##        1.48941         1.50840         1.13981         1.60669         6.07123  
##   healthdays10    healthdays11    healthdays12    healthdays13    healthdays14  
##        1.63952        18.80190         1.27275         6.37808         2.80542  
##   healthdays15    healthdays16    healthdays17    healthdays18    healthdays19  
##        1.65587         0.34986         5.28614         1.93008         7.09659  
##   healthdays20    healthdays21    healthdays22    healthdays23    healthdays24  
##        2.67213        -1.70368         9.16811         3.15394         6.94956  
##   healthdays25    healthdays26    healthdays27    healthdays28    healthdays29  
##        2.46149       -12.17997         2.69344         4.49536        -1.17494  
##   healthdays30          smoke1         income2         income3         income4  
##        2.35745         0.03910        -0.30122         0.21496         0.11233  
##        income5         income6         income7         income8   marstdivorced  
##        0.28607        -0.38674        -0.60542        -1.27028         0.57068  
##   marstmarried         marstnm  marstseparated    marstwidowed      employnilf  
##        0.74910        -0.32093         0.34471        -0.08591        -0.98734  
##  employretired    employunable  
##       -0.68229         1.41011  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
## 
## Coefficients:
##    (Intercept)     healthdays1     healthdays2     healthdays3     healthdays4  
##      28.303075       -0.340933        0.683127        1.519277        0.515168  
##    healthdays5     healthdays6     healthdays7     healthdays8     healthdays9  
##       1.093098        0.925047        1.279870        1.580463        5.712779  
##   healthdays10    healthdays11    healthdays12    healthdays13    healthdays14  
##       1.539549       18.696583        0.430299        6.256804        2.829955  
##   healthdays15    healthdays16    healthdays17    healthdays18    healthdays19  
##       1.698378        0.215151        4.648968        1.842550        4.823718  
##   healthdays20    healthdays21    healthdays22    healthdays23    healthdays24  
##       2.959699       -0.718991        9.042631        2.628872        8.351185  
##   healthdays25    healthdays26    healthdays27    healthdays28    healthdays29  
##       1.945206      -12.995989        3.833236        5.022940       -1.278370  
##   healthdays30          smoke1         income2         income3         income4  
##       2.048114       -0.031526       -0.175398       -0.087313       -0.163713  
##        income5         income6         income7         income8   marstdivorced  
##      -0.217119       -0.433789       -0.685998       -1.212250        0.849371  
##   marstmarried         marstnm  marstseparated    marstwidowed      employnilf  
##       0.898106        0.040303        0.726821        0.004742       -1.147596  
##  employretired    employunable  
##      -0.748357        1.507619  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
## 
## Coefficients:
##    (Intercept)     healthdays1     healthdays2     healthdays3     healthdays4  
##      28.877425       -0.008054        0.843664        1.534663        0.491331  
##    healthdays5     healthdays6     healthdays7     healthdays8     healthdays9  
##       1.412764        1.474607        1.150435        1.612564        5.672960  
##   healthdays10    healthdays11    healthdays12    healthdays13    healthdays14  
##       1.544565       15.022443        0.311428        4.151501        2.501833  
##   healthdays15    healthdays16    healthdays17    healthdays18    healthdays19  
##       1.484717        0.251446        5.179148        2.163598        3.312584  
##   healthdays20    healthdays21    healthdays22    healthdays23    healthdays24  
##       2.897171       -1.670294        9.503182        2.681803        8.389422  
##   healthdays25    healthdays26    healthdays27    healthdays28    healthdays29  
##       1.857376      -12.993680        4.237103        5.798033       -0.897972  
##   healthdays30          smoke1         income2         income3         income4  
##       2.171577       -0.012529       -0.268680       -0.063604       -0.255409  
##        income5         income6         income7         income8   marstdivorced  
##      -0.652926       -0.945623       -1.220824       -1.666822        0.580138  
##   marstmarried         marstnm  marstseparated    marstwidowed      employnilf  
##       0.432289       -0.132406        0.473893       -0.375350       -1.214328  
##  employretired    employunable  
##      -0.677602        1.263626  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = bmi ~ healthdays + smoke + income + marst + employ)
## 
## Coefficients:
##    (Intercept)     healthdays1     healthdays2     healthdays3     healthdays4  
##      28.037218       -0.312995        0.720361        1.360711        0.606892  
##    healthdays5     healthdays6     healthdays7     healthdays8     healthdays9  
##       1.289100        2.216713        1.183092        1.195000        5.989801  
##   healthdays10    healthdays11    healthdays12    healthdays13    healthdays14  
##       1.693272       18.662799        0.501123        5.381076        2.333498  
##   healthdays15    healthdays16    healthdays17    healthdays18    healthdays19  
##       1.750627        0.261726        5.326209        2.028709        3.717946  
##   healthdays20    healthdays21    healthdays22    healthdays23    healthdays24  
##       2.425132       -2.175929        9.300465        3.019207        8.494965  
##   healthdays25    healthdays26    healthdays27    healthdays28    healthdays29  
##       2.208981      -12.780378        3.828195        3.184567       -1.158696  
##   healthdays30          smoke1         income2         income3         income4  
##       2.075422       -0.003936        0.301801        0.379315        0.111738  
##        income5         income6         income7         income8   marstdivorced  
##       0.556141       -0.141529       -0.421603       -1.175965        0.763116  
##   marstmarried         marstnm  marstseparated    marstwidowed      employnilf  
##       0.810793       -0.120460        0.667039       -0.012388       -1.076503  
##  employretired    employunable  
##      -0.710545        1.033915

Overall, the results are somewhat similar to the modal/mean imputation to the multiply imputated data.

HW8

Drew Schaefer

4/4/2022