DEM 7283 Homework 6

Professor C. Sparks

library(survey)

## Loading required package: grid

## Loading required package: Matrix

## Loading required package: survival

## 
## Attaching package: 'survey'

## The following object is masked from 'package:graphics':
## 
##     dotchart

library(MASS)

mydata<-haven::read_xpt("~/Documents/1A_DEM Doctorate/DEM 7283/R/LLCP2017.xpt")

Clean up names

newnames<-tolower(gsub(pattern = "_", replacement = "", x= names(mydata))) 
names(mydata)<-newnames

Recode variables

library(car)

## Loading required package: carData

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#recode age groups
mydata$agegps<-Recode(mydata$ageg5yr, recodes="1=0; 2:3=1; 4:5=2; 6:8=3; 9:11=4; 12:13=5; else=NA")
mydata$agegpnam<-Recode(mydata$ageg5yr, recodes="1='0_18-24'; 2:3='25-34'; 4:5='35-44'; 6:8='45-59'; 9:11='60-74'; 12:13='75plus'; else=NA", as.factor=T)

#recode sex variable to male or female
mydata$male<-Recode(mydata$sex, recodes="1='male'; 2='0female'; else=NA")

#recode 5 level race/ethnicity 
mydata <- mydata %>%
  mutate(racethlvl = ifelse(racegr3 == 1, "0nh white", 
                                ifelse(racegr3== 2, "nh black",
                                ifelse(racegr3 == 3, "nh other/multi",
                                ifelse(racegr3 == 4, "nh other/multi",
                                ifelse(racegr3 == 5, "hispanic", NA))))))

#individual dummy variables
mydata$black<-Recode(mydata$racegr3, recodes="2='nh black';9=NA; else='0non black'")
mydata$white<-Recode(mydata$racegr3, recodes="1='nh white'; 9=NA; else='0non white'")
mydata$other<-Recode(mydata$racegr3, recodes="3:4='nh other/multi'; 9=NA; else='0non other/multi'")
mydata$hispanic<-Recode(mydata$racegr3, recodes="5='hispanic'; 9=NA; else='0non hispanic'")


#recode marital status
mydata <- mydata %>%
  mutate(maritallvl = ifelse(marital == 1, "Married", 
                                ifelse(marital == 2, "PrevMarried",
                                ifelse(marital == 3, "PrevMarried",
                                ifelse(marital == 4, "PrevMarried",
                                ifelse(marital == 5, "0NevMarried", NA))))))
#dummy variable married
mydata$married<-Recode(mydata$marital, recodes="1='married'; 2:5='0no'; else=NA")

#recode education
mydata <- mydata %>%
  mutate(educlvl = ifelse(educag == 1, "lessHS", 
                                ifelse(educag == 2, "HS",
                                ifelse(educag == 3, "Some Col",
                                ifelse(educag == 4, "Col Grad", NA)))))

#recode laborforce etc
mydata$laborforce<-Recode(mydata$employ1, recodes="1:4='labforce'; 5:8='0no'; else=NA", as.factor=T)

#recode employment
mydata$employed<-Recode(mydata$employ1, recodes="1:2=1; 3:4=0; else=NA")
mydata$employyes<-Recode(mydata$employ1, recodes="1:2='employed'; 3:4='0no';  else=NA")

#recode income
mydata$inc<-Recode(mydata$income2, recodes="1:3=0; 4:5=1; 6=2; 7=3; 8=4; else=NA")
mydata$incgps<-(Recode(mydata$income2, recodes="1:3='0_<20k'; 4:5='20-34'; 6='35-49'; 7='50-74'; 8='75plus'; else=NA", as.factor=T))

#dummy variable education
mydata$col <- Recode(mydata$educag, recodes="1:2='0LessCol'; 3:4='Col'; else=NA", as.factor=T)

#recode the number of hours of sleep to binary
mydata$shortsleep<-Recode(mydata$sleptim1, recodes ="1:6=1; 6:24=0; else=NA")

#recode unintentionally fell asleep during the day
mydata$unintentslep<-Recode(mydata$slepday1, recodes ="1:14=1; 88=0; else=NA")

#difficulty concentrating or remembering
mydata$brainf<-Recode(mydata$decide, recodes ="1=1; 2=0; else=NA", as.factor=T)

#recode mental health
mydata$mentpoor<-Recode(mydata$menthlth, recodes ="1:30=1; 88=0; else=NA", as.factor=T)

#recode poor health (either mental or physcial)
mydata$poorhlthnew<-Recode(mydata$poorhlth, recodes ="1:30=1; 88=0; else=NA")

#recode insurance
mydata$ins<-Recode(mydata$hlthpln1, recodes="1=1; 2=0; else=NA")
#mydata$instype<-Recode(mydata$hlthcvr1, recodes="1:2=1; 3:4=2; 5=3; 6:7=4; 8=0; else=NA")
mydata$hlthinscov<-Recode(mydata$hlthpln1, recodes="1='hlthins'; 2='0_no'; else=NA", as.factor=T)
mydata$hlthinstype<-Recode(mydata$hlthcvr1, recodes="1:2='employer/fam'; 3:4='govsub'; 5='mil'; 6='nat'; 7='other'; 8='0_no'; else=NA", as.factor=T)

#recode last checkup
mydata <- mydata %>%
  mutate(recchecklvl = ifelse(checkup1 == 1, "<1 yr", 
                                ifelse(checkup1 == 2, "<2 yr",
                                ifelse(checkup1 == 3, "<5 yr",
                                ifelse(checkup1 == 4, "5+", NA)))))

mydata$histress<-Recode(mydata$sdhstres, recodes="1:3=0; 4:5=1; else=NA")

#dummy variable recent checkup (within 1 year)
mydata$reccheck<-Recode(mydata$checkup1, recodes ="1='recent'; 2:4='0no'; else=NA")

#outlierReplace(my_data, "brnk3ge5", which(my_data$drnk3ge5 <70), NA)

#recode binge drinking
mydata$bingedrink<-Recode(mydata$drnk3ge5, recodes= "88=0; 77=NA; 99=NA")
mydata$bingedrinkyes<-Recode(mydata$drnk3ge5, recodes="88=0; 1:76=1; else=NA", as.factor=T)

#recode every smoked
mydata$smoke<-Recode(mydata$smoker3, recodes= "1:2=1; 3:4=0; else=NA")

#mydata$cancertypes<-Recode(mydata$cncrdiff, recodes="7=NA; 9=NA")

#recode select illness
#mydata$hrdx <- Recode(mydata$crgvprb2, recode="8=1; 1:7=0; 9:76=0; else=NA")
#mydata$subst <- Recode(mydata$crgvprb2, recode="12=1; 1:11=0; 13:76=0; else=NA")
#mydata$injur <- Recode(mydata$crgvprb2, recode="13=1; 1:12=0; 14:76=0; else=NA")
mydata$mentanx <- Recode(mydata$crgvprb2, recode="10=1; 1:9=0; 11:76=0; else=NA")

#summary(mydata$agegps)summary(mydata$agegpnam)
#summary(mydata$inc)
#summary(mydata$incgps)
#summary(mydata$ins)
#summary(mydata$hlthinscov)
#summary(mydata$employed)
#summary(mydata$employyes)
#summary(mydata$shortsleep)
#summary(mydata$sleptim1)
#summary(mydata$mentpoor)
#summary(mydata$brainf)
#summary(mydata$histress)

library(mice)

## Loading required package: lattice

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

summary(mydata[,c("agegps", "agegpnam", "inc", "incgps", "ins", "hlthinscov", "educlvl", "col", "laborforce", "employed", "employyes", "shortsleep", "sleptim1", "mentpoor", "brainf", "histress", "bingedrink", "bingedrinkyes")])

##      agegps         agegpnam           inc           incgps      
##  Min.   :0.000   0_18-24: 26233   Min.   :0.00    0_<20k: 65415  
##  1st Qu.:2.000   25-34  : 47187   1st Qu.:1.00    20-34 : 73973  
##  Median :3.000   35-44  : 51597   Median :2.00    35-49 : 53148  
##  Mean   :3.106   45-59  :112407   Mean   :2.27    50-74 : 59632  
##  3rd Qu.:4.000   60-74  :141393   3rd Qu.:4.00    75plus:122763  
##  Max.   :5.000   75plus : 65098   Max.   :4.00    NA's  : 75085  
##  NA's   :6101    NA's   :  6101   NA's   :75085                  
##       ins           hlthinscov       educlvl                col        
##  Min.   :0.0000   0_no   : 35743   Length:450016      0LessCol:155264  
##  1st Qu.:1.0000   hlthins:412502   Class :character   Col     :293045  
##  Median :1.0000   NA's   :  1771   Mode  :character   NA's    :  1707  
##  Mean   :0.9203                                                        
##  3rd Qu.:1.0000                                                        
##  Max.   :1.0000                                                        
##  NA's   :1771                                                          
##     laborforce        employed       employyes           shortsleep    
##  0no     :202911   Min.   :0.00     Length:450016      Min.   :0.0     
##  labforce:243399   1st Qu.:1.00     Class :character   1st Qu.:0.0     
##  NA's    :  3706   Median :1.00     Mode  :character   Median :0.0     
##                    Mean   :0.92                        Mean   :0.3     
##                    3rd Qu.:1.00                        3rd Qu.:1.0     
##                    Max.   :1.00                        Max.   :1.0     
##                    NA's   :206617                      NA's   :400651  
##     sleptim1      mentpoor       brainf          histress     
##  Min.   : 1       0   :300134   0   :388070   Min.   :0.0     
##  1st Qu.: 6       1   :142679   1   : 46069   1st Qu.:0.0     
##  Median : 7       NA's:  7203   NA's: 15877   Median :0.0     
##  Mean   : 8                                   Mean   :0.1     
##  3rd Qu.: 8                                   3rd Qu.:0.0     
##  Max.   :99                                   Max.   :1.0     
##  NA's   :400047                               NA's   :362847  
##    bingedrink     bingedrinkyes
##  Min.   : 0.00    0   :161227  
##  1st Qu.: 0.00    1   : 55995  
##  Median : 0.00    NA's:232794  
##  Mean   : 1.14                 
##  3rd Qu.: 1.00                 
##  Max.   :76.00                 
##  NA's   :232794

From the summary outputs we see that ‘inc’ is the variable with about 16.7% (75085/450016) missing data. Age (‘agegps’) and health insurance coverage (‘hlthinscov’) have lower percentages of missing data (1.4%, 6101/450016 and 0.4%, 1771/400651; respectively).

The variables ‘col’ for at least college education and ‘laborforce’ are on the lower end of missing data (n=151 and n=362 counts of missing, respectively).

The variables ‘mentpoor’ has 1.6%, ‘brainf’ has 3.5%, and ‘bingedrink’ has about 5% missing data.

The following have a high level of missingness (>80%) and so will not be imputed: ‘shortsleep’, ‘sleptim1’, and ‘histress’.

Modal Imputation

This is an imputation technique for categorical data.

Income Groups

table(mydata$incgps)

## 
## 0_<20k  20-34  35-49  50-74 75plus 
##  65415  73973  53148  59632 122763

mcv.incgps<-factor(names(which.max(table(mydata$incgps))), levels=levels(as.factor(mydata$incgps)))

mcv.incgps

## [1] 75plus
## Levels: 0_<20k 20-34 35-49 50-74 75plus

The greatest proportion of the dataset falls into the “75plus” income category.

mydata$incgps.imp<-as.factor(ifelse(is.na(mydata$incgps)==T, mcv.incgps, mydata$incgps))
levels(mydata$incgps.imp)<-levels(mydata$incgps)

prop.table(table(mydata$incgps))

## 
##    0_<20k     20-34     35-49     50-74    75plus 
## 0.1744721 0.1972976 0.1417541 0.1590479 0.3274282

prop.table(table(mydata$incgps.imp))

## 
##    0_<20k     20-34     35-49     50-74    75plus 
## 0.1453615 0.1643786 0.1181025 0.1325108 0.4396466

barplot(prop.table(table(mydata$incgps)), main="Original Data", ylim=c(0, 0.6))

barplot(prop.table(table(mydata$incgps.imp)), main="Imputed Data", ylim=c(0, 0.6))

We can compare the proportions between the original dataset and the dataset with imputations via table and plot formats. The proportions are similar, but differ slightly numerically. However, from the bar plot we see that the distribution of proportions is equivocal.

Age Groups

table(mydata$agegpnam)

## 
## 0_18-24   25-34   35-44   45-59   60-74  75plus 
##   26233   47187   51597  112407  141393   65098

mcv.agegpnam<-factor(names(which.max(table(mydata$agegpnam))),levels=levels(as.factor(mydata$agegpnam)))

mcv.agegpnam

## [1] 60-74
## Levels: 0_18-24 25-34 35-44 45-59 60-74 75plus

The greatest proportion of the dataset falls into the “60-74” age category.

mydata$agegpnam.imp<-as.factor(ifelse(is.na(mydata$agegpnam)==T, mcv.agegpnam, mydata$agegpnam))
levels(mydata$agegpnam.imp)<-levels(mydata$agegpnam)

prop.table(table(mydata$agegpnam))

## 
##    0_18-24      25-34      35-44      45-59      60-74     75plus 
## 0.05909465 0.10629738 0.11623171 0.25321740 0.31851368 0.14664519

prop.table(table(mydata$agegpnam.imp))

## 
##    0_18-24      25-34      35-44      45-59      60-74     75plus 
## 0.05829348 0.10485627 0.11465592 0.24978445 0.32775279 0.14465708

barplot(prop.table(table(mydata$agegpnam)), main="Original Data", ylim=c(0, 0.6))

barplot(prop.table(table(mydata$agegpnam.imp)), main="Imputed Data", ylim=c(0, 0.6))

Education of college level or more

table(mydata$col)

## 
## 0LessCol      Col 
##   155264   293045

mcv.col<-factor(names(which.max(table(mydata$col))), levels=levels(as.factor(mydata$col)))
mcv.col

## [1] Col
## Levels: 0LessCol Col

The greatest proportion of the dataset falls into the “0” category, or the category of having ‘no trouble concentrating’.

mydata$col.imp<-as.factor(ifelse(is.na(mydata$col)==T, mcv.col, mydata$col))
levels(mydata$col.imp)<-levels(mydata$col)

prop.table(table(mydata$col))

## 
##  0LessCol       Col 
## 0.3463326 0.6536674

prop.table(table(mydata$col.imp))

## 
##  0LessCol       Col 
## 0.3450188 0.6549812

barplot(prop.table(table(mydata$col)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(mydata$col.imp)), main="Imputed Data", ylim=c(0, 1))

Labor Force Participation

table(mydata$laborforce)

## 
##      0no labforce 
##   202911   243399

mcv.laborforce<-factor(names(which.max(table(mydata$laborforce))), levels=levels(as.factor(mydata$laborforce)))

mcv.laborforce

## [1] labforce
## Levels: 0no labforce

The greatest proportion of the dataset falls into the “75plus” income category.

mydata$laborforce.imp<-as.factor(ifelse(is.na(mydata$laborforce)==T, mcv.laborforce, mydata$laborforce))
levels(mydata$laborforce.imp)<-levels(mydata$laborforce)

prop.table(table(mydata$laborforce))

## 
##       0no  labforce 
## 0.4546414 0.5453586

prop.table(table(mydata$laborforce.imp))

## 
##       0no  labforce 
## 0.4508973 0.5491027

barplot(prop.table(table(mydata$laborforce)), main="Original Data", ylim=c(0, 0.6))

barplot(prop.table(table(mydata$laborforce.imp)), main="Imputed Data", ylim=c(0, 0.6))

Poor Mental Health

table(mydata$mentpoor)

## 
##      0      1 
## 300134 142679

mcv.mentpoor<-factor(names(which.max(table(mydata$mentpoor))), levels=levels(as.factor(mydata$mentpoor)))

mcv.mentpoor

## [1] 0
## Levels: 0 1

mydata$mentpoor.imp<-as.factor(ifelse(is.na(mydata$mentpoor)==T, mcv.mentpoor, mydata$mentpoor))
levels(mydata$mentpoor.imp)<-levels(mydata$mentpoor)

prop.table(table(mydata$mentpoor))

## 
##         0         1 
## 0.6777895 0.3222105

prop.table(table(mydata$mentpoor.imp))

## 
##         0         1 
## 0.6829468 0.3170532

barplot(prop.table(table(mydata$mentpoor)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(mydata$mentpoor.imp)), main="Imputed Data", ylim=c(0, 1))

Binge Drinking

table(mydata$bingedrinkyes)

## 
##      0      1 
## 161227  55995

mcv.bingedrinkyes<-factor(names(which.max(table(mydata$bingedrinkyes))), levels=levels(as.factor(mydata$bingedrinkyes)))

mcv.bingedrinkyes

## [1] 0
## Levels: 0 1

mydata$bingedrinkyes.imp<-as.factor(ifelse(is.na(mydata$bingedrinkyes)==T, mcv.bingedrinkyes, mydata$bingedrinkyes))
levels(mydata$bingedrinkyes.imp)<-levels(mydata$bingedrinkyes)

prop.table(table(mydata$bingedrinkyes))

## 
##         0         1 
## 0.7422222 0.2577778

prop.table(table(mydata$bingedrinkyes.imp))

## 
##         0         1 
## 0.8755711 0.1244289

barplot(prop.table(table(mydata$bingedrinkyes)), main="Original Data", ylim=c(0, 1))

barplot(prop.table(table(mydata$bingedrinkyes.imp)), main="Imputed Data", ylim=c(0, 1))

Multiple Imputation

This is a means of imputing multiple values at once and maintain original distribution dynamics.

First explore the variabous patterns of missingness

md.pattern(mydata[,c("agegpnam", "incgps", "col", "laborforce", "mentpoor")])

##         col laborforce agegpnam mentpoor incgps      
## 365602    1          1        1        1      1     0
## 67383     1          1        1        1      0     1
## 4513      1          1        1        0      1     1
## 2227      1          1        1        0      0     2
## 2365      1          1        0        1      1     1
## 2674      1          1        0        1      0     2
## 72        1          1        0        0      1     2
## 105       1          1        0        0      0     3
## 1654      1          0        1        1      1     1
## 1203      1          0        1        1      0     2
## 77        1          0        1        0      1     2
## 77        1          0        1        0      0     3
## 52        1          0        0        1      1     2
## 284       1          0        0        1      0     3
## 2         1          0        0        0      1     3
## 19        1          0        0        0      0     4
## 452       0          1        1        1      1     1
## 504       0          1        1        1      0     2
## 30        0          1        1        0      1     2
## 38        0          1        1        0      0     3
## 67        0          1        0        1      1     2
## 262       0          1        0        1      0     3
## 2         0          1        0        0      1     3
## 14        0          1        0        0      0     4
## 31        0          0        1        1      1     2
## 111       0          0        1        1      0     3
## 4         0          0        1        0      1     3
## 9         0          0        1        0      0     4
## 8         0          0        0        1      1     3
## 161       0          0        0        1      0     4
## 14        0          0        0        0      0     5
##        1707       3706     6101     7203  75085 93802

It appears as though we have 36538 with all of the data and 7690 (6773+473+243+163+38) have at least one data value missing in one of the 5 variables.

md.pattern(mydata[,c("agegpnam", "incgps", "col", "laborforce", "mentpoor", "bingedrinkyes")])

##         col laborforce agegpnam mentpoor incgps bingedrinkyes       
## 188005    1          1        1        1      1             1      0
## 177597    1          1        1        1      1             0      1
## 23862     1          1        1        1      0             1      1
## 43521     1          1        1        1      0             0      2
## 1552      1          1        1        0      1             1      1
## 2961      1          1        1        0      1             0      2
## 491       1          1        1        0      0             1      2
## 1736      1          1        1        0      0             0      3
## 941       1          1        0        1      1             1      1
## 1424      1          1        0        1      1             0      2
## 851       1          1        0        1      0             1      2
## 1823      1          1        0        1      0             0      3
## 15        1          1        0        0      1             1      2
## 57        1          1        0        0      1             0      3
## 22        1          1        0        0      0             1      3
## 83        1          1        0        0      0             0      4
## 692       1          0        1        1      1             1      1
## 962       1          0        1        1      1             0      2
## 267       1          0        1        1      0             1      2
## 936       1          0        1        1      0             0      3
## 15        1          0        1        0      1             1      2
## 62        1          0        1        0      1             0      3
## 14        1          0        1        0      0             1      3
## 63        1          0        1        0      0             0      4
## 16        1          0        0        1      1             1      2
## 36        1          0        0        1      1             0      3
## 54        1          0        0        1      0             1      3
## 230       1          0        0        1      0             0      4
## 2         1          0        0        0      1             0      4
## 2         1          0        0        0      0             1      4
## 17        1          0        0        0      0             0      5
## 161       0          1        1        1      1             1      1
## 291       0          1        1        1      1             0      2
## 117       0          1        1        1      0             1      2
## 387       0          1        1        1      0             0      3
## 5         0          1        1        0      1             1      2
## 25        0          1        1        0      1             0      3
## 5         0          1        1        0      0             1      3
## 33        0          1        1        0      0             0      4
## 19        0          1        0        1      1             1      2
## 48        0          1        0        1      1             0      3
## 67        0          1        0        1      0             1      3
## 195       0          1        0        1      0             0      4
## 1         0          1        0        0      1             1      3
## 1         0          1        0        0      1             0      4
## 1         0          1        0        0      0             1      4
## 13        0          1        0        0      0             0      5
## 5         0          0        1        1      1             1      2
## 26        0          0        1        1      1             0      3
## 18        0          0        1        1      0             1      3
## 93        0          0        1        1      0             0      4
## 1         0          0        1        0      1             1      3
## 3         0          0        1        0      1             0      4
## 9         0          0        1        0      0             0      5
## 3         0          0        0        1      1             1      3
## 5         0          0        0        1      1             0      4
## 20        0          0        0        1      0             1      4
## 141       0          0        0        1      0             0      5
## 14        0          0        0        0      0             0      6
##        1707       3706     6101     7203  75085        232794 326596

It appears as though we have 18686 with all value when we include ‘bingedrinkyes’ variable.

Exploring the number of pairs of variables missing together rr-both variables observed rm-first observed, second missing mr-first missing, second observed mm-both missing

md.pairs(mydata[, c("agegpnam", "incgps", "col", "laborforce", "mentpoor", "bingedrinkyes")])

## $rr
##               agegpnam incgps    col laborforce mentpoor bingedrinkyes
## agegpnam        443915 372363 442736     440749   436940        215210
## incgps          372363 374931 374337     373103   370231        191431
## col             442736 374337 448309     444941   441217        216799
## laborforce      440749 373103 444941     446310   439309        216115
## mentpoor        436940 370231 441217     439309   442813        215098
## bingedrinkyes   215210 191431 216799     216115   215098        217222
## 
## $rm
##               agegpnam incgps  col laborforce mentpoor bingedrinkyes
## agegpnam             0  71552 1179       3166     6975        228705
## incgps            2568      0  594       1828     4700        183500
## col               5573  73972    0       3368     7092        231510
## laborforce        5561  73207 1369          0     7001        230195
## mentpoor          5873  72582 1596       3504        0        227715
## bingedrinkyes     2012  25791  423       1107     2124             0
## 
## $mr
##               agegpnam incgps    col laborforce mentpoor bingedrinkyes
## agegpnam             0   2568   5573       5561     5873          2012
## incgps           71552      0  73972      73207    72582         25791
## col               1179    594      0       1369     1596           423
## laborforce        3166   1828   3368          0     3504          1107
## mentpoor          6975   4700   7092       7001        0          2124
## bingedrinkyes   228705 183500 231510     230195   227715             0
## 
## $mm
##               agegpnam incgps  col laborforce mentpoor bingedrinkyes
## agegpnam          6101   3533  528        540      228          4089
## incgps            3533  75085 1113       1878     2503         49294
## col                528   1113 1707        338      111          1284
## laborforce         540   1878  338       3706      202          2599
## mentpoor           228   2503  111        202     7203          5079
## bingedrinkyes     4089  49294 1284       2599     5079        232794

Basic Imputation

#try a small subset
samp<-sample(1:dim(mydata)[1], size=45000)
mydata<-mydata[samp,]

basicimp<-mice(data=mydata[, c("agegpnam", "incgps", "col", "laborforce", "mentpoor", "bingedrinkyes")], seed=22, m=5) #'m=5' is for 5 productions, however 15-20 is standard

## 
##  iter imp variable
##   1   1  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   1   2  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   1   3  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   1   4  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   1   5  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   2   1  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   2   2  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   2   3  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   2   4  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   2   5  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   3   1  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   3   2  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   3   3  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   3   4  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   3   5  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   4   1  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   4   2  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   4   3  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   4   4  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   4   5  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   5   1  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   5   2  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   5   3  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   5   4  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes
##   5   5  agegpnam  incgps  col  laborforce  mentpoor  bingedrinkyes

Output to represent that 5 iterations of imputing 5 variables.

print(basicimp)

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##      agegpnam        incgps           col    laborforce      mentpoor 
##     "polyreg"     "polyreg"      "logreg"      "logreg"      "logreg" 
## bingedrinkyes 
##      "logreg" 
## PredictorMatrix:
##               agegpnam incgps col laborforce mentpoor bingedrinkyes
## agegpnam             0      1   1          1        1             1
## incgps               1      0   1          1        1             1
## col                  1      1   0          1        1             1
## laborforce           1      1   1          0        1             1
## mentpoor             1      1   1          1        0             1
## bingedrinkyes        1      1   1          1        1             0

Below each variable name details the meains in which imputation was conducted. Based on the variable type, a default was selected.

Imputed Age Groups Variable

summary(basicimp$imp$agegpnam)

##        1             2             3             4             5      
##  0_18-24: 34   0_18-24: 28   0_18-24: 37   0_18-24: 39   0_18-24: 31  
##  25-34  : 73   25-34  : 73   25-34  : 57   25-34  : 69   25-34  : 68  
##  35-44  : 79   35-44  : 80   35-44  : 86   35-44  : 78   35-44  : 71  
##  45-59  :167   45-59  :177   45-59  :150   45-59  :163   45-59  :170  
##  60-74  :193   60-74  :192   60-74  :224   60-74  :195   60-74  :199  
##  75plus : 82   75plus : 78   75plus : 74   75plus : 84   75plus : 89

Imputed Income Groups Variable

summary(basicimp$imp$incgps)

##       1             2             3             4             5       
##  0_<20k:1544   0_<20k:1476   0_<20k:1566   0_<20k:1585   0_<20k:1584  
##  20-34 :1746   20-34 :1697   20-34 :1701   20-34 :1683   20-34 :1651  
##  35-49 :1140   35-49 :1146   35-49 :1145   35-49 :1133   35-49 :1122  
##  50-74 :1104   50-74 :1186   50-74 :1090   50-74 :1111   50-74 :1166  
##  75plus:2062   75plus:2091   75plus:2094   75plus:2084   75plus:2073

Imputed College Education Variable

summary(basicimp$imp$col)

##         1             2             3             4              5     
##  0LessCol:57   0LessCol:56   0LessCol:58   0LessCol: 51   0LessCol:62  
##  Col     :96   Col     :97   Col     :95   Col     :102   Col     :91

Imputed Labor Force Participation Variable

summary(basicimp$imp$laborforce)

##         1              2              3              4      
##  0no     :159   0no     :141   0no     :144   0no     :151  
##  labforce:195   labforce:213   labforce:210   labforce:203  
##         5      
##  0no     :146  
##  labforce:208

Imputed Poor Mental Health Variable

summary(basicimp$imp$mentpoor)

##  1       2       3       4       5      
##  0:442   0:460   0:465   0:477   0:466  
##  1:263   1:245   1:240   1:228   1:239

Imputed Binge Drinking variable

summary(basicimp$imp$bingedrinkyes)

##  1         2         3         4         5        
##  0:17787   0:17634   0:17728   0:17738   0:17588  
##  1: 5529   1: 5682   1: 5588   1: 5578   1: 5728

library(lattice)
stripplot(basicimp, agegpnam~bingedrinkyes|.imp, pch=20)

dat.basicimp <- complete(basicimp, action = 1)
head(dat.basicimp, n=10)

##    agegpnam incgps      col laborforce mentpoor bingedrinkyes
## 1     60-74 0_<20k 0LessCol   labforce        0             0
## 2   0_18-24 0_<20k 0LessCol   labforce        0             0
## 3     45-59  20-34 0LessCol        0no        1             0
## 4     60-74 0_<20k      Col        0no        0             0
## 5     60-74 75plus      Col   labforce        0             0
## 6     45-59 0_<20k      Col   labforce        0             0
## 7     35-44 75plus      Col   labforce        0             0
## 8     60-74 0_<20k 0LessCol        0no        0             0
## 9     60-74 75plus      Col        0no        0             0
## 10    35-44 75plus      Col   labforce        0             0

library(foreign)
fit.multimp<-with(data=basicimp, expr=glm(bingedrinkyes~incgps+agegpnam+col+laborforce+mentpoor, family = "binomial"))
fit.multimp

## call :
## with.mids(data = basicimp, expr = glm(bingedrinkyes ~ incgps + 
##     agegpnam + col + laborforce + mentpoor, family = "binomial"))
## 
## call1 :
## mice(data = mydata[, c("agegpnam", "incgps", "col", "laborforce", 
##     "mentpoor", "bingedrinkyes")], m = 5, seed = 22)
## 
## nmis :
##      agegpnam        incgps           col    laborforce      mentpoor 
##           628          7596           153           354           705 
## bingedrinkyes 
##         23316 
## 
## analyses :
## [[1]]
## 
## Call:  glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce + 
##     mentpoor, family = "binomial")
## 
## Coefficients:
##        (Intercept)         incgps20-34         incgps35-49  
##            0.02173            -0.15656            -0.02357  
##        incgps50-74        incgps75plus       agegpnam25-34  
##           -0.17484            -0.11457            -0.18588  
##      agegpnam35-44       agegpnam45-59       agegpnam60-74  
##           -0.62756            -0.96651            -1.36750  
##     agegpnam75plus              colCol  laborforcelabforce  
##           -2.38040            -0.45900             0.29863  
##          mentpoor1  
##            0.23021  
## 
## Degrees of Freedom: 44999 Total (i.e. Null);  44987 Residual
## Null Deviance:       50370 
## Residual Deviance: 46040     AIC: 46070
## 
## [[2]]
## 
## Call:  glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce + 
##     mentpoor, family = "binomial")
## 
## Coefficients:
##        (Intercept)         incgps20-34         incgps35-49  
##             0.1531             -0.1915             -0.1483  
##        incgps50-74        incgps75plus       agegpnam25-34  
##            -0.1147             -0.1319             -0.3123  
##      agegpnam35-44       agegpnam45-59       agegpnam60-74  
##            -0.6596             -0.9879             -1.4613  
##     agegpnam75plus              colCol  laborforcelabforce  
##            -2.5423             -0.4556              0.2540  
##          mentpoor1  
##             0.2302  
## 
## Degrees of Freedom: 44999 Total (i.e. Null);  44987 Residual
## Null Deviance:       50710 
## Residual Deviance: 46250     AIC: 46280
## 
## [[3]]
## 
## Call:  glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce + 
##     mentpoor, family = "binomial")
## 
## Coefficients:
##        (Intercept)         incgps20-34         incgps35-49  
##            0.07379            -0.15533            -0.11049  
##        incgps50-74        incgps75plus       agegpnam25-34  
##           -0.19696            -0.14567            -0.15934  
##      agegpnam35-44       agegpnam45-59       agegpnam60-74  
##           -0.56037            -0.95935            -1.41385  
##     agegpnam75plus              colCol  laborforcelabforce  
##           -2.51435            -0.41608             0.25237  
##          mentpoor1  
##            0.19728  
## 
## Degrees of Freedom: 44999 Total (i.e. Null);  44987 Residual
## Null Deviance:       50500 
## Residual Deviance: 45970     AIC: 46000
## 
## [[4]]
## 
## Call:  glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce + 
##     mentpoor, family = "binomial")
## 
## Coefficients:
##        (Intercept)         incgps20-34         incgps35-49  
##            0.05530            -0.11266            -0.01481  
##        incgps50-74        incgps75plus       agegpnam25-34  
##           -0.10434            -0.05662            -0.25117  
##      agegpnam35-44       agegpnam45-59       agegpnam60-74  
##           -0.56465            -0.99610            -1.43889  
##     agegpnam75plus              colCol  laborforcelabforce  
##           -2.43277            -0.45459             0.25530  
##          mentpoor1  
##            0.20273  
## 
## Degrees of Freedom: 44999 Total (i.e. Null);  44987 Residual
## Null Deviance:       50480 
## Residual Deviance: 46130     AIC: 46160
## 
## [[5]]
## 
## Call:  glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce + 
##     mentpoor, family = "binomial")
## 
## Coefficients:
##        (Intercept)         incgps20-34         incgps35-49  
##            0.10436            -0.13558            -0.08557  
##        incgps50-74        incgps75plus       agegpnam25-34  
##           -0.12424            -0.12861            -0.25072  
##      agegpnam35-44       agegpnam45-59       agegpnam60-74  
##           -0.55230            -0.92607            -1.43011  
##     agegpnam75plus              colCol  laborforcelabforce  
##           -2.57922            -0.47759             0.25262  
##          mentpoor1  
##            0.23096  
## 
## Degrees of Freedom: 44999 Total (i.e. Null);  44987 Residual
## Null Deviance:       50810 
## Residual Deviance: 46150     AIC: 46180

Pooling of Data

est.p<-pool(fit.multimp)
print(est.p)

## Class: mipo    m = 5 
##                       estimate         ubar            b            t
## (Intercept)         0.08165616 0.0024927533 0.0024920980 0.0054832709
## incgps20-34        -0.15033157 0.0014129811 0.0008492525 0.0024320841
## incgps35-49        -0.07655175 0.0017078553 0.0032505608 0.0056085282
## incgps50-74        -0.14300621 0.0016977027 0.0016438100 0.0036702747
## incgps75plus       -0.11547407 0.0013465362 0.0012044799 0.0027919121
## agegpnam25-34      -0.23187458 0.0024673799 0.0036416353 0.0068373423
## agegpnam35-44      -0.59289665 0.0024728579 0.0022890627 0.0052197331
## agegpnam45-59      -0.96718026 0.0020454074 0.0007541394 0.0029503747
## agegpnam60-74      -1.42232731 0.0021386486 0.0012325356 0.0036176912
## agegpnam75plus     -2.48981795 0.0043774153 0.0066448946 0.0123512888
## colCol             -0.45257777 0.0006548276 0.0005030718 0.0012585138
## laborforcelabforce  0.26258277 0.0007984065 0.0004073323 0.0012872053
## mentpoor1           0.21827576 0.0005916406 0.0002820488 0.0009300992
##                    dfcom        df       riv    lambda       fmi
## (Intercept)        44987 13.438821 1.1996846 0.5453894 0.6006988
## incgps20-34        44987 22.761592 0.7212432 0.4190246 0.4641286
## incgps35-49        44987  8.264501 2.2839599 0.6954896 0.7495551
## incgps50-74        44987 13.838928 1.1619066 0.5374453 0.5923841
## incgps75plus       44987 14.914311 1.0734030 0.5177011 0.5715462
## agegpnam25-34      44987  9.786273 1.7710943 0.6391317 0.6955779
## agegpnam35-44      44987 14.433926 1.1108100 0.5262482 0.5805965
## agegpnam45-59      44987 42.457749 0.4424386 0.3067296 0.3372313
## agegpnam60-74      44987 23.909525 0.6915782 0.4088361 0.4527732
## agegpnam75plus     44987  9.591455 1.8215940 0.6455904 0.7018841
## colCol             44987 17.371250 0.9219009 0.4796818 0.5307654
## laborforcelabforce 44987 27.711744 0.6122179 0.3797365 0.4201291
## mentpoor1          44987 30.175139 0.5720678 0.3638951 0.4022434

summary(est.p)

##                       estimate  std.error  statistic        df
## (Intercept)         0.08165616 0.07404911   1.102730 13.438821
## incgps20-34        -0.15033157 0.04931616  -3.048322 22.761592
## incgps35-49        -0.07655175 0.07489011  -1.022188  8.264501
## incgps50-74        -0.14300621 0.06058279  -2.360509 13.838928
## incgps75plus       -0.11547407 0.05283855  -2.185413 14.914311
## agegpnam25-34      -0.23187458 0.08268822  -2.804203  9.786273
## agegpnam35-44      -0.59289665 0.07224772  -8.206441 14.433926
## agegpnam45-59      -0.96718026 0.05431735 -17.806101 42.457749
## agegpnam60-74      -1.42232731 0.06014725 -23.647422 23.909525
## agegpnam75plus     -2.48981795 0.11113635 -22.403272  9.591455
## colCol             -0.45257777 0.03547554 -12.757460 17.371250
## laborforcelabforce  0.26258277 0.03587764   7.318841 27.711744
## mentpoor1           0.21827576 0.03049753   7.157162 30.175139
##                         p.value
## (Intercept)        2.763560e-01
## incgps20-34        3.950773e-03
## incgps35-49        3.124812e-01
## incgps50-74        2.291602e-02
## incgps75plus       3.442476e-02
## agegpnam25-34      7.575694e-03
## agegpnam35-44      2.662626e-10
## agegpnam45-59      0.000000e+00
## agegpnam60-74      0.000000e+00
## agegpnam75plus     0.000000e+00
## colCol             4.440892e-16
## laborforcelabforce 4.795038e-09
## mentpoor1          8.171652e-09

Regression without any imputation

fit1<-glm(bingedrinkyes~agegpnam+incgps+col+laborforce+mentpoor, data=mydata, family="binomial")
#fit1
summary(fit1)

## 
## Call:
## glm(formula = bingedrinkyes ~ agegpnam + incgps + col + laborforce + 
##     mentpoor, family = "binomial", data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4453  -0.8101  -0.5992   1.1374   2.4254  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.16602    0.08678   1.913 0.055718 .  
## agegpnam25-34      -0.29587    0.07645  -3.870 0.000109 ***
## agegpnam35-44      -0.62320    0.07718  -8.075 6.74e-16 ***
## agegpnam45-59      -0.99308    0.07218 -13.758  < 2e-16 ***
## agegpnam60-74      -1.45450    0.07536 -19.301  < 2e-16 ***
## agegpnam75plus     -2.48970    0.11662 -21.349  < 2e-16 ***
## incgps20-34        -0.13230    0.06868  -1.926 0.054080 .  
## incgps35-49        -0.10870    0.07074  -1.537 0.124372    
## incgps50-74        -0.13711    0.06830  -2.007 0.044708 *  
## incgps75plus       -0.14255    0.06217  -2.293 0.021843 *  
## colCol             -0.42091    0.04119 -10.220  < 2e-16 ***
## laborforcelabforce  0.22470    0.04649   4.833 1.34e-06 ***
## mentpoor1           0.22007    0.03680   5.980 2.24e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 21640  on 18668  degrees of freedom
## Residual deviance: 19972  on 18656  degrees of freedom
##   (26331 observations deleted due to missingness)
## AIC: 19998
## 
## Number of Fisher Scoring iterations: 5

Regression with modal imputation

fit1.modimp <- glm(bingedrinkyes.imp~agegpnam.imp+incgps.imp+col.imp+laborforce.imp+mentpoor.imp, data = mydata, family="binomial")
summary(fit1.modimp)

## 
## Call:
## glm(formula = bingedrinkyes.imp ~ agegpnam.imp + incgps.imp + 
##     col.imp + laborforce.imp + mentpoor.imp, family = "binomial", 
##     data = mydata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9366  -0.5730  -0.4333  -0.2387   2.8805  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -1.79070    0.06676 -26.822  < 2e-16 ***
## agegpnam.imp25-34      -0.09473    0.05749  -1.648   0.0994 .  
## agegpnam.imp35-44      -0.48254    0.05889  -8.194 2.54e-16 ***
## agegpnam.imp45-59      -0.81303    0.05390 -15.084  < 2e-16 ***
## agegpnam.imp60-74      -1.24879    0.05670 -22.025  < 2e-16 ***
## agegpnam.imp75plus     -2.34206    0.09950 -23.539  < 2e-16 ***
## incgps.imp20-34         0.09096    0.05827   1.561   0.1185    
## incgps.imp35-49         0.28267    0.06083   4.647 3.37e-06 ***
## incgps.imp50-74         0.36518    0.05894   6.195 5.82e-10 ***
## incgps.imp75plus        0.25818    0.04997   5.167 2.38e-07 ***
## col.impCol              0.02701    0.03293   0.820   0.4121    
## laborforce.implabforce  0.52272    0.03774  13.852  < 2e-16 ***
## mentpoor.imp1           0.27905    0.03080   9.060  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 33866  on 44999  degrees of freedom
## Residual deviance: 31163  on 44987  degrees of freedom
## AIC: 31189
## 
## Number of Fisher Scoring iterations: 6

Regression with multiple imputation

fit1.multiimp <- glm(bingedrinkyes~., data = dat.basicimp, family="binomial")
summary(fit1.multiimp)

## 
## Call:
## glm(formula = bingedrinkyes ~ ., family = "binomial", data = dat.basicimp)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4183  -0.7747  -0.5893  -0.3129   2.4664  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.02173    0.04998   0.435 0.663786    
## agegpnam25-34      -0.18588    0.04969  -3.741 0.000183 ***
## agegpnam35-44      -0.62756    0.04993 -12.569  < 2e-16 ***
## agegpnam45-59      -0.96651    0.04532 -21.326  < 2e-16 ***
## agegpnam60-74      -1.36750    0.04621 -29.591  < 2e-16 ***
## agegpnam75plus     -2.38040    0.06524 -36.486  < 2e-16 ***
## incgps20-34        -0.15656    0.03759  -4.165 3.12e-05 ***
## incgps35-49        -0.02357    0.04113  -0.573 0.566584    
## incgps50-74        -0.17484    0.04146  -4.217 2.47e-05 ***
## incgps75plus       -0.11457    0.03678  -3.115 0.001840 ** 
## colCol             -0.45900    0.02561 -17.920  < 2e-16 ***
## laborforcelabforce  0.29863    0.02832  10.544  < 2e-16 ***
## mentpoor1           0.23021    0.02435   9.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50374  on 44999  degrees of freedom
## Residual deviance: 46043  on 44987  degrees of freedom
## AIC: 46069
## 
## Number of Fisher Scoring iterations: 5

Lowest AIC was when using the original dataset without any imputations. Significance levels changed from no impuation to modal imputation datasets. Income group ‘20-34’ had a significant association with ‘bingedrinkyes’ compared to reference income group ‘<20’ in the original dataset and in the multiple imputation dataset; however, that same relationship was not apparent in the dataset using modal imputation. In addition the direction of relationships in the modal imputation dataset differed from the other two. The multiple imputation datase results were more in line with that of the original dataset. In the original dataset over 26K observations were deleted to due to missingness. Therefore using multiple imputaion method yielded the best results that were akin to original without having to delete any observations.

Variation in the models for the imputed data

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(ggplot2)
#get the coefficients from each of the 5 imputations
coefs<-as.data.frame(matrix(unlist(lapply(fit.multimp$analyses, coef)), nrow=5, ncol=13, byrow=T))
names(coefs)<-names(fit.multimp$analyses[[1]]$coef)
#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results

coefs%>%
  select(`incgps20-34`:`mentpoor1`)%>%
  melt()%>%
  mutate(basicimp=rep(1:5, 12))%>%
  ggplot()+geom_point(aes(y=value,x=variable, group=variable, color=as.factor(basicimp) ))+ggtitle("Estimated Betas from Each Imputed Regression Model for Binge Drinking Outcome")+theme(axis.text.x = element_text(angle = 45, hjust = 1))

## No id variables; using all as measure variables

We note that the betas are in relative proximation of each other, with some having greater spread compared to others in which the betas greatly overlap.

While the actual imputed values may have a greater spread, overall variance is replicated and thus results parallel that of the non-imputed dataset.