library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(MASS)
mydata<-haven::read_xpt("~/Documents/1A_DEM Doctorate/DEM 7283/R/LLCP2017.xpt")
newnames<-tolower(gsub(pattern = "_", replacement = "", x= names(mydata)))
names(mydata)<-newnames
library(car)
## Loading required package: carData
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#recode age groups
mydata$agegps<-Recode(mydata$ageg5yr, recodes="1=0; 2:3=1; 4:5=2; 6:8=3; 9:11=4; 12:13=5; else=NA")
mydata$agegpnam<-Recode(mydata$ageg5yr, recodes="1='0_18-24'; 2:3='25-34'; 4:5='35-44'; 6:8='45-59'; 9:11='60-74'; 12:13='75plus'; else=NA", as.factor=T)
#recode sex variable to male or female
mydata$male<-Recode(mydata$sex, recodes="1='male'; 2='0female'; else=NA")
#recode 5 level race/ethnicity
mydata <- mydata %>%
mutate(racethlvl = ifelse(racegr3 == 1, "0nh white",
ifelse(racegr3== 2, "nh black",
ifelse(racegr3 == 3, "nh other/multi",
ifelse(racegr3 == 4, "nh other/multi",
ifelse(racegr3 == 5, "hispanic", NA))))))
#individual dummy variables
mydata$black<-Recode(mydata$racegr3, recodes="2='nh black';9=NA; else='0non black'")
mydata$white<-Recode(mydata$racegr3, recodes="1='nh white'; 9=NA; else='0non white'")
mydata$other<-Recode(mydata$racegr3, recodes="3:4='nh other/multi'; 9=NA; else='0non other/multi'")
mydata$hispanic<-Recode(mydata$racegr3, recodes="5='hispanic'; 9=NA; else='0non hispanic'")
#recode marital status
mydata <- mydata %>%
mutate(maritallvl = ifelse(marital == 1, "Married",
ifelse(marital == 2, "PrevMarried",
ifelse(marital == 3, "PrevMarried",
ifelse(marital == 4, "PrevMarried",
ifelse(marital == 5, "0NevMarried", NA))))))
#dummy variable married
mydata$married<-Recode(mydata$marital, recodes="1='married'; 2:5='0no'; else=NA")
#recode education
mydata <- mydata %>%
mutate(educlvl = ifelse(educag == 1, "lessHS",
ifelse(educag == 2, "HS",
ifelse(educag == 3, "Some Col",
ifelse(educag == 4, "Col Grad", NA)))))
#recode laborforce etc
mydata$laborforce<-Recode(mydata$employ1, recodes="1:4='labforce'; 5:8='0no'; else=NA", as.factor=T)
#recode employment
mydata$employed<-Recode(mydata$employ1, recodes="1:2=1; 3:4=0; else=NA")
mydata$employyes<-Recode(mydata$employ1, recodes="1:2='employed'; 3:4='0no'; else=NA")
#recode income
mydata$inc<-Recode(mydata$income2, recodes="1:3=0; 4:5=1; 6=2; 7=3; 8=4; else=NA")
mydata$incgps<-(Recode(mydata$income2, recodes="1:3='0_<20k'; 4:5='20-34'; 6='35-49'; 7='50-74'; 8='75plus'; else=NA", as.factor=T))
#dummy variable education
mydata$col <- Recode(mydata$educag, recodes="1:2='0LessCol'; 3:4='Col'; else=NA", as.factor=T)
#recode the number of hours of sleep to binary
mydata$shortsleep<-Recode(mydata$sleptim1, recodes ="1:6=1; 6:24=0; else=NA")
#recode unintentionally fell asleep during the day
mydata$unintentslep<-Recode(mydata$slepday1, recodes ="1:14=1; 88=0; else=NA")
#difficulty concentrating or remembering
mydata$brainf<-Recode(mydata$decide, recodes ="1=1; 2=0; else=NA", as.factor=T)
#recode mental health
mydata$mentpoor<-Recode(mydata$menthlth, recodes ="1:30=1; 88=0; else=NA", as.factor=T)
#recode poor health (either mental or physcial)
mydata$poorhlthnew<-Recode(mydata$poorhlth, recodes ="1:30=1; 88=0; else=NA")
#recode insurance
mydata$ins<-Recode(mydata$hlthpln1, recodes="1=1; 2=0; else=NA")
#mydata$instype<-Recode(mydata$hlthcvr1, recodes="1:2=1; 3:4=2; 5=3; 6:7=4; 8=0; else=NA")
mydata$hlthinscov<-Recode(mydata$hlthpln1, recodes="1='hlthins'; 2='0_no'; else=NA", as.factor=T)
mydata$hlthinstype<-Recode(mydata$hlthcvr1, recodes="1:2='employer/fam'; 3:4='govsub'; 5='mil'; 6='nat'; 7='other'; 8='0_no'; else=NA", as.factor=T)
#recode last checkup
mydata <- mydata %>%
mutate(recchecklvl = ifelse(checkup1 == 1, "<1 yr",
ifelse(checkup1 == 2, "<2 yr",
ifelse(checkup1 == 3, "<5 yr",
ifelse(checkup1 == 4, "5+", NA)))))
mydata$histress<-Recode(mydata$sdhstres, recodes="1:3=0; 4:5=1; else=NA")
#dummy variable recent checkup (within 1 year)
mydata$reccheck<-Recode(mydata$checkup1, recodes ="1='recent'; 2:4='0no'; else=NA")
#outlierReplace(my_data, "brnk3ge5", which(my_data$drnk3ge5 <70), NA)
#recode binge drinking
mydata$bingedrink<-Recode(mydata$drnk3ge5, recodes= "88=0; 77=NA; 99=NA")
mydata$bingedrinkyes<-Recode(mydata$drnk3ge5, recodes="88=0; 1:76=1; else=NA", as.factor=T)
#recode every smoked
mydata$smoke<-Recode(mydata$smoker3, recodes= "1:2=1; 3:4=0; else=NA")
#mydata$cancertypes<-Recode(mydata$cncrdiff, recodes="7=NA; 9=NA")
#recode select illness
#mydata$hrdx <- Recode(mydata$crgvprb2, recode="8=1; 1:7=0; 9:76=0; else=NA")
#mydata$subst <- Recode(mydata$crgvprb2, recode="12=1; 1:11=0; 13:76=0; else=NA")
#mydata$injur <- Recode(mydata$crgvprb2, recode="13=1; 1:12=0; 14:76=0; else=NA")
mydata$mentanx <- Recode(mydata$crgvprb2, recode="10=1; 1:9=0; 11:76=0; else=NA")
#summary(mydata$agegps)summary(mydata$agegpnam)
#summary(mydata$inc)
#summary(mydata$incgps)
#summary(mydata$ins)
#summary(mydata$hlthinscov)
#summary(mydata$employed)
#summary(mydata$employyes)
#summary(mydata$shortsleep)
#summary(mydata$sleptim1)
#summary(mydata$mentpoor)
#summary(mydata$brainf)
#summary(mydata$histress)
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
summary(mydata[,c("agegps", "agegpnam", "inc", "incgps", "ins", "hlthinscov", "educlvl", "col", "laborforce", "employed", "employyes", "shortsleep", "sleptim1", "mentpoor", "brainf", "histress", "bingedrink", "bingedrinkyes")])
## agegps agegpnam inc incgps
## Min. :0.000 0_18-24: 26233 Min. :0.00 0_<20k: 65415
## 1st Qu.:2.000 25-34 : 47187 1st Qu.:1.00 20-34 : 73973
## Median :3.000 35-44 : 51597 Median :2.00 35-49 : 53148
## Mean :3.106 45-59 :112407 Mean :2.27 50-74 : 59632
## 3rd Qu.:4.000 60-74 :141393 3rd Qu.:4.00 75plus:122763
## Max. :5.000 75plus : 65098 Max. :4.00 NA's : 75085
## NA's :6101 NA's : 6101 NA's :75085
## ins hlthinscov educlvl col
## Min. :0.0000 0_no : 35743 Length:450016 0LessCol:155264
## 1st Qu.:1.0000 hlthins:412502 Class :character Col :293045
## Median :1.0000 NA's : 1771 Mode :character NA's : 1707
## Mean :0.9203
## 3rd Qu.:1.0000
## Max. :1.0000
## NA's :1771
## laborforce employed employyes shortsleep
## 0no :202911 Min. :0.00 Length:450016 Min. :0.0
## labforce:243399 1st Qu.:1.00 Class :character 1st Qu.:0.0
## NA's : 3706 Median :1.00 Mode :character Median :0.0
## Mean :0.92 Mean :0.3
## 3rd Qu.:1.00 3rd Qu.:1.0
## Max. :1.00 Max. :1.0
## NA's :206617 NA's :400651
## sleptim1 mentpoor brainf histress
## Min. : 1 0 :300134 0 :388070 Min. :0.0
## 1st Qu.: 6 1 :142679 1 : 46069 1st Qu.:0.0
## Median : 7 NA's: 7203 NA's: 15877 Median :0.0
## Mean : 8 Mean :0.1
## 3rd Qu.: 8 3rd Qu.:0.0
## Max. :99 Max. :1.0
## NA's :400047 NA's :362847
## bingedrink bingedrinkyes
## Min. : 0.00 0 :161227
## 1st Qu.: 0.00 1 : 55995
## Median : 0.00 NA's:232794
## Mean : 1.14
## 3rd Qu.: 1.00
## Max. :76.00
## NA's :232794
From the summary outputs we see that ‘inc’ is the variable with about 16.7% (75085/450016) missing data. Age (‘agegps’) and health insurance coverage (‘hlthinscov’) have lower percentages of missing data (1.4%, 6101/450016 and 0.4%, 1771/400651; respectively).
The variables ‘col’ for at least college education and ‘laborforce’ are on the lower end of missing data (n=151 and n=362 counts of missing, respectively).
The variables ‘mentpoor’ has 1.6%, ‘brainf’ has 3.5%, and ‘bingedrink’ has about 5% missing data.
The following have a high level of missingness (>80%) and so will not be imputed: ‘shortsleep’, ‘sleptim1’, and ‘histress’.
This is an imputation technique for categorical data.
table(mydata$incgps)
##
## 0_<20k 20-34 35-49 50-74 75plus
## 65415 73973 53148 59632 122763
mcv.incgps<-factor(names(which.max(table(mydata$incgps))), levels=levels(as.factor(mydata$incgps)))
mcv.incgps
## [1] 75plus
## Levels: 0_<20k 20-34 35-49 50-74 75plus
The greatest proportion of the dataset falls into the “75plus” income category.
mydata$incgps.imp<-as.factor(ifelse(is.na(mydata$incgps)==T, mcv.incgps, mydata$incgps))
levels(mydata$incgps.imp)<-levels(mydata$incgps)
prop.table(table(mydata$incgps))
##
## 0_<20k 20-34 35-49 50-74 75plus
## 0.1744721 0.1972976 0.1417541 0.1590479 0.3274282
prop.table(table(mydata$incgps.imp))
##
## 0_<20k 20-34 35-49 50-74 75plus
## 0.1453615 0.1643786 0.1181025 0.1325108 0.4396466
barplot(prop.table(table(mydata$incgps)), main="Original Data", ylim=c(0, 0.6))
barplot(prop.table(table(mydata$incgps.imp)), main="Imputed Data", ylim=c(0, 0.6))
We can compare the proportions between the original dataset and the dataset with imputations via table and plot formats. The proportions are similar, but differ slightly numerically. However, from the bar plot we see that the distribution of proportions is equivocal.
table(mydata$agegpnam)
##
## 0_18-24 25-34 35-44 45-59 60-74 75plus
## 26233 47187 51597 112407 141393 65098
mcv.agegpnam<-factor(names(which.max(table(mydata$agegpnam))),levels=levels(as.factor(mydata$agegpnam)))
mcv.agegpnam
## [1] 60-74
## Levels: 0_18-24 25-34 35-44 45-59 60-74 75plus
The greatest proportion of the dataset falls into the “60-74” age category.
mydata$agegpnam.imp<-as.factor(ifelse(is.na(mydata$agegpnam)==T, mcv.agegpnam, mydata$agegpnam))
levels(mydata$agegpnam.imp)<-levels(mydata$agegpnam)
prop.table(table(mydata$agegpnam))
##
## 0_18-24 25-34 35-44 45-59 60-74 75plus
## 0.05909465 0.10629738 0.11623171 0.25321740 0.31851368 0.14664519
prop.table(table(mydata$agegpnam.imp))
##
## 0_18-24 25-34 35-44 45-59 60-74 75plus
## 0.05829348 0.10485627 0.11465592 0.24978445 0.32775279 0.14465708
barplot(prop.table(table(mydata$agegpnam)), main="Original Data", ylim=c(0, 0.6))
barplot(prop.table(table(mydata$agegpnam.imp)), main="Imputed Data", ylim=c(0, 0.6))
We can compare the proportions between the original dataset and the dataset with imputations via table and plot formats. The proportions are similar, but differ slightly numerically. However, from the bar plot we see that the distribution of proportions is equivocal. Groups “45-59” and “60-74” remain as being the greatest proportions.
table(mydata$col)
##
## 0LessCol Col
## 155264 293045
mcv.col<-factor(names(which.max(table(mydata$col))), levels=levels(as.factor(mydata$col)))
mcv.col
## [1] Col
## Levels: 0LessCol Col
The greatest proportion of the dataset falls into the “0” category, or the category of having ‘no trouble concentrating’.
mydata$col.imp<-as.factor(ifelse(is.na(mydata$col)==T, mcv.col, mydata$col))
levels(mydata$col.imp)<-levels(mydata$col)
prop.table(table(mydata$col))
##
## 0LessCol Col
## 0.3463326 0.6536674
prop.table(table(mydata$col.imp))
##
## 0LessCol Col
## 0.3450188 0.6549812
barplot(prop.table(table(mydata$col)), main="Original Data", ylim=c(0, 1))
barplot(prop.table(table(mydata$col.imp)), main="Imputed Data", ylim=c(0, 1))
We can compare the proportions between the original dataset and the dataset with imputations via table and plot formats. The proportions are similar, but differ slightly numerically. However, from the bar plot we see that the distribution of proportions is equivocal. Group “0” and “60-74” remains as being the greatest proportion.
table(mydata$laborforce)
##
## 0no labforce
## 202911 243399
mcv.laborforce<-factor(names(which.max(table(mydata$laborforce))), levels=levels(as.factor(mydata$laborforce)))
mcv.laborforce
## [1] labforce
## Levels: 0no labforce
The greatest proportion of the dataset falls into the “75plus” income category.
mydata$laborforce.imp<-as.factor(ifelse(is.na(mydata$laborforce)==T, mcv.laborforce, mydata$laborforce))
levels(mydata$laborforce.imp)<-levels(mydata$laborforce)
prop.table(table(mydata$laborforce))
##
## 0no labforce
## 0.4546414 0.5453586
prop.table(table(mydata$laborforce.imp))
##
## 0no labforce
## 0.4508973 0.5491027
barplot(prop.table(table(mydata$laborforce)), main="Original Data", ylim=c(0, 0.6))
barplot(prop.table(table(mydata$laborforce.imp)), main="Imputed Data", ylim=c(0, 0.6))
table(mydata$mentpoor)
##
## 0 1
## 300134 142679
mcv.mentpoor<-factor(names(which.max(table(mydata$mentpoor))), levels=levels(as.factor(mydata$mentpoor)))
mcv.mentpoor
## [1] 0
## Levels: 0 1
mydata$mentpoor.imp<-as.factor(ifelse(is.na(mydata$mentpoor)==T, mcv.mentpoor, mydata$mentpoor))
levels(mydata$mentpoor.imp)<-levels(mydata$mentpoor)
prop.table(table(mydata$mentpoor))
##
## 0 1
## 0.6777895 0.3222105
prop.table(table(mydata$mentpoor.imp))
##
## 0 1
## 0.6829468 0.3170532
barplot(prop.table(table(mydata$mentpoor)), main="Original Data", ylim=c(0, 1))
barplot(prop.table(table(mydata$mentpoor.imp)), main="Imputed Data", ylim=c(0, 1))
table(mydata$bingedrinkyes)
##
## 0 1
## 161227 55995
mcv.bingedrinkyes<-factor(names(which.max(table(mydata$bingedrinkyes))), levels=levels(as.factor(mydata$bingedrinkyes)))
mcv.bingedrinkyes
## [1] 0
## Levels: 0 1
mydata$bingedrinkyes.imp<-as.factor(ifelse(is.na(mydata$bingedrinkyes)==T, mcv.bingedrinkyes, mydata$bingedrinkyes))
levels(mydata$bingedrinkyes.imp)<-levels(mydata$bingedrinkyes)
prop.table(table(mydata$bingedrinkyes))
##
## 0 1
## 0.7422222 0.2577778
prop.table(table(mydata$bingedrinkyes.imp))
##
## 0 1
## 0.8755711 0.1244289
barplot(prop.table(table(mydata$bingedrinkyes)), main="Original Data", ylim=c(0, 1))
barplot(prop.table(table(mydata$bingedrinkyes.imp)), main="Imputed Data", ylim=c(0, 1))
This is a means of imputing multiple values at once and maintain original distribution dynamics.
First explore the variabous patterns of missingness
md.pattern(mydata[,c("agegpnam", "incgps", "col", "laborforce", "mentpoor")])
## col laborforce agegpnam mentpoor incgps
## 365602 1 1 1 1 1 0
## 67383 1 1 1 1 0 1
## 4513 1 1 1 0 1 1
## 2227 1 1 1 0 0 2
## 2365 1 1 0 1 1 1
## 2674 1 1 0 1 0 2
## 72 1 1 0 0 1 2
## 105 1 1 0 0 0 3
## 1654 1 0 1 1 1 1
## 1203 1 0 1 1 0 2
## 77 1 0 1 0 1 2
## 77 1 0 1 0 0 3
## 52 1 0 0 1 1 2
## 284 1 0 0 1 0 3
## 2 1 0 0 0 1 3
## 19 1 0 0 0 0 4
## 452 0 1 1 1 1 1
## 504 0 1 1 1 0 2
## 30 0 1 1 0 1 2
## 38 0 1 1 0 0 3
## 67 0 1 0 1 1 2
## 262 0 1 0 1 0 3
## 2 0 1 0 0 1 3
## 14 0 1 0 0 0 4
## 31 0 0 1 1 1 2
## 111 0 0 1 1 0 3
## 4 0 0 1 0 1 3
## 9 0 0 1 0 0 4
## 8 0 0 0 1 1 3
## 161 0 0 0 1 0 4
## 14 0 0 0 0 0 5
## 1707 3706 6101 7203 75085 93802
It appears as though we have 36538 with all of the data and 7690 (6773+473+243+163+38) have at least one data value missing in one of the 5 variables.
md.pattern(mydata[,c("agegpnam", "incgps", "col", "laborforce", "mentpoor", "bingedrinkyes")])
## col laborforce agegpnam mentpoor incgps bingedrinkyes
## 188005 1 1 1 1 1 1 0
## 177597 1 1 1 1 1 0 1
## 23862 1 1 1 1 0 1 1
## 43521 1 1 1 1 0 0 2
## 1552 1 1 1 0 1 1 1
## 2961 1 1 1 0 1 0 2
## 491 1 1 1 0 0 1 2
## 1736 1 1 1 0 0 0 3
## 941 1 1 0 1 1 1 1
## 1424 1 1 0 1 1 0 2
## 851 1 1 0 1 0 1 2
## 1823 1 1 0 1 0 0 3
## 15 1 1 0 0 1 1 2
## 57 1 1 0 0 1 0 3
## 22 1 1 0 0 0 1 3
## 83 1 1 0 0 0 0 4
## 692 1 0 1 1 1 1 1
## 962 1 0 1 1 1 0 2
## 267 1 0 1 1 0 1 2
## 936 1 0 1 1 0 0 3
## 15 1 0 1 0 1 1 2
## 62 1 0 1 0 1 0 3
## 14 1 0 1 0 0 1 3
## 63 1 0 1 0 0 0 4
## 16 1 0 0 1 1 1 2
## 36 1 0 0 1 1 0 3
## 54 1 0 0 1 0 1 3
## 230 1 0 0 1 0 0 4
## 2 1 0 0 0 1 0 4
## 2 1 0 0 0 0 1 4
## 17 1 0 0 0 0 0 5
## 161 0 1 1 1 1 1 1
## 291 0 1 1 1 1 0 2
## 117 0 1 1 1 0 1 2
## 387 0 1 1 1 0 0 3
## 5 0 1 1 0 1 1 2
## 25 0 1 1 0 1 0 3
## 5 0 1 1 0 0 1 3
## 33 0 1 1 0 0 0 4
## 19 0 1 0 1 1 1 2
## 48 0 1 0 1 1 0 3
## 67 0 1 0 1 0 1 3
## 195 0 1 0 1 0 0 4
## 1 0 1 0 0 1 1 3
## 1 0 1 0 0 1 0 4
## 1 0 1 0 0 0 1 4
## 13 0 1 0 0 0 0 5
## 5 0 0 1 1 1 1 2
## 26 0 0 1 1 1 0 3
## 18 0 0 1 1 0 1 3
## 93 0 0 1 1 0 0 4
## 1 0 0 1 0 1 1 3
## 3 0 0 1 0 1 0 4
## 9 0 0 1 0 0 0 5
## 3 0 0 0 1 1 1 3
## 5 0 0 0 1 1 0 4
## 20 0 0 0 1 0 1 4
## 141 0 0 0 1 0 0 5
## 14 0 0 0 0 0 0 6
## 1707 3706 6101 7203 75085 232794 326596
It appears as though we have 18686 with all value when we include ‘bingedrinkyes’ variable.
Exploring the number of pairs of variables missing together rr-both variables observed rm-first observed, second missing mr-first missing, second observed mm-both missing
md.pairs(mydata[, c("agegpnam", "incgps", "col", "laborforce", "mentpoor", "bingedrinkyes")])
## $rr
## agegpnam incgps col laborforce mentpoor bingedrinkyes
## agegpnam 443915 372363 442736 440749 436940 215210
## incgps 372363 374931 374337 373103 370231 191431
## col 442736 374337 448309 444941 441217 216799
## laborforce 440749 373103 444941 446310 439309 216115
## mentpoor 436940 370231 441217 439309 442813 215098
## bingedrinkyes 215210 191431 216799 216115 215098 217222
##
## $rm
## agegpnam incgps col laborforce mentpoor bingedrinkyes
## agegpnam 0 71552 1179 3166 6975 228705
## incgps 2568 0 594 1828 4700 183500
## col 5573 73972 0 3368 7092 231510
## laborforce 5561 73207 1369 0 7001 230195
## mentpoor 5873 72582 1596 3504 0 227715
## bingedrinkyes 2012 25791 423 1107 2124 0
##
## $mr
## agegpnam incgps col laborforce mentpoor bingedrinkyes
## agegpnam 0 2568 5573 5561 5873 2012
## incgps 71552 0 73972 73207 72582 25791
## col 1179 594 0 1369 1596 423
## laborforce 3166 1828 3368 0 3504 1107
## mentpoor 6975 4700 7092 7001 0 2124
## bingedrinkyes 228705 183500 231510 230195 227715 0
##
## $mm
## agegpnam incgps col laborforce mentpoor bingedrinkyes
## agegpnam 6101 3533 528 540 228 4089
## incgps 3533 75085 1113 1878 2503 49294
## col 528 1113 1707 338 111 1284
## laborforce 540 1878 338 3706 202 2599
## mentpoor 228 2503 111 202 7203 5079
## bingedrinkyes 4089 49294 1284 2599 5079 232794
#try a small subset
samp<-sample(1:dim(mydata)[1], size=45000)
mydata<-mydata[samp,]
basicimp<-mice(data=mydata[, c("agegpnam", "incgps", "col", "laborforce", "mentpoor", "bingedrinkyes")], seed=22, m=5) #'m=5' is for 5 productions, however 15-20 is standard
##
## iter imp variable
## 1 1 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 1 2 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 1 3 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 1 4 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 1 5 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 2 1 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 2 2 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 2 3 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 2 4 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 2 5 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 3 1 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 3 2 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 3 3 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 3 4 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 3 5 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 4 1 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 4 2 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 4 3 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 4 4 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 4 5 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 5 1 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 5 2 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 5 3 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 5 4 agegpnam incgps col laborforce mentpoor bingedrinkyes
## 5 5 agegpnam incgps col laborforce mentpoor bingedrinkyes
Output to represent that 5 iterations of imputing 5 variables.
print(basicimp)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## agegpnam incgps col laborforce mentpoor
## "polyreg" "polyreg" "logreg" "logreg" "logreg"
## bingedrinkyes
## "logreg"
## PredictorMatrix:
## agegpnam incgps col laborforce mentpoor bingedrinkyes
## agegpnam 0 1 1 1 1 1
## incgps 1 0 1 1 1 1
## col 1 1 0 1 1 1
## laborforce 1 1 1 0 1 1
## mentpoor 1 1 1 1 0 1
## bingedrinkyes 1 1 1 1 1 0
Below each variable name details the meains in which imputation was conducted. Based on the variable type, a default was selected.
summary(basicimp$imp$agegpnam)
## 1 2 3 4 5
## 0_18-24: 34 0_18-24: 28 0_18-24: 37 0_18-24: 39 0_18-24: 31
## 25-34 : 73 25-34 : 73 25-34 : 57 25-34 : 69 25-34 : 68
## 35-44 : 79 35-44 : 80 35-44 : 86 35-44 : 78 35-44 : 71
## 45-59 :167 45-59 :177 45-59 :150 45-59 :163 45-59 :170
## 60-74 :193 60-74 :192 60-74 :224 60-74 :195 60-74 :199
## 75plus : 82 75plus : 78 75plus : 74 75plus : 84 75plus : 89
summary(basicimp$imp$incgps)
## 1 2 3 4 5
## 0_<20k:1544 0_<20k:1476 0_<20k:1566 0_<20k:1585 0_<20k:1584
## 20-34 :1746 20-34 :1697 20-34 :1701 20-34 :1683 20-34 :1651
## 35-49 :1140 35-49 :1146 35-49 :1145 35-49 :1133 35-49 :1122
## 50-74 :1104 50-74 :1186 50-74 :1090 50-74 :1111 50-74 :1166
## 75plus:2062 75plus:2091 75plus:2094 75plus:2084 75plus:2073
summary(basicimp$imp$col)
## 1 2 3 4 5
## 0LessCol:57 0LessCol:56 0LessCol:58 0LessCol: 51 0LessCol:62
## Col :96 Col :97 Col :95 Col :102 Col :91
summary(basicimp$imp$laborforce)
## 1 2 3 4
## 0no :159 0no :141 0no :144 0no :151
## labforce:195 labforce:213 labforce:210 labforce:203
## 5
## 0no :146
## labforce:208
summary(basicimp$imp$mentpoor)
## 1 2 3 4 5
## 0:442 0:460 0:465 0:477 0:466
## 1:263 1:245 1:240 1:228 1:239
summary(basicimp$imp$bingedrinkyes)
## 1 2 3 4 5
## 0:17787 0:17634 0:17728 0:17738 0:17588
## 1: 5529 1: 5682 1: 5588 1: 5578 1: 5728
library(lattice)
stripplot(basicimp, agegpnam~bingedrinkyes|.imp, pch=20)
dat.basicimp <- complete(basicimp, action = 1)
head(dat.basicimp, n=10)
## agegpnam incgps col laborforce mentpoor bingedrinkyes
## 1 60-74 0_<20k 0LessCol labforce 0 0
## 2 0_18-24 0_<20k 0LessCol labforce 0 0
## 3 45-59 20-34 0LessCol 0no 1 0
## 4 60-74 0_<20k Col 0no 0 0
## 5 60-74 75plus Col labforce 0 0
## 6 45-59 0_<20k Col labforce 0 0
## 7 35-44 75plus Col labforce 0 0
## 8 60-74 0_<20k 0LessCol 0no 0 0
## 9 60-74 75plus Col 0no 0 0
## 10 35-44 75plus Col labforce 0 0
library(foreign)
fit.multimp<-with(data=basicimp, expr=glm(bingedrinkyes~incgps+agegpnam+col+laborforce+mentpoor, family = "binomial"))
fit.multimp
## call :
## with.mids(data = basicimp, expr = glm(bingedrinkyes ~ incgps +
## agegpnam + col + laborforce + mentpoor, family = "binomial"))
##
## call1 :
## mice(data = mydata[, c("agegpnam", "incgps", "col", "laborforce",
## "mentpoor", "bingedrinkyes")], m = 5, seed = 22)
##
## nmis :
## agegpnam incgps col laborforce mentpoor
## 628 7596 153 354 705
## bingedrinkyes
## 23316
##
## analyses :
## [[1]]
##
## Call: glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce +
## mentpoor, family = "binomial")
##
## Coefficients:
## (Intercept) incgps20-34 incgps35-49
## 0.02173 -0.15656 -0.02357
## incgps50-74 incgps75plus agegpnam25-34
## -0.17484 -0.11457 -0.18588
## agegpnam35-44 agegpnam45-59 agegpnam60-74
## -0.62756 -0.96651 -1.36750
## agegpnam75plus colCol laborforcelabforce
## -2.38040 -0.45900 0.29863
## mentpoor1
## 0.23021
##
## Degrees of Freedom: 44999 Total (i.e. Null); 44987 Residual
## Null Deviance: 50370
## Residual Deviance: 46040 AIC: 46070
##
## [[2]]
##
## Call: glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce +
## mentpoor, family = "binomial")
##
## Coefficients:
## (Intercept) incgps20-34 incgps35-49
## 0.1531 -0.1915 -0.1483
## incgps50-74 incgps75plus agegpnam25-34
## -0.1147 -0.1319 -0.3123
## agegpnam35-44 agegpnam45-59 agegpnam60-74
## -0.6596 -0.9879 -1.4613
## agegpnam75plus colCol laborforcelabforce
## -2.5423 -0.4556 0.2540
## mentpoor1
## 0.2302
##
## Degrees of Freedom: 44999 Total (i.e. Null); 44987 Residual
## Null Deviance: 50710
## Residual Deviance: 46250 AIC: 46280
##
## [[3]]
##
## Call: glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce +
## mentpoor, family = "binomial")
##
## Coefficients:
## (Intercept) incgps20-34 incgps35-49
## 0.07379 -0.15533 -0.11049
## incgps50-74 incgps75plus agegpnam25-34
## -0.19696 -0.14567 -0.15934
## agegpnam35-44 agegpnam45-59 agegpnam60-74
## -0.56037 -0.95935 -1.41385
## agegpnam75plus colCol laborforcelabforce
## -2.51435 -0.41608 0.25237
## mentpoor1
## 0.19728
##
## Degrees of Freedom: 44999 Total (i.e. Null); 44987 Residual
## Null Deviance: 50500
## Residual Deviance: 45970 AIC: 46000
##
## [[4]]
##
## Call: glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce +
## mentpoor, family = "binomial")
##
## Coefficients:
## (Intercept) incgps20-34 incgps35-49
## 0.05530 -0.11266 -0.01481
## incgps50-74 incgps75plus agegpnam25-34
## -0.10434 -0.05662 -0.25117
## agegpnam35-44 agegpnam45-59 agegpnam60-74
## -0.56465 -0.99610 -1.43889
## agegpnam75plus colCol laborforcelabforce
## -2.43277 -0.45459 0.25530
## mentpoor1
## 0.20273
##
## Degrees of Freedom: 44999 Total (i.e. Null); 44987 Residual
## Null Deviance: 50480
## Residual Deviance: 46130 AIC: 46160
##
## [[5]]
##
## Call: glm(formula = bingedrinkyes ~ incgps + agegpnam + col + laborforce +
## mentpoor, family = "binomial")
##
## Coefficients:
## (Intercept) incgps20-34 incgps35-49
## 0.10436 -0.13558 -0.08557
## incgps50-74 incgps75plus agegpnam25-34
## -0.12424 -0.12861 -0.25072
## agegpnam35-44 agegpnam45-59 agegpnam60-74
## -0.55230 -0.92607 -1.43011
## agegpnam75plus colCol laborforcelabforce
## -2.57922 -0.47759 0.25262
## mentpoor1
## 0.23096
##
## Degrees of Freedom: 44999 Total (i.e. Null); 44987 Residual
## Null Deviance: 50810
## Residual Deviance: 46150 AIC: 46180
est.p<-pool(fit.multimp)
print(est.p)
## Class: mipo m = 5
## estimate ubar b t
## (Intercept) 0.08165616 0.0024927533 0.0024920980 0.0054832709
## incgps20-34 -0.15033157 0.0014129811 0.0008492525 0.0024320841
## incgps35-49 -0.07655175 0.0017078553 0.0032505608 0.0056085282
## incgps50-74 -0.14300621 0.0016977027 0.0016438100 0.0036702747
## incgps75plus -0.11547407 0.0013465362 0.0012044799 0.0027919121
## agegpnam25-34 -0.23187458 0.0024673799 0.0036416353 0.0068373423
## agegpnam35-44 -0.59289665 0.0024728579 0.0022890627 0.0052197331
## agegpnam45-59 -0.96718026 0.0020454074 0.0007541394 0.0029503747
## agegpnam60-74 -1.42232731 0.0021386486 0.0012325356 0.0036176912
## agegpnam75plus -2.48981795 0.0043774153 0.0066448946 0.0123512888
## colCol -0.45257777 0.0006548276 0.0005030718 0.0012585138
## laborforcelabforce 0.26258277 0.0007984065 0.0004073323 0.0012872053
## mentpoor1 0.21827576 0.0005916406 0.0002820488 0.0009300992
## dfcom df riv lambda fmi
## (Intercept) 44987 13.438821 1.1996846 0.5453894 0.6006988
## incgps20-34 44987 22.761592 0.7212432 0.4190246 0.4641286
## incgps35-49 44987 8.264501 2.2839599 0.6954896 0.7495551
## incgps50-74 44987 13.838928 1.1619066 0.5374453 0.5923841
## incgps75plus 44987 14.914311 1.0734030 0.5177011 0.5715462
## agegpnam25-34 44987 9.786273 1.7710943 0.6391317 0.6955779
## agegpnam35-44 44987 14.433926 1.1108100 0.5262482 0.5805965
## agegpnam45-59 44987 42.457749 0.4424386 0.3067296 0.3372313
## agegpnam60-74 44987 23.909525 0.6915782 0.4088361 0.4527732
## agegpnam75plus 44987 9.591455 1.8215940 0.6455904 0.7018841
## colCol 44987 17.371250 0.9219009 0.4796818 0.5307654
## laborforcelabforce 44987 27.711744 0.6122179 0.3797365 0.4201291
## mentpoor1 44987 30.175139 0.5720678 0.3638951 0.4022434
summary(est.p)
## estimate std.error statistic df
## (Intercept) 0.08165616 0.07404911 1.102730 13.438821
## incgps20-34 -0.15033157 0.04931616 -3.048322 22.761592
## incgps35-49 -0.07655175 0.07489011 -1.022188 8.264501
## incgps50-74 -0.14300621 0.06058279 -2.360509 13.838928
## incgps75plus -0.11547407 0.05283855 -2.185413 14.914311
## agegpnam25-34 -0.23187458 0.08268822 -2.804203 9.786273
## agegpnam35-44 -0.59289665 0.07224772 -8.206441 14.433926
## agegpnam45-59 -0.96718026 0.05431735 -17.806101 42.457749
## agegpnam60-74 -1.42232731 0.06014725 -23.647422 23.909525
## agegpnam75plus -2.48981795 0.11113635 -22.403272 9.591455
## colCol -0.45257777 0.03547554 -12.757460 17.371250
## laborforcelabforce 0.26258277 0.03587764 7.318841 27.711744
## mentpoor1 0.21827576 0.03049753 7.157162 30.175139
## p.value
## (Intercept) 2.763560e-01
## incgps20-34 3.950773e-03
## incgps35-49 3.124812e-01
## incgps50-74 2.291602e-02
## incgps75plus 3.442476e-02
## agegpnam25-34 7.575694e-03
## agegpnam35-44 2.662626e-10
## agegpnam45-59 0.000000e+00
## agegpnam60-74 0.000000e+00
## agegpnam75plus 0.000000e+00
## colCol 4.440892e-16
## laborforcelabforce 4.795038e-09
## mentpoor1 8.171652e-09
fit1<-glm(bingedrinkyes~agegpnam+incgps+col+laborforce+mentpoor, data=mydata, family="binomial")
#fit1
summary(fit1)
##
## Call:
## glm(formula = bingedrinkyes ~ agegpnam + incgps + col + laborforce +
## mentpoor, family = "binomial", data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4453 -0.8101 -0.5992 1.1374 2.4254
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.16602 0.08678 1.913 0.055718 .
## agegpnam25-34 -0.29587 0.07645 -3.870 0.000109 ***
## agegpnam35-44 -0.62320 0.07718 -8.075 6.74e-16 ***
## agegpnam45-59 -0.99308 0.07218 -13.758 < 2e-16 ***
## agegpnam60-74 -1.45450 0.07536 -19.301 < 2e-16 ***
## agegpnam75plus -2.48970 0.11662 -21.349 < 2e-16 ***
## incgps20-34 -0.13230 0.06868 -1.926 0.054080 .
## incgps35-49 -0.10870 0.07074 -1.537 0.124372
## incgps50-74 -0.13711 0.06830 -2.007 0.044708 *
## incgps75plus -0.14255 0.06217 -2.293 0.021843 *
## colCol -0.42091 0.04119 -10.220 < 2e-16 ***
## laborforcelabforce 0.22470 0.04649 4.833 1.34e-06 ***
## mentpoor1 0.22007 0.03680 5.980 2.24e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 21640 on 18668 degrees of freedom
## Residual deviance: 19972 on 18656 degrees of freedom
## (26331 observations deleted due to missingness)
## AIC: 19998
##
## Number of Fisher Scoring iterations: 5
fit1.modimp <- glm(bingedrinkyes.imp~agegpnam.imp+incgps.imp+col.imp+laborforce.imp+mentpoor.imp, data = mydata, family="binomial")
summary(fit1.modimp)
##
## Call:
## glm(formula = bingedrinkyes.imp ~ agegpnam.imp + incgps.imp +
## col.imp + laborforce.imp + mentpoor.imp, family = "binomial",
## data = mydata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9366 -0.5730 -0.4333 -0.2387 2.8805
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.79070 0.06676 -26.822 < 2e-16 ***
## agegpnam.imp25-34 -0.09473 0.05749 -1.648 0.0994 .
## agegpnam.imp35-44 -0.48254 0.05889 -8.194 2.54e-16 ***
## agegpnam.imp45-59 -0.81303 0.05390 -15.084 < 2e-16 ***
## agegpnam.imp60-74 -1.24879 0.05670 -22.025 < 2e-16 ***
## agegpnam.imp75plus -2.34206 0.09950 -23.539 < 2e-16 ***
## incgps.imp20-34 0.09096 0.05827 1.561 0.1185
## incgps.imp35-49 0.28267 0.06083 4.647 3.37e-06 ***
## incgps.imp50-74 0.36518 0.05894 6.195 5.82e-10 ***
## incgps.imp75plus 0.25818 0.04997 5.167 2.38e-07 ***
## col.impCol 0.02701 0.03293 0.820 0.4121
## laborforce.implabforce 0.52272 0.03774 13.852 < 2e-16 ***
## mentpoor.imp1 0.27905 0.03080 9.060 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 33866 on 44999 degrees of freedom
## Residual deviance: 31163 on 44987 degrees of freedom
## AIC: 31189
##
## Number of Fisher Scoring iterations: 6
fit1.multiimp <- glm(bingedrinkyes~., data = dat.basicimp, family="binomial")
summary(fit1.multiimp)
##
## Call:
## glm(formula = bingedrinkyes ~ ., family = "binomial", data = dat.basicimp)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4183 -0.7747 -0.5893 -0.3129 2.4664
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.02173 0.04998 0.435 0.663786
## agegpnam25-34 -0.18588 0.04969 -3.741 0.000183 ***
## agegpnam35-44 -0.62756 0.04993 -12.569 < 2e-16 ***
## agegpnam45-59 -0.96651 0.04532 -21.326 < 2e-16 ***
## agegpnam60-74 -1.36750 0.04621 -29.591 < 2e-16 ***
## agegpnam75plus -2.38040 0.06524 -36.486 < 2e-16 ***
## incgps20-34 -0.15656 0.03759 -4.165 3.12e-05 ***
## incgps35-49 -0.02357 0.04113 -0.573 0.566584
## incgps50-74 -0.17484 0.04146 -4.217 2.47e-05 ***
## incgps75plus -0.11457 0.03678 -3.115 0.001840 **
## colCol -0.45900 0.02561 -17.920 < 2e-16 ***
## laborforcelabforce 0.29863 0.02832 10.544 < 2e-16 ***
## mentpoor1 0.23021 0.02435 9.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50374 on 44999 degrees of freedom
## Residual deviance: 46043 on 44987 degrees of freedom
## AIC: 46069
##
## Number of Fisher Scoring iterations: 5
Lowest AIC was when using the original dataset without any imputations. Significance levels changed from no impuation to modal imputation datasets. Income group ‘20-34’ had a significant association with ‘bingedrinkyes’ compared to reference income group ‘<20’ in the original dataset and in the multiple imputation dataset; however, that same relationship was not apparent in the dataset using modal imputation. In addition the direction of relationships in the modal imputation dataset differed from the other two. The multiple imputation datase results were more in line with that of the original dataset. In the original dataset over 26K observations were deleted to due to missingness. Therefore using multiple imputaion method yielded the best results that were akin to original without having to delete any observations.
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
#get the coefficients from each of the 5 imputations
coefs<-as.data.frame(matrix(unlist(lapply(fit.multimp$analyses, coef)), nrow=5, ncol=13, byrow=T))
names(coefs)<-names(fit.multimp$analyses[[1]]$coef)
#plot the coefficients from each of the different rounds of imputation to see the variability in the
#results
coefs%>%
select(`incgps20-34`:`mentpoor1`)%>%
melt()%>%
mutate(basicimp=rep(1:5, 12))%>%
ggplot()+geom_point(aes(y=value,x=variable, group=variable, color=as.factor(basicimp) ))+ggtitle("Estimated Betas from Each Imputed Regression Model for Binge Drinking Outcome")+theme(axis.text.x = element_text(angle = 45, hjust = 1))
## No id variables; using all as measure variables
We note that the betas are in relative proximation of each other, with some having greater spread compared to others in which the betas greatly overlap.
While the actual imputed values may have a greater spread, overall variance is replicated and thus results parallel that of the non-imputed dataset.