Multiple Imputation

Load necessary packages.

options(repos="https://cran.rstudio.com" )
library(htmlTable)
library(car)
## Loading required package: carData
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## Loading required package: survival
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(questionr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load data from ipums

# 
install.packages("ipumsr")
## 
## The downloaded binary packages are in
##  /var/folders/lr/thl33hwj2jdd01hth_whxlz00000gn/T//RtmpHRHpg1/downloaded_packages
library(ipumsr)
## Registered S3 methods overwritten by 'ipumsr':
##   method                                 from 
##   format.pillar_shaft_haven_labelled_chr haven
##   format.pillar_shaft_haven_labelled_num haven
##   pillar_shaft.haven_labelled            haven
# Note to self: Must set wd to where all files are, then open ddi and feed into read_ipums_micro
setwd("~/Desktop/UTSA/DEM7283_StatsII/HW2")
ddi <- read_ipums_ddi("usa_00011.xml")
data_total <- read_ipums_micro(ddi)
## Use of data from IPUMS-USA is subject to conditions including that users should
## cite the data appropriately. Use command `ipums_conditions()` for more details.
names(data_total)
##  [1] "YEAR"      "MULTYEAR"  "SAMPLE"    "SERIAL"    "CBSERIAL"  "HHWT"     
##  [7] "CLUSTER"   "STATEFIP"  "COUNTYFIP" "MET2013"   "PUMA"      "STRATA"   
## [13] "GQ"        "OWNERSHP"  "OWNERSHPD" "VALUEH"    "PERNUM"    "PERWT"    
## [19] "SEX"       "AGE"       "RACE"      "RACED"     "HISPAN"    "HISPAND"  
## [25] "EMPSTAT"   "EMPSTATD"  "LABFORCE"  "OCC"       "IND"       "CLASSWKR" 
## [31] "CLASSWKRD" "WKSWORK2"  "UHRSWORK"  "INCWAGE"   "POVERTY"   "MIGRATE1" 
## [37] "MIGRATE1D"
#At first I tried to do the multiple imputation with this whole dataset but it took forever so let's filter out just for the observations in the San Antonio-New Braunfels MSA (code = 33260    ). This leaves me with 6,021 observations.
data <- filter(data_total, MET2013==33260)

Cleaning data/recoding variables

Outcome Variable: Homeowner (yes/no)

Predictor Variables: * Age * Employment Status * Moved within last year * Poverty status * Race * Hispanic/Latino Ethncity

#Homeownership
data$tenure_c <-Recode(data$OWNERSHP, recodes="0=NA; 1='Owned/Mortgaged'; 2='Rented'", as.factor=T)

#Age
data$age_c <-Recode(data$AGE, recodes="0:17='0 Child'; 18:64='1 Working Age'; else='2 Senior'", as.factor=T)

#Employment Status
data$empl_c <-Recode(data$EMPSTAT, recodes="0=NA; 1='1 Employed'; 2='2 Unemployed'; 3='3 Not in LF'", as.factor=T)

#Moved within last year
data$move <- Recode(data$MIGRATE1, recodes="0=NA; 1='2. Did not Move'; 2:4='1. Moved within last year'; 9=NA", as.factor=T)

#Poverty Status
data$pov_b <-Recode(data$POVERTY, recodes="000=NA; 1:100='Below Poverty'; else='Above Poverty'", as.factor=T)

#Race
data$race_c <- Recode(data$RACE, recodes = "1='1. White'; 2='2. Black'; 3='3. Native American'; 4:6='4. Asian/PI'; else = '5. Other, or more than one race'", as.factor=T)

#Hispanic/Latino Ethnicity
data$hisp <- Recode(data$HISPAN, recodes = "0='1. Not Hispanic'; 1:4='2. Hispanic/Latino'; else = NA", as.factor=T)

Pattern of missingness for all variables

There are no NAs for Race or Ethnicity, which is interesting and I couldn’t find why those would, in particular, not have any NAs (perhaps any NAs go to “other?”). There are also no NAs for age, which I wonder if the ACS already imputes that?

The highest number of NAs (1,339) are for employment status, which might make sense as a question that many might be reluctant to answer, or perhaps might not understand. Housing tenure has 219 NAs, followed by poverty status with 158 and “move within last year” with 90. When examining the missingness pattern, we see that 143 observations are missing poverty and tenure (I think I’m interpreting the pattern correctly?). If this is the case, perhaps there are shared characteristics of those who are not responding to both of those questions.

summary(data[, c("tenure_c", "age_c","empl_c","move","pov_b", "race_c", "hisp")])
##             tenure_c              age_c               empl_c    
##  Owned/Mortgaged:4423   0 Child      :1482   1 Employed  :3006  
##  Rented         :1379   1 Working Age:3671   2 Unemployed: 103  
##  NA's           : 219   2 Senior     : 868   3 Not in LF :1573  
##                                              NA's        :1339  
##                                                                 
##                         move                pov_b     
##  1. Moved within last year: 939   Above Poverty:5345  
##  2. Did not Move          :4992   Below Poverty: 518  
##  NA's                     :  90   NA's         : 158  
##                                                       
##                                                       
##                              race_c                     hisp     
##  1. White                       :5158   1. Not Hispanic   :3812  
##  2. Black                       : 361   2. Hispanic/Latino:2209  
##  3. Native American             :  32                            
##  4. Asian/PI                    : 112                            
##  5. Other, or more than one race: 358
library(mice)
## Loading required package: lattice
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
## 
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
md.pattern(data[, c("tenure_c", "age_c","empl_c","move","pov_b", "race_c", "hisp")])

##      age_c race_c hisp move pov_b tenure_c empl_c     
## 4465     1      1    1    1     1        1      1    0
## 1237     1      1    1    1     1        1      0    1
## 74       1      1    1    1     1        0      1    1
## 10       1      1    1    1     0        1      0    2
## 143      1      1    1    1     0        0      1    2
## 2        1      1    1    1     0        0      0    3
## 87       1      1    1    0     1        1      0    2
## 3        1      1    1    0     0        1      0    3
##          0      0    0   90   158      219   1339 1806

Comparison: Modal vs. Multiple

Were the results similar between the mean/modal and multiply imputed data sets? How do the results compare to the results from the model fit with the data source with missing values?

  • Here I’m comparing regressions bewteen data sets (I think this is “the analysis” referenced in homework?). I’m honestly unsure how to compare them, and unsure if I’m doing this right. Is this on the right track?

  • The coefficients for the modally imputed data set are different than both the original data and the multiply imputed dataset.

#Unchaged data (leaving NAs as is)
library(survey)
des <- svydesign(id=~CLUSTER, strata=~STRATA, weights=~PERWT, data=data)
fit.logit1<-svyglm(tenure_c~age_c+empl_c+move+pov_b+race_c+hisp,
                  design= des,
                  family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit1)
## 
## Call:
## svyglm(formula = tenure_c ~ age_c + empl_c + move + pov_b + race_c + 
##     hisp, design = des, family = binomial)
## 
## Survey design:
## svydesign(id = ~CLUSTER, strata = ~STRATA, weights = ~PERWT, 
##     data = data)
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            0.3099425  0.3197897   0.969 0.332547
## age_c1 Working Age                     0.1260949  0.2806702   0.449 0.653285
## age_c2 Senior                         -0.4103804  0.3111046  -1.319 0.187269
## empl_c2 Unemployed                    -0.0003687  0.3058694  -0.001 0.999038
## empl_c3 Not in LF                     -0.4527183  0.1122262  -4.034 5.67e-05
## move2. Did not Move                   -1.8678294  0.1509127 -12.377  < 2e-16
## pov_bBelow Poverty                     1.2314764  0.2178243   5.654 1.77e-08
## race_c2. Black                         0.8663488  0.2294602   3.776 0.000164
## race_c3. Native American               1.3557348  0.6978477   1.943 0.052174
## race_c4. Asian/PI                     -0.3028557  0.5347939  -0.566 0.571245
## race_c5. Other, or more than one race  0.0857941  0.2411641   0.356 0.722062
## hisp2. Hispanic/Latino                 0.2128376  0.1431861   1.486 0.137304
##                                          
## (Intercept)                              
## age_c1 Working Age                       
## age_c2 Senior                            
## empl_c2 Unemployed                       
## empl_c3 Not in LF                     ***
## move2. Did not Move                   ***
## pov_bBelow Poverty                    ***
## race_c2. Black                        ***
## race_c3. Native American              .  
## race_c4. Asian/PI                        
## race_c5. Other, or more than one race    
## hisp2. Hispanic/Latino                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9912079)
## 
## Number of Fisher Scoring iterations: 4
#Modally imputed data
des <- svydesign(id=~CLUSTER, strata=~STRATA, weights=~PERWT, data=data)
fit.logit2<-svyglm(tenure_c.imp~age_c+empl_c.imp+move.imp+pov_b.imp+race_c+hisp,
                  design= des,
                  family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit2)
## 
## Call:
## svyglm(formula = tenure_c.imp ~ age_c + empl_c.imp + move.imp + 
##     pov_b.imp + race_c + hisp, design = des, family = binomial)
## 
## Survey design:
## svydesign(id = ~CLUSTER, strata = ~STRATA, weights = ~PERWT, 
##     data = data)
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            0.249851   0.171123   1.460  0.14440    
## age_c1 Working Age                     0.006252   0.095811   0.065  0.94798    
## age_c2 Senior                         -0.478671   0.185377  -2.582  0.00988 ** 
## empl_c.imp2 Unemployed                -0.038790   0.304452  -0.127  0.89863    
## empl_c.imp3 Not in LF                 -0.593506   0.104662  -5.671 1.59e-08 ***
## move.imp2. Did not Move               -1.736605   0.151531 -11.460  < 2e-16 ***
## pov_b.impBelow Poverty                 1.209164   0.243671   4.962 7.45e-07 ***
## race_c2. Black                         1.087414   0.255727   4.252 2.20e-05 ***
## race_c3. Native American               1.736709   0.648277   2.679  0.00743 ** 
## race_c4. Asian/PI                     -0.109817   0.551454  -0.199  0.84217    
## race_c5. Other, or more than one race -0.060030   0.231282  -0.260  0.79523    
## hisp2. Hispanic/Latino                 0.352091   0.145231   2.424  0.01541 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.00653)
## 
## Number of Fisher Scoring iterations: 4
#Multiply imputed data
des2 <- svydesign(id=~CLUSTER, strata=~STRATA, weights=~PERWT, data=dat2)
fit.logit3<-svyglm(tenure_knock~age_c+empl_c+move+pov_b+race_c+hisp,
                  design= des2,
                  family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(fit.logit3)
## 
## Call:
## svyglm(formula = tenure_knock ~ age_c + empl_c + move + pov_b + 
##     race_c + hisp, design = des2, family = binomial)
## 
## Survey design:
## svydesign(id = ~CLUSTER, strata = ~STRATA, weights = ~PERWT, 
##     data = dat2)
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            0.32132    0.33537   0.958 0.338120    
## age_c1 Working Age                     0.14611    0.29565   0.494 0.621228    
## age_c2 Senior                         -0.42601    0.32861  -1.296 0.194986    
## empl_c2 Unemployed                    -0.04601    0.31441  -0.146 0.883666    
## empl_c3 Not in LF                     -0.40919    0.11863  -3.449 0.000573 ***
## move2. Did not Move                   -1.88783    0.15187 -12.431  < 2e-16 ***
## pov_bBelow Poverty                     1.20884    0.22296   5.422 6.55e-08 ***
## race_c2. Black                         0.76512    0.23382   3.272 0.001084 ** 
## race_c3. Native American               1.37760    0.70731   1.948 0.051585 .  
## race_c4. Asian/PI                     -0.02362    0.52455  -0.045 0.964090    
## race_c5. Other, or more than one race  0.03638    0.25156   0.145 0.885033    
## hisp2. Hispanic/Latino                 0.19705    0.14671   1.343 0.179382    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 0.9919528)
## 
## Number of Fisher Scoring iterations: 4