#dat17 <- read_excel("acs5_2017_comp.xlsx", sheet = "2017acs5yr", skip = 1)
#summary(dat17)
hw6 <- read_excel("~/Google Drive/J254592.xlsx")
#replace_na(dat17)
#summary(hw6)


hw6 <- hw6 %>% 
  transmute(
    move = ER42132,
    educ = ER34020,
    sex = ER36018,
    age = ER33904,
    health = ER38202,
    race = ER46543,
    wt = ER34046
  )
table(hw6$age)
## 
##   0   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23 
##  49   2 129 176 213 192 249 210 227 219 238 200 250 223 221  59
#table(hw6$race)
#replace_with_na(hw6, replace = list(x = 999))
    
  
  
#summary(hw6)
hw6 <- hw6%>% 
  mutate(
    educ = as.factor(ifelse(educ <= 11, "lths",
                  ifelse(educ == 12, "hs",
                        ifelse(educ >=13 & educ <= 90, "college", NA)))),
    health = as.factor(ifelse(health <= 2, "good",
                    ifelse(health ==3, "fair",
                         ifelse(health >= 4 & health <= 6  , "poor", NA)))),
    move = ifelse(move == 1,1,#1 means moved
                  ifelse(move == 5, 0,NA)),
    sex = as.factor(ifelse(sex == 1, "m","f")),
    race = as.factor(ifelse(race == 1, "white",
                            ifelse(race == 2, "black",
                                   ifelse(race >= 3 & race <= 7, "other", NA)))),
    age=as.factor(ifelse(age >=9 & age <= 12,"child",
               ifelse(age>=13 & age <=19, "teen",
                      ifelse( age >=20 & age <= 90, "adult", NA))))
  )
hw6$educ <- relevel(hw6$educ, ref = "hs")
hw6$sex <- relevel(hw6$sex, ref = "m")
hw6$health <- relevel(hw6$health, ref = "fair")
hw6$race <- relevel(hw6$race, ref = "white")

#summary(hw6$race)
#table(hw6$welf)
#table(hw6$educ)
hw6.rem <- hw6[complete.cases(hw6),]
 #str(hw6.rem$age) 

options(survey.lonely.psu = "adjust")
des1 <- svydesign(ids = ~1, weights = ~wt, data = hw6 )

Measure an outcome variable with at leat 5 predictors

I’m revisiting the PSID 2017 data

I’m going to do a logistic regression using whether or not someone moved as the dependent variable and 5 predictors:
educ: education lths = less than high school
hs = high school
college = any college
sex: gender
age: age health: health good, fair and poor welf: did you receive welfare in the past year

Looking at a summary of the data and we see that there are NAs in all of the variables
summary(hw6$educ)
##      hs college    lths    NA's 
##     589     600    1653      15
summary(hw6$sex)
##    m    f NA's 
## 1795 1013   49
summary(hw6$age)
## adult child  teen  NA's 
##   753   520  1535    49
summary(hw6$health)
## fair good poor NA's 
##  819 1577  402   59
summary(hw6$race)
## white black other  NA's 
##  1435  1184   167    71

The first model with missing values removed

fit1 <- svyglm(move ~ educ + sex + age + health + race, data = hw6.rem, family = "binomial", design = des1)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!

Imputing by mode
Education

table(hw6$educ)
## 
##      hs college    lths 
##     589     600    1653
mcv.educ <- factor(names(which.max(table(hw6$educ))), levels = levels(hw6$educ))
mcv.educ
## [1] lths
## Levels: hs college lths
hw6$educ.imp <- as.factor(ifelse(is.na(hw6$educ) == T, mcv.educ, hw6$educ))
levels(hw6$educ.imp) <- levels(hw6$educ)

prop.table(table(hw6$educ))
## 
##        hs   college      lths 
## 0.2072484 0.2111189 0.5816327
prop.table(table(hw6$educ.imp))
## 
##        hs   college      lths 
## 0.2061603 0.2100105 0.5838292

Health

table(hw6$health)
## 
## fair good poor 
##  819 1577  402
mcv.health <- factor(names(which.max(table(hw6$health))), levels = levels(hw6$health))
mcv.health
## [1] good
## Levels: fair good poor
hw6$health.imp <- as.factor(ifelse(is.na(hw6$health) == T, mcv.health, hw6$health))
levels(hw6$health.imp) <- levels(hw6$health)

prop.table(table(hw6$health))
## 
##      fair      good      poor 
## 0.2927091 0.5636169 0.1436741
prop.table(table(hw6$health.imp))
## 
##      fair      good      poor 
## 0.2866643 0.5726286 0.1407070

race

table(hw6$race)
## 
## white black other 
##  1435  1184   167
mcv.race <- factor(names(which.max(table(hw6$race))), levels = levels(hw6$race))
mcv.race
## [1] white
## Levels: white black other
hw6$race.imp <- as.factor(ifelse(is.na(hw6$race) == T, mcv.race, hw6$race))
levels(hw6$race.imp) <- levels(hw6$race)

prop.table(table(hw6$race))
## 
##      white      black      other 
## 0.51507538 0.42498205 0.05994257
prop.table(table(hw6$race.imp))
## 
##      white      black      other 
## 0.52712636 0.41442072 0.05845292

sex

table(hw6$sex)
## 
##    m    f 
## 1795 1013
mcv.sex <- factor(names(which.max(table(hw6$sex))), levels = levels(hw6$sex))
mcv.sex
## [1] m
## Levels: m f
hw6$sex.imp <- as.factor(ifelse(is.na(hw6$sex) == T, mcv.sex, hw6$sex))
levels(hw6$sex.imp) <- levels(hw6$sex)

prop.table(table(hw6$sex))
## 
##        m        f 
## 0.639245 0.360755
prop.table(table(hw6$sex.imp))
## 
##         m         f 
## 0.6454323 0.3545677

Age

table(hw6$age)
## 
## adult child  teen 
##   753   520  1535
mcv.age <- factor(names(which.max(table(hw6$age))), levels = levels(hw6$age))
mcv.age
## [1] teen
## Levels: adult child teen
hw6$age.imp <- as.factor(ifelse(is.na(hw6$age) == T, mcv.age, hw6$age))
levels(hw6$age.imp) <- levels(hw6$age)

prop.table(table(hw6$age))
## 
##     adult     child      teen 
## 0.2681624 0.1851852 0.5466524
prop.table(table(hw6$age.imp))
## 
##     adult     child      teen 
## 0.2635632 0.1820091 0.5544277

logistic regression with imputed modes

options(survey.lonely.psu = "adjust")
des2 <- svydesign(ids = ~1, weights = ~wt, data = hw6 )

fit2 <- svyglm(move ~ educ.imp + sex.imp + age.imp + health.imp + race.imp, data = hw6, family = "binomial", design = des2) 
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!

Multiple imputation
pairs

md.pairs(hw6[ , c("educ", "health", "age", "race", "sex")])
## $rr
##        educ health  age race  sex
## educ   2842   2783 2793 2771 2793
## health 2783   2798 2798 2732 2798
## age    2793   2798 2808 2741 2808
## race   2771   2732 2741 2786 2741
## sex    2793   2798 2808 2741 2808
## 
## $rm
##        educ health age race sex
## educ      0     59  49   71  49
## health   15      0   0   66   0
## age      15     10   0   67   0
## race     15     54  45    0  45
## sex      15     10   0   67   0
## 
## $mr
##        educ health age race sex
## educ      0     15  15   15  15
## health   59      0  10   54  10
## age      49      0   0   45   0
## race     71     66  67    0  67
## sex      49      0   0   45   0
## 
## $mm
##        educ health age race sex
## educ     15      0   0    0   0
## health    0     59  49    5  49
## age       0     49  49    4  49
## race      0      5   4   71   4
## sex       0     49  49    4  49

from this we can see the pairwise missingness among the variables

Multiple Imputation

imp.mul <- mice(data = hw6[ , c("educ", "health", "age", "race", "sex")], seed = 30, m = 20)
## 
##  iter imp variable
##   1   1  educ  health  age  race  sex
##   1   2  educ  health  age  race  sex
##   1   3  educ  health  age  race  sex
##   1   4  educ  health  age  race  sex
##   1   5  educ  health  age  race  sex
##   1   6  educ  health  age  race  sex
##   1   7  educ  health  age  race  sex
##   1   8  educ  health  age  race  sex
##   1   9  educ  health  age  race  sex
##   1   10  educ  health  age  race  sex
##   1   11  educ  health  age  race  sex
##   1   12  educ  health  age  race  sex
##   1   13  educ  health  age  race  sex
##   1   14  educ  health  age  race  sex
##   1   15  educ  health  age  race  sex
##   1   16  educ  health  age  race  sex
##   1   17  educ  health  age  race  sex
##   1   18  educ  health  age  race  sex
##   1   19  educ  health  age  race  sex
##   1   20  educ  health  age  race  sex
##   2   1  educ  health  age  race  sex
##   2   2  educ  health  age  race  sex
##   2   3  educ  health  age  race  sex
##   2   4  educ  health  age  race  sex
##   2   5  educ  health  age  race  sex
##   2   6  educ  health  age  race  sex
##   2   7  educ  health  age  race  sex
##   2   8  educ  health  age  race  sex
##   2   9  educ  health  age  race  sex
##   2   10  educ  health  age  race  sex
##   2   11  educ  health  age  race  sex
##   2   12  educ  health  age  race  sex
##   2   13  educ  health  age  race  sex
##   2   14  educ  health  age  race  sex
##   2   15  educ  health  age  race  sex
##   2   16  educ  health  age  race  sex
##   2   17  educ  health  age  race  sex
##   2   18  educ  health  age  race  sex
##   2   19  educ  health  age  race  sex
##   2   20  educ  health  age  race  sex
##   3   1  educ  health  age  race  sex
##   3   2  educ  health  age  race  sex
##   3   3  educ  health  age  race  sex
##   3   4  educ  health  age  race  sex
##   3   5  educ  health  age  race  sex
##   3   6  educ  health  age  race  sex
##   3   7  educ  health  age  race  sex
##   3   8  educ  health  age  race  sex
##   3   9  educ  health  age  race  sex
##   3   10  educ  health  age  race  sex
##   3   11  educ  health  age  race  sex
##   3   12  educ  health  age  race  sex
##   3   13  educ  health  age  race  sex
##   3   14  educ  health  age  race  sex
##   3   15  educ  health  age  race  sex
##   3   16  educ  health  age  race  sex
##   3   17  educ  health  age  race  sex
##   3   18  educ  health  age  race  sex
##   3   19  educ  health  age  race  sex
##   3   20  educ  health  age  race  sex
##   4   1  educ  health  age  race  sex
##   4   2  educ  health  age  race  sex
##   4   3  educ  health  age  race  sex
##   4   4  educ  health  age  race  sex
##   4   5  educ  health  age  race  sex
##   4   6  educ  health  age  race  sex
##   4   7  educ  health  age  race  sex
##   4   8  educ  health  age  race  sex
##   4   9  educ  health  age  race  sex
##   4   10  educ  health  age  race  sex
##   4   11  educ  health  age  race  sex
##   4   12  educ  health  age  race  sex
##   4   13  educ  health  age  race  sex
##   4   14  educ  health  age  race  sex
##   4   15  educ  health  age  race  sex
##   4   16  educ  health  age  race  sex
##   4   17  educ  health  age  race  sex
##   4   18  educ  health  age  race  sex
##   4   19  educ  health  age  race  sex
##   4   20  educ  health  age  race  sex
##   5   1  educ  health  age  race  sex
##   5   2  educ  health  age  race  sex
##   5   3  educ  health  age  race  sex
##   5   4  educ  health  age  race  sex
##   5   5  educ  health  age  race  sex
##   5   6  educ  health  age  race  sex
##   5   7  educ  health  age  race  sex
##   5   8  educ  health  age  race  sex
##   5   9  educ  health  age  race  sex
##   5   10  educ  health  age  race  sex
##   5   11  educ  health  age  race  sex
##   5   12  educ  health  age  race  sex
##   5   13  educ  health  age  race  sex
##   5   14  educ  health  age  race  sex
##   5   15  educ  health  age  race  sex
##   5   16  educ  health  age  race  sex
##   5   17  educ  health  age  race  sex
##   5   18  educ  health  age  race  sex
##   5   19  educ  health  age  race  sex
##   5   20  educ  health  age  race  sex
print(imp.mul)
## Class: mids
## Number of multiple imputations:  20 
## Imputation methods:
##      educ    health       age      race       sex 
## "polyreg" "polyreg" "polyreg" "polyreg"  "logreg" 
## PredictorMatrix:
##        educ health age race sex
## educ      0      1   1    1   1
## health    1      0   1    1   1
## age       1      1   0    1   1
## race      1      1   1    0   1
## sex       1      1   1    1   0
head(imp.mul$imp$educ)
##           1       2       3       4       5       6       7       8
## 227      hs    lths    lths    lths    lths    lths    lths      hs
## 355 college    lths    lths      hs college    lths    lths college
## 368    lths    lths college    lths      hs    lths      hs    lths
## 706 college college    lths      hs    lths college    lths      hs
## 884    lths college college college    lths      hs      hs    lths
## 894 college    lths    lths college      hs    lths college    lths
##           9      10      11      12      13      14      15      16   17
## 227    lths    lths    lths    lths    lths    lths    lths      hs lths
## 355      hs      hs college college college college      hs    lths   hs
## 368    lths      hs      hs    lths    lths      hs      hs college   hs
## 706      hs      hs    lths college    lths college college    lths lths
## 884 college college college      hs    lths    lths      hs    lths   hs
## 894    lths    lths    lths college college    lths    lths    lths lths
##          18      19      20
## 227 college    lths college
## 355    lths college college
## 368    lths college    lths
## 706    lths college    lths
## 884 college college college
## 894    lths college    lths
summary(imp.mul$imp$educ)
##        1           2           3           4           5           6    
##  hs     :4   hs     :2   hs     :4   hs     :5   hs     :4   hs     :4  
##  college:7   college:4   college:5   college:5   college:2   college:5  
##  lths   :4   lths   :9   lths   :6   lths   :5   lths   :9   lths   :6  
##        7           8           9           10          11          12   
##  hs     :5   hs     :4   hs     :4   hs     :7   hs     :1   hs     :5  
##  college:2   college:5   college:5   college:4   college:6   college:3  
##  lths   :8   lths   :6   lths   :6   lths   :4   lths   :8   lths   :7  
##        13          14          15          16          17          18   
##  hs     :3   hs     :5   hs     :6   hs     :3   hs     :7   hs     :4  
##  college:5   college:4   college:5   college:5   college:0   college:5  
##  lths   :7   lths   :6   lths   :4   lths   :7   lths   :8   lths   :6  
##        19          20   
##  hs     :2   hs     :3  
##  college:8   college:6  
##  lths   :5   lths   :6
dat.imp <- complete(imp.mul, action = 12)

dat.imp$wt <- hw6$wt
head(dat.imp)
##      educ health   age  race sex    wt
## 1    lths   good child white   f 13827
## 2      hs   poor adult white   m  9605
## 3      hs   good  teen white   m 20668
## 4    lths   good  teen white   m 12246
## 5    lths   fair adult white   f  7406
## 6 college   fair adult white   m  6361

Running the model with the multiple imputed values

options(survey.lonely.psu = "adjust")
des3 <- svydesign(ids = ~1, weights = ~wt, data = dat.imp )
fit3 <- svyglm(move ~ educ + sex + age + health + race, data = dat.imp, family = "binomial", design = des1)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!

Now comparing the three models

First, missing values removed

summary(fit1)
## 
## Call:
## svyglm(formula = move ~ educ + sex + age + health + race, design = des1, 
##     family = "binomial", data = hw6.rem)
## 
## Survey design:
## svydesign(ids = ~1, weights = ~wt, data = hw6)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.21727    0.18049  -1.204 0.228780    
## educcollege -0.09882    0.16275  -0.607 0.543789    
## educlths    -0.50466    0.15138  -3.334 0.000868 ***
## sexf         0.90810    0.13664   6.646 3.63e-11 ***
## agechild    -0.83458    0.19361  -4.311 1.69e-05 ***
## ageteen     -0.94810    0.13307  -7.125 1.33e-12 ***
## healthgood   0.01992    0.13228   0.151 0.880314    
## healthpoor   0.25203    0.19210   1.312 0.189625    
## raceblack    0.21024    0.14695   1.431 0.152639    
## raceother    0.18309    0.21050   0.870 0.384485    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.013313)
## 
## Number of Fisher Scoring iterations: 4

With modal imputation

summary(fit2)
## 
## Call:
## svyglm(formula = move ~ educ.imp + sex.imp + age.imp + health.imp + 
##     race.imp, design = des2, family = "binomial", data = hw6)
## 
## Survey design:
## svydesign(ids = ~1, weights = ~wt, data = hw6)
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.25860    0.17740  -1.458 0.145023    
## educ.impcollege -0.07136    0.15883  -0.449 0.653248    
## educ.implths    -0.51576    0.14662  -3.518 0.000442 ***
## sex.impf         0.84897    0.13337   6.366 2.26e-10 ***
## age.impchild    -0.81310    0.19081  -4.261 2.10e-05 ***
## age.impteen     -0.88277    0.12983  -6.799 1.28e-11 ***
## health.impgood   0.05948    0.12994   0.458 0.647183    
## health.imppoor   0.25883    0.18936   1.367 0.171776    
## race.impblack    0.27857    0.14440   1.929 0.053802 .  
## race.impother    0.20130    0.20574   0.978 0.327950    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.013803)
## 
## Number of Fisher Scoring iterations: 4

An with the multiple imputations

summary(fit3)
## 
## Call:
## svyglm(formula = move ~ educ + sex + age + health + race, design = des1, 
##     family = "binomial", data = dat.imp)
## 
## Survey design:
## svydesign(ids = ~1, weights = ~wt, data = hw6)
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.21727    0.18049  -1.204 0.228780    
## educcollege -0.09882    0.16275  -0.607 0.543789    
## educlths    -0.50466    0.15138  -3.334 0.000868 ***
## sexf         0.90810    0.13664   6.646 3.63e-11 ***
## agechild    -0.83458    0.19361  -4.311 1.69e-05 ***
## ageteen     -0.94810    0.13307  -7.125 1.33e-12 ***
## healthgood   0.01992    0.13228   0.151 0.880314    
## healthpoor   0.25203    0.19210   1.312 0.189625    
## raceblack    0.21024    0.14695   1.431 0.152639    
## raceother    0.18309    0.21050   0.870 0.384485    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1.013313)
## 
## Number of Fisher Scoring iterations: 4

OK, we can see with the difference in the first two models, removing all the NA’s vice using modal imputation to replace the NA’s. The coefficients and standard errors change, but all four variables that are significant in model one remain significant in model two. The only real difference is that race = back’s p-value moves from 0.153 to 0.053. I can’t figure out why my model three is exactly the same as my model one. I know I had issues getting it to take on the design model weight for three, but I don’t think that should have changed my multiple imputation to be exactly the same as removing NA’s. A bit of a quandry here