#dat17 <- read_excel("acs5_2017_comp.xlsx", sheet = "2017acs5yr", skip = 1)
#summary(dat17)
hw6 <- read_excel("~/Google Drive/J254592.xlsx")
#replace_na(dat17)
#summary(hw6)
hw6 <- hw6 %>%
transmute(
move = ER42132,
educ = ER34020,
sex = ER36018,
age = ER33904,
health = ER38202,
race = ER46543,
wt = ER34046
)
table(hw6$age)
##
## 0 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## 49 2 129 176 213 192 249 210 227 219 238 200 250 223 221 59
#table(hw6$race)
#replace_with_na(hw6, replace = list(x = 999))
#summary(hw6)
hw6 <- hw6%>%
mutate(
educ = as.factor(ifelse(educ <= 11, "lths",
ifelse(educ == 12, "hs",
ifelse(educ >=13 & educ <= 90, "college", NA)))),
health = as.factor(ifelse(health <= 2, "good",
ifelse(health ==3, "fair",
ifelse(health >= 4 & health <= 6 , "poor", NA)))),
move = ifelse(move == 1,1,#1 means moved
ifelse(move == 5, 0,NA)),
sex = as.factor(ifelse(sex == 1, "m","f")),
race = as.factor(ifelse(race == 1, "white",
ifelse(race == 2, "black",
ifelse(race >= 3 & race <= 7, "other", NA)))),
age=as.factor(ifelse(age >=9 & age <= 12,"child",
ifelse(age>=13 & age <=19, "teen",
ifelse( age >=20 & age <= 90, "adult", NA))))
)
hw6$educ <- relevel(hw6$educ, ref = "hs")
hw6$sex <- relevel(hw6$sex, ref = "m")
hw6$health <- relevel(hw6$health, ref = "fair")
hw6$race <- relevel(hw6$race, ref = "white")
#summary(hw6$race)
#table(hw6$welf)
#table(hw6$educ)
hw6.rem <- hw6[complete.cases(hw6),]
#str(hw6.rem$age)
options(survey.lonely.psu = "adjust")
des1 <- svydesign(ids = ~1, weights = ~wt, data = hw6 )
I’m revisiting the PSID 2017 data
I’m going to do a logistic regression using whether or not someone moved as the dependent variable and 5 predictors:
educ: education lths = less than high school
hs = high school
college = any college
sex: gender
age: age health: health good, fair and poor welf: did you receive welfare in the past year
summary(hw6$educ)
## hs college lths NA's
## 589 600 1653 15
summary(hw6$sex)
## m f NA's
## 1795 1013 49
summary(hw6$age)
## adult child teen NA's
## 753 520 1535 49
summary(hw6$health)
## fair good poor NA's
## 819 1577 402 59
summary(hw6$race)
## white black other NA's
## 1435 1184 167 71
The first model with missing values removed
fit1 <- svyglm(move ~ educ + sex + age + health + race, data = hw6.rem, family = "binomial", design = des1)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
Imputing by mode
Education
table(hw6$educ)
##
## hs college lths
## 589 600 1653
mcv.educ <- factor(names(which.max(table(hw6$educ))), levels = levels(hw6$educ))
mcv.educ
## [1] lths
## Levels: hs college lths
hw6$educ.imp <- as.factor(ifelse(is.na(hw6$educ) == T, mcv.educ, hw6$educ))
levels(hw6$educ.imp) <- levels(hw6$educ)
prop.table(table(hw6$educ))
##
## hs college lths
## 0.2072484 0.2111189 0.5816327
prop.table(table(hw6$educ.imp))
##
## hs college lths
## 0.2061603 0.2100105 0.5838292
Health
table(hw6$health)
##
## fair good poor
## 819 1577 402
mcv.health <- factor(names(which.max(table(hw6$health))), levels = levels(hw6$health))
mcv.health
## [1] good
## Levels: fair good poor
hw6$health.imp <- as.factor(ifelse(is.na(hw6$health) == T, mcv.health, hw6$health))
levels(hw6$health.imp) <- levels(hw6$health)
prop.table(table(hw6$health))
##
## fair good poor
## 0.2927091 0.5636169 0.1436741
prop.table(table(hw6$health.imp))
##
## fair good poor
## 0.2866643 0.5726286 0.1407070
race
table(hw6$race)
##
## white black other
## 1435 1184 167
mcv.race <- factor(names(which.max(table(hw6$race))), levels = levels(hw6$race))
mcv.race
## [1] white
## Levels: white black other
hw6$race.imp <- as.factor(ifelse(is.na(hw6$race) == T, mcv.race, hw6$race))
levels(hw6$race.imp) <- levels(hw6$race)
prop.table(table(hw6$race))
##
## white black other
## 0.51507538 0.42498205 0.05994257
prop.table(table(hw6$race.imp))
##
## white black other
## 0.52712636 0.41442072 0.05845292
sex
table(hw6$sex)
##
## m f
## 1795 1013
mcv.sex <- factor(names(which.max(table(hw6$sex))), levels = levels(hw6$sex))
mcv.sex
## [1] m
## Levels: m f
hw6$sex.imp <- as.factor(ifelse(is.na(hw6$sex) == T, mcv.sex, hw6$sex))
levels(hw6$sex.imp) <- levels(hw6$sex)
prop.table(table(hw6$sex))
##
## m f
## 0.639245 0.360755
prop.table(table(hw6$sex.imp))
##
## m f
## 0.6454323 0.3545677
Age
table(hw6$age)
##
## adult child teen
## 753 520 1535
mcv.age <- factor(names(which.max(table(hw6$age))), levels = levels(hw6$age))
mcv.age
## [1] teen
## Levels: adult child teen
hw6$age.imp <- as.factor(ifelse(is.na(hw6$age) == T, mcv.age, hw6$age))
levels(hw6$age.imp) <- levels(hw6$age)
prop.table(table(hw6$age))
##
## adult child teen
## 0.2681624 0.1851852 0.5466524
prop.table(table(hw6$age.imp))
##
## adult child teen
## 0.2635632 0.1820091 0.5544277
logistic regression with imputed modes
options(survey.lonely.psu = "adjust")
des2 <- svydesign(ids = ~1, weights = ~wt, data = hw6 )
fit2 <- svyglm(move ~ educ.imp + sex.imp + age.imp + health.imp + race.imp, data = hw6, family = "binomial", design = des2)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
Multiple imputation
pairs
md.pairs(hw6[ , c("educ", "health", "age", "race", "sex")])
## $rr
## educ health age race sex
## educ 2842 2783 2793 2771 2793
## health 2783 2798 2798 2732 2798
## age 2793 2798 2808 2741 2808
## race 2771 2732 2741 2786 2741
## sex 2793 2798 2808 2741 2808
##
## $rm
## educ health age race sex
## educ 0 59 49 71 49
## health 15 0 0 66 0
## age 15 10 0 67 0
## race 15 54 45 0 45
## sex 15 10 0 67 0
##
## $mr
## educ health age race sex
## educ 0 15 15 15 15
## health 59 0 10 54 10
## age 49 0 0 45 0
## race 71 66 67 0 67
## sex 49 0 0 45 0
##
## $mm
## educ health age race sex
## educ 15 0 0 0 0
## health 0 59 49 5 49
## age 0 49 49 4 49
## race 0 5 4 71 4
## sex 0 49 49 4 49
from this we can see the pairwise missingness among the variables
Multiple Imputation
imp.mul <- mice(data = hw6[ , c("educ", "health", "age", "race", "sex")], seed = 30, m = 20)
##
## iter imp variable
## 1 1 educ health age race sex
## 1 2 educ health age race sex
## 1 3 educ health age race sex
## 1 4 educ health age race sex
## 1 5 educ health age race sex
## 1 6 educ health age race sex
## 1 7 educ health age race sex
## 1 8 educ health age race sex
## 1 9 educ health age race sex
## 1 10 educ health age race sex
## 1 11 educ health age race sex
## 1 12 educ health age race sex
## 1 13 educ health age race sex
## 1 14 educ health age race sex
## 1 15 educ health age race sex
## 1 16 educ health age race sex
## 1 17 educ health age race sex
## 1 18 educ health age race sex
## 1 19 educ health age race sex
## 1 20 educ health age race sex
## 2 1 educ health age race sex
## 2 2 educ health age race sex
## 2 3 educ health age race sex
## 2 4 educ health age race sex
## 2 5 educ health age race sex
## 2 6 educ health age race sex
## 2 7 educ health age race sex
## 2 8 educ health age race sex
## 2 9 educ health age race sex
## 2 10 educ health age race sex
## 2 11 educ health age race sex
## 2 12 educ health age race sex
## 2 13 educ health age race sex
## 2 14 educ health age race sex
## 2 15 educ health age race sex
## 2 16 educ health age race sex
## 2 17 educ health age race sex
## 2 18 educ health age race sex
## 2 19 educ health age race sex
## 2 20 educ health age race sex
## 3 1 educ health age race sex
## 3 2 educ health age race sex
## 3 3 educ health age race sex
## 3 4 educ health age race sex
## 3 5 educ health age race sex
## 3 6 educ health age race sex
## 3 7 educ health age race sex
## 3 8 educ health age race sex
## 3 9 educ health age race sex
## 3 10 educ health age race sex
## 3 11 educ health age race sex
## 3 12 educ health age race sex
## 3 13 educ health age race sex
## 3 14 educ health age race sex
## 3 15 educ health age race sex
## 3 16 educ health age race sex
## 3 17 educ health age race sex
## 3 18 educ health age race sex
## 3 19 educ health age race sex
## 3 20 educ health age race sex
## 4 1 educ health age race sex
## 4 2 educ health age race sex
## 4 3 educ health age race sex
## 4 4 educ health age race sex
## 4 5 educ health age race sex
## 4 6 educ health age race sex
## 4 7 educ health age race sex
## 4 8 educ health age race sex
## 4 9 educ health age race sex
## 4 10 educ health age race sex
## 4 11 educ health age race sex
## 4 12 educ health age race sex
## 4 13 educ health age race sex
## 4 14 educ health age race sex
## 4 15 educ health age race sex
## 4 16 educ health age race sex
## 4 17 educ health age race sex
## 4 18 educ health age race sex
## 4 19 educ health age race sex
## 4 20 educ health age race sex
## 5 1 educ health age race sex
## 5 2 educ health age race sex
## 5 3 educ health age race sex
## 5 4 educ health age race sex
## 5 5 educ health age race sex
## 5 6 educ health age race sex
## 5 7 educ health age race sex
## 5 8 educ health age race sex
## 5 9 educ health age race sex
## 5 10 educ health age race sex
## 5 11 educ health age race sex
## 5 12 educ health age race sex
## 5 13 educ health age race sex
## 5 14 educ health age race sex
## 5 15 educ health age race sex
## 5 16 educ health age race sex
## 5 17 educ health age race sex
## 5 18 educ health age race sex
## 5 19 educ health age race sex
## 5 20 educ health age race sex
print(imp.mul)
## Class: mids
## Number of multiple imputations: 20
## Imputation methods:
## educ health age race sex
## "polyreg" "polyreg" "polyreg" "polyreg" "logreg"
## PredictorMatrix:
## educ health age race sex
## educ 0 1 1 1 1
## health 1 0 1 1 1
## age 1 1 0 1 1
## race 1 1 1 0 1
## sex 1 1 1 1 0
head(imp.mul$imp$educ)
## 1 2 3 4 5 6 7 8
## 227 hs lths lths lths lths lths lths hs
## 355 college lths lths hs college lths lths college
## 368 lths lths college lths hs lths hs lths
## 706 college college lths hs lths college lths hs
## 884 lths college college college lths hs hs lths
## 894 college lths lths college hs lths college lths
## 9 10 11 12 13 14 15 16 17
## 227 lths lths lths lths lths lths lths hs lths
## 355 hs hs college college college college hs lths hs
## 368 lths hs hs lths lths hs hs college hs
## 706 hs hs lths college lths college college lths lths
## 884 college college college hs lths lths hs lths hs
## 894 lths lths lths college college lths lths lths lths
## 18 19 20
## 227 college lths college
## 355 lths college college
## 368 lths college lths
## 706 lths college lths
## 884 college college college
## 894 lths college lths
summary(imp.mul$imp$educ)
## 1 2 3 4 5 6
## hs :4 hs :2 hs :4 hs :5 hs :4 hs :4
## college:7 college:4 college:5 college:5 college:2 college:5
## lths :4 lths :9 lths :6 lths :5 lths :9 lths :6
## 7 8 9 10 11 12
## hs :5 hs :4 hs :4 hs :7 hs :1 hs :5
## college:2 college:5 college:5 college:4 college:6 college:3
## lths :8 lths :6 lths :6 lths :4 lths :8 lths :7
## 13 14 15 16 17 18
## hs :3 hs :5 hs :6 hs :3 hs :7 hs :4
## college:5 college:4 college:5 college:5 college:0 college:5
## lths :7 lths :6 lths :4 lths :7 lths :8 lths :6
## 19 20
## hs :2 hs :3
## college:8 college:6
## lths :5 lths :6
dat.imp <- complete(imp.mul, action = 12)
dat.imp$wt <- hw6$wt
head(dat.imp)
## educ health age race sex wt
## 1 lths good child white f 13827
## 2 hs poor adult white m 9605
## 3 hs good teen white m 20668
## 4 lths good teen white m 12246
## 5 lths fair adult white f 7406
## 6 college fair adult white m 6361
options(survey.lonely.psu = "adjust")
des3 <- svydesign(ids = ~1, weights = ~wt, data = dat.imp )
fit3 <- svyglm(move ~ educ + sex + age + health + race, data = dat.imp, family = "binomial", design = des1)
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
First, missing values removed
summary(fit1)
##
## Call:
## svyglm(formula = move ~ educ + sex + age + health + race, design = des1,
## family = "binomial", data = hw6.rem)
##
## Survey design:
## svydesign(ids = ~1, weights = ~wt, data = hw6)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.21727 0.18049 -1.204 0.228780
## educcollege -0.09882 0.16275 -0.607 0.543789
## educlths -0.50466 0.15138 -3.334 0.000868 ***
## sexf 0.90810 0.13664 6.646 3.63e-11 ***
## agechild -0.83458 0.19361 -4.311 1.69e-05 ***
## ageteen -0.94810 0.13307 -7.125 1.33e-12 ***
## healthgood 0.01992 0.13228 0.151 0.880314
## healthpoor 0.25203 0.19210 1.312 0.189625
## raceblack 0.21024 0.14695 1.431 0.152639
## raceother 0.18309 0.21050 0.870 0.384485
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.013313)
##
## Number of Fisher Scoring iterations: 4
With modal imputation
summary(fit2)
##
## Call:
## svyglm(formula = move ~ educ.imp + sex.imp + age.imp + health.imp +
## race.imp, design = des2, family = "binomial", data = hw6)
##
## Survey design:
## svydesign(ids = ~1, weights = ~wt, data = hw6)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.25860 0.17740 -1.458 0.145023
## educ.impcollege -0.07136 0.15883 -0.449 0.653248
## educ.implths -0.51576 0.14662 -3.518 0.000442 ***
## sex.impf 0.84897 0.13337 6.366 2.26e-10 ***
## age.impchild -0.81310 0.19081 -4.261 2.10e-05 ***
## age.impteen -0.88277 0.12983 -6.799 1.28e-11 ***
## health.impgood 0.05948 0.12994 0.458 0.647183
## health.imppoor 0.25883 0.18936 1.367 0.171776
## race.impblack 0.27857 0.14440 1.929 0.053802 .
## race.impother 0.20130 0.20574 0.978 0.327950
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.013803)
##
## Number of Fisher Scoring iterations: 4
An with the multiple imputations
summary(fit3)
##
## Call:
## svyglm(formula = move ~ educ + sex + age + health + race, design = des1,
## family = "binomial", data = dat.imp)
##
## Survey design:
## svydesign(ids = ~1, weights = ~wt, data = hw6)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.21727 0.18049 -1.204 0.228780
## educcollege -0.09882 0.16275 -0.607 0.543789
## educlths -0.50466 0.15138 -3.334 0.000868 ***
## sexf 0.90810 0.13664 6.646 3.63e-11 ***
## agechild -0.83458 0.19361 -4.311 1.69e-05 ***
## ageteen -0.94810 0.13307 -7.125 1.33e-12 ***
## healthgood 0.01992 0.13228 0.151 0.880314
## healthpoor 0.25203 0.19210 1.312 0.189625
## raceblack 0.21024 0.14695 1.431 0.152639
## raceother 0.18309 0.21050 0.870 0.384485
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1.013313)
##
## Number of Fisher Scoring iterations: 4
OK, we can see with the difference in the first two models, removing all the NA’s vice using modal imputation to replace the NA’s. The coefficients and standard errors change, but all four variables that are significant in model one remain significant in model two. The only real difference is that race = back’s p-value moves from 0.153 to 0.053. I can’t figure out why my model three is exactly the same as my model one. I know I had issues getting it to take on the design model weight for three, but I don’t think that should have changed my multiple imputation to be exactly the same as removing NA’s. A bit of a quandry here