#hw4 <- haven::read_sav("~/Google Drive/stat 2/Fam2017er/FAM2017ER.sas")
#hw4 <- read_csv("PSID hw4.csv")
hw4 <- read_excel("~/Google Drive/stat 2/hw4/hw4.xlsx")
hw4 <- hw4 %>%
filter(ER66017 < 121,
ER68513 < 53,
ER68555 < 6,
ER71538 <20) %>%
transmute(
agec = as.factor(ifelse(ER66017 <= 39, "< 40",
ifelse(ER66017 <= 65 & ER66017 >= 40, "40-65", "65+"))),
sex = as.factor(ifelse(ER66018 == 1, "m", "f")),
hosp = ER68513,
smoke = as.factor(ifelse(ER68555 == 1, "y","n")),
educ = as.factor(ifelse(ER71538 <= 11, "1lths",
ifelse(ER71538 == 12, "2hs",
ifelse(ER71538 >= 16, "4coll deg+", "3some coll")))),
wt = ER71570)
#table(hw4$sex)
#table(hw4$hosp)
#table(hw4$educ)
#table(hw4$agec)
hw4 <- hw4 %>%
mutate(
agec = relevel(agec, ref = "< 40"),
sex = relevel(sex, ref = "m"),
smoke = relevel(smoke, ref = "n"),
educ = relevel(educ, ref = "2hs")
)
des <- svydesign(ids = ~1,
weights = ~wt,
data = hw4)
First, for this dataset I’m using the 2017 PSID main family data examining the heads of households. The count data I’ll be using will be the number of weeks an individual stayed in a hospital in the previous year.
How many weeks in the past year did you need hospitalization?
the four predictors are:
agec: categorical age, < 40, 40-65 & >65, with the reference category <40
sex: categorical sex, m and f, reference category is m
smoke: y and n, reference category is n
educ: categorical education, 1lths, 2hs, 3some coll and 4 coll+, reference is 2hs
An offset term is not needed as the outcomes all occur during the same period.
First some descriptive statistics:
svyhist(~hw4$hosp, des)
table(hw4$hosp)
##
## 0 1 2 3 6 7 8 17 32 39
## 9395 14 16 6 5 1 1 1 1 1
summary(hw4$hosp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.02087 0.00000 39.00000
svyby(~hosp, ~smoke + agec, des, svymean, na.rm= T)
## smoke agec hosp se
## n.< 40 n < 40 0.003557137 0.002971226
## y.< 40 y < 40 0.040879382 0.024396822
## n.40-65 n 40-65 0.008127730 0.003018409
## y.40-65 y 40-65 0.016664808 0.008894069
## n.65+ n 65+ 0.142361673 0.074862571
## y.65+ y 65+ 0.000000000 0.000000000
svyby(~hosp, ~sex + educ, des, svymean, na.rm= T)
## sex educ hosp se
## m.2hs m 2hs 0.02112658 0.008968846
## f.2hs f 2hs 0.01025021 0.007499985
## m.1lths m 1lths 0.02513625 0.010647687
## f.1lths f 1lths 0.02444977 0.019819472
## m.3some coll m 3some coll 0.10397864 0.091194654
## f.3some coll f 3some coll 0.08289289 0.059187540
## m.4coll deg+ m 4coll deg+ 0.04259638 0.041787051
## f.4coll deg+ f 4coll deg+ 0.00000000 0.000000000
and now a poisson model:
fit1 <- svyglm(hosp ~ smoke + educ + sex + agec, design = des, family = poisson)
summary(fit1)
##
## Call:
## svyglm(formula = hosp ~ smoke + educ + sex + agec, design = des,
## family = poisson)
##
## Survey design:
## svydesign(ids = ~1, weights = ~wt, data = hw4)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.2285 0.8563 -6.106 1.07e-09 ***
## smokey -0.2343 0.6004 -0.390 0.696391
## educ1lths 0.2411 0.5557 0.434 0.664430
## educ3some coll 1.7912 0.7507 2.386 0.017051 *
## educ4coll deg+ 0.4572 1.0184 0.449 0.653481
## sexf -0.8522 0.7976 -1.069 0.285317
## agec40-65 -0.0721 0.5848 -0.123 0.901878
## agec65+ 2.6314 0.7359 3.576 0.000351 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 11.4438)
##
## Number of Fisher Scoring iterations: 7
round(exp(summary(fit1)$coef[-1,1]),3)
## smokey educ1lths educ3some coll educ4coll deg+ sexf
## 0.791 1.273 5.996 1.580 0.426
## agec40-65 agec65+
## 0.930 13.893
fit2 <- glm(hosp ~ smoke + educ + sex + agec, data = hw4,family = poisson)
summary(fit2)
##
## Call:
## glm(formula = hosp ~ smoke + educ + sex + agec, family = poisson,
## data = hw4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6146 -0.1748 -0.1424 -0.1171 18.8405
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.67954 0.21063 -22.217 < 2e-16 ***
## smokey 0.21675 0.19599 1.106 0.26875
## educ1lths 0.10807 0.22061 0.490 0.62423
## educ3some coll 0.58058 0.18107 3.206 0.00134 **
## educ4coll deg+ -0.51950 0.22660 -2.293 0.02187 *
## sexf -0.12827 0.15202 -0.844 0.39880
## agec40-65 -0.08287 0.22176 -0.374 0.70863
## agec65+ 2.21557 0.18066 12.264 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 2380.5 on 9440 degrees of freedom
## Residual deviance: 2134.2 on 9433 degrees of freedom
## AIC: 2279.5
##
## Number of Fisher Scoring iterations: 7
we can see from the summary of fit2, that the ratio of the residual deviance and the residual degrees of freedom is not equal to one by a factor of approximately 2 to 9 (1:4.5). So we may conclude that overdispersion is present within the data (that is the variance is larger than the mean, which is not ideal for a single parameter distribution like the poisson.
The poisson model (survey design) appears to be a decent fit however the presence of overdispersion would would lead us to want to try another model. Perhaps the negative binomial might be a good fit?
coeftest(fit2, vcov. = vcovHC(fit2, type = "HC1"))
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.679541 0.578884 -8.0837 6.281e-16 ***
## smokey 0.216750 0.380514 0.5696 0.568932
## educ1lths 0.108068 0.423802 0.2550 0.798726
## educ3some coll 0.580583 0.661482 0.8777 0.380106
## educ4coll deg+ -0.519503 0.982573 -0.5287 0.597002
## sexf -0.128274 0.533922 -0.2402 0.810138
## agec40-65 -0.082871 0.430318 -0.1926 0.847288
## agec65+ 2.215571 0.614703 3.6043 0.000313 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit.nb2 <- glm.nb(hosp ~ smoke + agec + educ + sex,
data = hw4,
weights = wt / mean(wt, na.rm = T))
coeftest(fit.nb2, vcov. = vcovHC(fit.nb2, type = "HC1"))
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.23242 0.72134 -5.8674 4.425e-09 ***
## smokey 0.66782 0.48218 1.3850 0.16606
## agec40-65 0.11032 0.59037 0.1869 0.85176
## agec65+ 3.49805 0.80469 4.3471 1.379e-05 ***
## educ1lths -0.37786 0.61827 -0.6112 0.54110
## educ3some coll -0.43213 0.57346 -0.7536 0.45112
## educ4coll deg+ -1.92159 0.77641 -2.4750 0.01332 *
## sexf -0.92102 0.56543 -1.6289 0.10334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(fit.nb2)
##
## Call:
## glm.nb(formula = hosp ~ smoke + agec + educ + sex, data = hw4,
## weights = wt/mean(wt, na.rm = T), init.theta = 0.003299441382,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.45002 -0.11263 -0.06816 -0.03595 3.00619
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.2324 0.5783 -7.319 2.51e-13 ***
## smokey 0.6678 0.5683 1.175 0.239952
## agec40-65 0.1103 0.5539 0.199 0.842136
## agec65+ 3.4980 0.5804 6.027 1.67e-09 ***
## educ1lths -0.3779 0.6623 -0.571 0.568338
## educ3some coll -0.4321 0.5531 -0.781 0.434619
## educ4coll deg+ -1.9216 0.5778 -3.326 0.000882 ***
## sexf -0.9210 0.4765 -1.933 0.053262 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.0033) family taken to be 1)
##
## Null deviance: 221.48 on 9009 degrees of freedom
## Residual deviance: 163.61 on 9002 degrees of freedom
## AIC: 983.58
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.003299
## Std. Err.: 0.000544
##
## 2 x log-likelihood: -965.582000
The AIC for the poisson (non-survey design) is 2279.5 and the AIC for the negative binomial is 983.58. So in this case it appears that the negative binomial fits the data better (I will admit that I’m not sure how to get the AIC from the results of the survey design poisson, that may have influenced the results.)
Overall, with the survey design poisson model it appears that only age, being over the age of 65 and having some college but not a 4 year degree, were significant predictors of the number of weeks of being hospitalized in a given year, while the non-survey design added having at least a 4 year college degree to the list of significant predictors.
The negative binomial, the model that is the better fit, shows that only some college and being over the age of 65 were significant predictors.