The Panel Study of Income Dynamics (PSID) is the longest running longitudinal household survey in the world. The study began in 1968 with a nationally representative sample of over 18,000 individuals living in 5,000 families in the United States. Information on these individuals and their descendants has been collected continuously, including data covering employment, income, wealth, expenditures, health, marriage, childbearing, child development, philanthropy, education, and numerous other topics.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(broom)
# Load the dataset
psid <- read_xlsx("C:/Dropbox/DRPH/P9489 Application of Epi Research Methods II/PSID/PSID_2.17.2020.xlsx")
oldnames = names(psid)
newnames = c(paste(oldnames[1:3],"1968",sep="_"), paste(oldnames[4:13],"2013",sep="_"), paste(oldnames[14:23],"2015",sep="_"), paste(oldnames[24:33],"2017",sep="_"))
psidn <- psid %>%
# Rename variables so you know which variables are in which years
rename_at(vars(oldnames), ~ newnames) %>%
# Create a new dataframe that includes only respondents who completed the survey in 2015
filter(ER34301_2015>0) %>%
# Recode the 2015 health status of the health of the household (HH) to a Likert scale
# recode don't know or refused as missing)
# 1=Poor, 2=Fair, 3=Good, 4=Very good, 5=Excellent
mutate(ER62366_2015 = dplyr::recode(ER62366_2015, "1"=5,"2"=4,"3"=3,"4"=2,"5"=1,"8"=NA_real_,"9"=NA_real_)) %>%
# Create a new variable indicating whether the HH reported "Poor" or "Fair" health in 2015
# ER62366_PH_2015: (Poor Health) 1=Poor and Fair, 0=Good Very good, and Excellent
mutate(ER62366_PH_2015 = dplyr::recode(ER62366_2015, "1"=1,"2"=1,"3"=0,"4"=0,"5"=0))
# check that mappings are correct
addmargins(table(psidn$ER62366_2015,psidn$ER62366_PH_2015, exclude = NULL))
##
## 0 1 <NA> Sum
## 1 0 1055 0 1055
## 2 0 3236 0 3236
## 3 8125 0 0 8125
## 4 9370 0 0 9370
## 5 4518 0 0 4518
## <NA> 0 0 42 42
## Sum 22013 4291 42 26346
# Does the age of the HH predict "Poor" or "Fair" health in 2015?
model1 <- glm(ER62366_PH_2015 ~ ER60017_2015, family=binomial, data=psidn)
summary(model1)
##
## Call:
## glm(formula = ER62366_PH_2015 ~ ER60017_2015, family = binomial,
## data = psidn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0895 -0.6233 -0.5280 -0.4696 2.2295
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.896913 0.054128 -53.52 <2e-16 ***
## ER60017_2015 0.027696 0.001081 25.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23401 on 26303 degrees of freedom
## Residual deviance: 22751 on 26302 degrees of freedom
## (42 observations deleted due to missingness)
## AIC: 22755
##
## Number of Fisher Scoring iterations: 4
tidy(model1, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0552 0.0541 -53.5 0. 0.0496 0.0613
## 2 ER60017_2015 1.03 0.00108 25.6 1.11e-144 1.03 1.03
# Yes: (RR=1.0280, 95% CI=1.0259-1.0302; P-value<0.0001)
# The risk of Poor and Fair health is 2.8% (95% CI 2.59-3.02) higher for every year of age
# Is sex an effect measure modifier of this relationship?
model2 <- glm(ER62366_PH_2015 ~ ER60017_2015*ER60018_2015, family=binomial, data=psidn)
summary(model2)
##
## Call:
## glm(formula = ER62366_PH_2015 ~ ER60017_2015 * ER60018_2015,
## family = binomial, data = psidn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1463 -0.6468 -0.5012 -0.4059 2.3899
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.782199 0.167265 -28.59 < 2e-16 ***
## ER60017_2015 0.047315 0.003310 14.29 < 2e-16 ***
## ER60018_2015 1.336761 0.108500 12.32 < 2e-16 ***
## ER60017_2015:ER60018_2015 -0.013168 0.002191 -6.01 1.86e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23401 on 26303 degrees of freedom
## Residual deviance: 22313 on 26300 degrees of freedom
## (42 observations deleted due to missingness)
## AIC: 22321
##
## Number of Fisher Scoring iterations: 4
tidy(model2, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 4 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.00838 0.167 -28.6 8.81e-180 0.00603 0.0116
## 2 ER60017_2015 1.05 0.00331 14.3 2.41e- 46 1.04 1.06
## 3 ER60018_2015 3.81 0.108 12.3 7.04e- 35 3.08 4.71
## 4 ER60017_2015:ER6001~ 0.987 0.00219 -6.01 1.86e- 9 0.983 0.991
# Check why tibble does not print more than 2 decimal points when knit
# Alternative consideration - male and females have statistically different estimates (check this)
# males
psidn_m <- psidn[psidn$ER60018_2015==1,]
# females
psidn_f <- psidn[psidn$ER60018_2015==2,]
#recall - crude model OR: 1.028
tidy(model1, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0552 0.0541 -53.5 0. 0.0496 0.0613
## 2 ER60017_2015 1.03 0.00108 25.6 1.11e-144 1.03 1.03
# male strata 1.035
model1_m <- glm(ER62366_PH_2015 ~ ER60017_2015, family=binomial, data=psidn_m)
tidy(model1_m, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.0319 0.0735 -46.9 0. 0.0276 0.0368
## 2 ER60017_2015 1.03 0.00143 23.8 1.50e-125 1.03 1.04
# female strata 1.021
model1_f <- glm(ER62366_PH_2015 ~ ER60017_2015, family=binomial, data=psidn_f)
tidy(model1_f, conf.int = TRUE, exponentiate = TRUE)
## # A tibble: 2 x 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.121 0.0798 -26.4 8.17e-154 0.104 0.142
## 2 ER60017_2015 1.02 0.00166 12.7 1.05e- 36 1.02 1.02
# This part is for fun
ggplot(data = psidn) +
geom_smooth(mapping = aes(x = ER60017_2015, y = ER62366_PH_2015, group = ER60018_2015, color = as.factor(ER60018_2015)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
ggplot(psidn, aes(ER60017_2015, ER62366_PH_2015, group = ER60018_2015, color = as.factor(ER60018_2015))) + geom_smooth(method="glm", se=TRUE, na.rm= TRUE)