Data Wrangling with the PSID

The Panel Study of Income Dynamics (PSID) is the longest running longitudinal household survey in the world. The study began in 1968 with a nationally representative sample of over 18,000 individuals living in 5,000 families in the United States. Information on these individuals and their descendants has been collected continuously, including data covering employment, income, wealth, expenditures, health, marriage, childbearing, child development, philanthropy, education, and numerous other topics.

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readxl)
library(broom)

# Load the dataset
psid <- read_xlsx("C:/Dropbox/DRPH/P9489 Application of Epi Research Methods II/PSID/PSID_2.17.2020.xlsx")

oldnames = names(psid)
newnames = c(paste(oldnames[1:3],"1968",sep="_"), paste(oldnames[4:13],"2013",sep="_"), paste(oldnames[14:23],"2015",sep="_"), paste(oldnames[24:33],"2017",sep="_"))

psidn <- psid %>%
  # Rename variables so you know which variables are in which years
  rename_at(vars(oldnames), ~ newnames) %>%
  
  # Create a new dataframe that includes only respondents who completed the survey in 2015
  filter(ER34301_2015>0) %>%
  
  # Recode the 2015 health status of the health of the household (HH) to a Likert scale 
  # recode don't know or refused as missing)
  # 1=Poor, 2=Fair, 3=Good, 4=Very good, 5=Excellent
  mutate(ER62366_2015 = dplyr::recode(ER62366_2015, "1"=5,"2"=4,"3"=3,"4"=2,"5"=1,"8"=NA_real_,"9"=NA_real_)) %>%
  
  # Create a new variable indicating whether the HH reported "Poor" or "Fair" health in 2015
  # ER62366_PH_2015: (Poor Health) 1=Poor and Fair, 0=Good Very good, and Excellent
  mutate(ER62366_PH_2015 = dplyr::recode(ER62366_2015, "1"=1,"2"=1,"3"=0,"4"=0,"5"=0))

# check that mappings are correct
addmargins(table(psidn$ER62366_2015,psidn$ER62366_PH_2015, exclude = NULL))

##       
##            0     1  <NA>   Sum
##   1        0  1055     0  1055
##   2        0  3236     0  3236
##   3     8125     0     0  8125
##   4     9370     0     0  9370
##   5     4518     0     0  4518
##   <NA>     0     0    42    42
##   Sum  22013  4291    42 26346

# Does the age of the HH predict "Poor" or "Fair" health in 2015?
model1 <- glm(ER62366_PH_2015 ~ ER60017_2015, family=binomial, data=psidn)
summary(model1)

## 
## Call:
## glm(formula = ER62366_PH_2015 ~ ER60017_2015, family = binomial, 
##     data = psidn)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0895  -0.6233  -0.5280  -0.4696   2.2295  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.896913   0.054128  -53.52   <2e-16 ***
## ER60017_2015  0.027696   0.001081   25.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23401  on 26303  degrees of freedom
## Residual deviance: 22751  on 26302  degrees of freedom
##   (42 observations deleted due to missingness)
## AIC: 22755
## 
## Number of Fisher Scoring iterations: 4

tidy(model1, conf.int = TRUE, exponentiate = TRUE)

## # A tibble: 2 x 7
##   term         estimate std.error statistic   p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)    0.0552   0.0541      -53.5 0.          0.0496    0.0613
## 2 ER60017_2015   1.03     0.00108      25.6 1.11e-144   1.03      1.03

# Yes: (RR=1.0280, 95% CI=1.0259-1.0302; P-value<0.0001)
# The risk of Poor and Fair health is 2.8% (95% CI 2.59-3.02) higher for every year of age

# Is sex an effect measure modifier of this relationship?
model2 <- glm(ER62366_PH_2015 ~ ER60017_2015*ER60018_2015, family=binomial, data=psidn)
summary(model2)

## 
## Call:
## glm(formula = ER62366_PH_2015 ~ ER60017_2015 * ER60018_2015, 
##     family = binomial, data = psidn)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1463  -0.6468  -0.5012  -0.4059   2.3899  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -4.782199   0.167265  -28.59  < 2e-16 ***
## ER60017_2015               0.047315   0.003310   14.29  < 2e-16 ***
## ER60018_2015               1.336761   0.108500   12.32  < 2e-16 ***
## ER60017_2015:ER60018_2015 -0.013168   0.002191   -6.01 1.86e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 23401  on 26303  degrees of freedom
## Residual deviance: 22313  on 26300  degrees of freedom
##   (42 observations deleted due to missingness)
## AIC: 22321
## 
## Number of Fisher Scoring iterations: 4

tidy(model2, conf.int = TRUE, exponentiate = TRUE)

## # A tibble: 4 x 7
##   term                 estimate std.error statistic   p.value conf.low conf.high
##   <chr>                   <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)           0.00838   0.167      -28.6  8.81e-180  0.00603    0.0116
## 2 ER60017_2015          1.05      0.00331     14.3  2.41e- 46  1.04       1.06  
## 3 ER60018_2015          3.81      0.108       12.3  7.04e- 35  3.08       4.71  
## 4 ER60017_2015:ER6001~  0.987     0.00219     -6.01 1.86e-  9  0.983      0.991

# Check why tibble does not print more than 2 decimal points when knit

# Alternative consideration - male and females have statistically different estimates (check this)

# males
psidn_m <- psidn[psidn$ER60018_2015==1,]

# females
psidn_f <- psidn[psidn$ER60018_2015==2,]

#recall - crude model OR: 1.028
tidy(model1, conf.int = TRUE, exponentiate = TRUE)

## # A tibble: 2 x 7
##   term         estimate std.error statistic   p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)    0.0552   0.0541      -53.5 0.          0.0496    0.0613
## 2 ER60017_2015   1.03     0.00108      25.6 1.11e-144   1.03      1.03

# male strata 1.035
model1_m <- glm(ER62366_PH_2015 ~ ER60017_2015, family=binomial, data=psidn_m)
tidy(model1_m, conf.int = TRUE, exponentiate = TRUE)

## # A tibble: 2 x 7
##   term         estimate std.error statistic   p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)    0.0319   0.0735      -46.9 0.          0.0276    0.0368
## 2 ER60017_2015   1.03     0.00143      23.8 1.50e-125   1.03      1.04

# female strata 1.021
model1_f <- glm(ER62366_PH_2015 ~ ER60017_2015, family=binomial, data=psidn_f)
tidy(model1_f, conf.int = TRUE, exponentiate = TRUE)

## # A tibble: 2 x 7
##   term         estimate std.error statistic   p.value conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
## 1 (Intercept)     0.121   0.0798      -26.4 8.17e-154    0.104     0.142
## 2 ER60017_2015    1.02    0.00166      12.7 1.05e- 36    1.02      1.02

# This part is for fun
ggplot(data = psidn) +
  geom_smooth(mapping = aes(x = ER60017_2015, y = ER62366_PH_2015, group = ER60018_2015, color = as.factor(ER60018_2015)))

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 42 rows containing non-finite values (stat_smooth).

ggplot(psidn, aes(ER60017_2015, ER62366_PH_2015, group = ER60018_2015, color = as.factor(ER60018_2015))) + geom_smooth(method="glm", se=TRUE, na.rm= TRUE)

Data Wrangling with the PSID

Applications of Epidemiologic Methods II