Setup and Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(broom)
library(ggeffects)
library(ggstats)
library(gtsummary)
library(GGally)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(lmtest)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(corrplot)

## corrplot 0.95 loaded

there are 2 sets of data that I’ll be working with. Addhealth dataset wave4 and wave5.

wave4 <- read_sas("C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/Project/Datasets/Wave 4 dataset/doi-10.15139-s3-11920/w4inhome.sas7bdat")

wave5 <- read_xpt("C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/Project/Datasets/Wave 5 dataset/doi-10.15139-s3-zyrz5j/pwave5.xpt")

creating sub dataset, wave 4

wave4_sub <- wave4 %>%
  select(
    AID,
    BIO_SEX4,
    H4IR4,
    H4ED2,
    H4EC1,
    H4SE6,
    H4TO3,
    H4TO35,
    H4DA4,
    H4DA5,
    H4DA6,
    H4DA7,
    H4DA8,
    H4SP5,
    H4SP6,
    H4EC5,
    H4LM11,
    H4ID5H
  )

glimpse(wave4_sub)

## Rows: 5,114
## Columns: 18
## $ AID      <chr> "57101310", "57103869", "57109625", "57111071", "57113943", "…
## $ BIO_SEX4 <dbl> 2, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2…
## $ H4IR4    <dbl> 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2, 1, 1, 1, 1, 2, 2…
## $ H4ED2    <dbl> 3, 2, 2, 6, 6, 1, 6, 12, 7, 6, 3, 9, 3, 6, 4, 6, 5, 5, 6, 2, …
## $ H4EC1    <dbl> 9, 1, 5, 6, 9, 9, 9, 11, 12, 7, 8, 10, 6, 9, 6, 9, 3, 5, 98, …
## $ H4SE6    <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1…
## $ H4TO3    <dbl> 1, 0, 0, 0, 7, 1, 0, 7, 7, 0, 0, 0, 7, 7, 7, 0, 1, 1, 0, 1, 7…
## $ H4TO35   <dbl> 2, 97, 5, 4, 1, 98, 2, 2, 4, 1, 2, 3, 97, 2, 4, 4, 97, 2, 0, …
## $ H4DA4    <dbl> 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ H4DA5    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 5, 0, 0, 0, 0, 0…
## $ H4DA6    <dbl> 0, 0, 2, 0, 3, 0, 0, 0, 4, 0, 3, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ H4DA7    <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ H4DA8    <dbl> 2, 2, 5, 0, 0, 0, 0, 0, 0, 2, 4, 0, 4, 1, 0, 0, 0, 2, 2, 0, 0…
## $ H4SP5    <dbl> 4, 0, 1, 0, 2, 0, 0, 0, 2, 0, 4, 2, 3, 2, 0, 0, 3, 2, 0, 4, 0…
## $ H4SP6    <dbl> 4, 0, 1, 0, 3, 0, 1, 0, 0, 1, 4, 4, 2, 4, 2, 0, 2, 2, 1, 4, 0…
## $ H4EC5    <dbl> 22000, 9999997, 9999997, 9999997, 9999997, 9999997, 9999997, …
## $ H4LM11   <dbl> 0, 0, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0…
## $ H4ID5H   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

creating sub dataset, wave 5

wave5_sub <- wave5 %>%
  select(
    AID,
H5OD2A,
H5OD4A,
H5OD4B,
H5OD4C,
H5OD4D,
H5OD4E,
H5OD4F,
H5OD4G,
H5OD11,
H5EC1,
H5SE1,
H5TO1,
H5TO12,
H5ID25,
H5ID26,
H5ID27,
H5ID28,
H5ID29,
H5ID16,
H5EC5A,
H5LM5,
H5ID6G
  )

glimpse(wave5_sub)

## Rows: 4,196
## Columns: 23
## $ AID    <chr> "57101310", "57111071", "57111786", "57113943", "57117997", "57…
## $ H5OD2A <dbl> 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, …
## $ H5OD4A <dbl> 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, …
## $ H5OD4B <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, …
## $ H5OD4C <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, …
## $ H5OD4D <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ H5OD4E <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ H5OD4F <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ H5OD4G <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ H5OD11 <dbl> 4, 8, 3, 6, 3, 9, 16, 12, 3, 3, 4, 2, 7, 5, 3, 2, 8, 11, 10, 7,…
## $ H5EC1  <dbl> 5, 8, 11, 9, 7, 5, 10, 9, 1, 4, 3, 10, 3, 5, 1, 2, 10, 10, 9, 8…
## $ H5SE1  <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ H5TO1  <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, …
## $ H5TO12 <dbl> 1, 4, 97, 2, 3, 0, 2, 3, 2, 0, 2, 3, 97, 2, 2, 0, 3, 2, 1, 4, 4…
## $ H5ID25 <dbl> 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, …
## $ H5ID26 <dbl> 0, 0, 0, 3, 3, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, …
## $ H5ID27 <dbl> 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, …
## $ H5ID28 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, …
## $ H5ID29 <dbl> 0, 0, 0, 3, 3, 1, 0, 0, 7, 7, 3, 0, 0, 0, 0, 4, 2, 3, 1, 2, 0, …
## $ H5ID16 <dbl> 4, 0, 2, 4, 0, 2, 0, 2, 3, 0, 0, 0, 4, 1, 4, 2, 2, 2, 2, 0, 0, …
## $ H5EC5A <dbl> 3, 6, 6, 1, 1, 1, 7, 6, 1, 2, 1, 6, 2, 7, 5, 1, 6, 7, 6, 6, 6, …
## $ H5LM5  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, …
## $ H5ID6G <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, …

Sex Sex was obtained from the biological sex variable in the dataset. In Wave IV, the variable BIO_SEX4 was used, and in Wave V the variable H5OD2A was used. Responses that indicated refusal, “don’t know,” or other non-substantive answers were recoded as missing values. The variable was kept as a binary variable representing male and female.

Race Race was constructed differently for the two waves because the datasets recorded race in different ways. In Wave IV, the interviewer’s observation of the respondent’s race was used (H4IR4). In Wave V, race was measured using several race indicator variables (H5OD4A–H5OD4G). These variables were combined to create one race variable. The final race categories were coded as White, Black or African American, American Indian or Alaska Native, and Asian or Pacific Islander. Any other race categories or unclear responses were treated as missing.

Education Education level was recategorized to create three broader groups of educational attainment. In Wave IV the variable H4ED2 was used, and in Wave V the variable H5OD11 was used. The original categories were grouped into three new categories: high school or less, some or completed college, and some or completed graduate education or higher. This grouping allowed the education variable to be comparable across both waves.

Household Income Household income was obtained from the income variable H4EC1 in Wave IV and H5EC1 in Wave V. The original income categories were grouped into two levels. Income values corresponding to $74,999 or less were coded as the lower income group, and income values representing $75,000 or more were coded as the higher income group. This recoding created a simplified measure of income for the analysis.

Sexual Activity Sexual activity status was measured using the variable H4SE6 in Wave IV and H5SE1 in Wave V. The responses were recoded into a binary variable indicating whether the respondent was currently sexually active or not. Any responses indicating refusal or “don’t know” were treated as missing values.

Smoking Status Smoking status was obtained from the smoking behavior variables H4TO3 for Wave IV and H5TO1 for Wave V. The variable was recoded into a binary smoking variable. Responses coded as 7 or 0 were treated as non-smokers, while all other valid smoking responses were classified as smokers. Non-substantive responses were recoded as missing.

Alcohol Consumption Alcohol use was measured using the variable H4TO35 in Wave IV and H5TO12 in Wave V. The variable was recoded into a binary indicator of alcohol consumption. Responses coded as 97 or 0 were considered as no alcohol consumption, while other valid responses indicating alcohol use were coded as alcohol consumption.

Physical Activity Physical activity was measured using five exercise-related variables in Wave IV (H4DA4, H4DA5, H4DA6, H4DA7, and H4DA8). These variables represented participation in different types of physical activities. The variables were combined into one physical activity variable. If a respondent reported participating in any of the activities, the variable was coded as physically active. If no activity was reported, it was coded as not physically active.The same method was applied for Wave V dataset, the variables for Wave V dataset were H5ID25-H5ID29.

Sleep Trouble Sleep disturbance was constructed by combining two variables related to sleep problems, H4SP5 and H4SP6, in Wave IV. These variables asked about difficulty falling asleep and difficulty staying asleep. The responses were combined to create one sleep trouble variable. A value of 0 indicated no sleep trouble, while values from 1 to 5 were recoded to indicate the presence of sleep trouble.

Financial Burden Financial burden was measured using variables related to mortgage. In Wave IV, the variables H4EC5 was used, while in Wave V the variable H5EC5A was used. These variables were recoded into three categories representing no debt, moderate debt, and high debt. This recoding allowed financial strain to be represented as an ordered categorical variable.

Employment Status Employment status was obtained from the variables H4LM11 in Wave IV and H5LM5 in Wave V. The responses were recoded into a binary variable indicating whether the respondent was employed or not employed. Responses coded as 7 or 0 were treated as not employed, while the remaining valid responses were classified as employed.

Depression Depression was used as the outcome variable in the study. In Wave IV, the variable H4ID5H was used, and in Wave V the variable H5ID6G was used. These variables asked whether a doctor or other health care provider had ever told the respondent that they had depression. The responses were treated as a binary variable, with 1 indicating the presence of depression and 0 indicating no depression.

# WAVE 4 SUBSET RECODING

# Recoding the variables as needed

wave4_sub <- wave4_sub %>%      
  mutate(across(
    c(BIO_SEX4, H4SE6, H4SP5, H4SP6, H4ID5H),      #missing data accumulation ALL GENERAL
    ~replace(., . %in% c(6,7,8,9,96,97,98,99), NA)
  )) %>%      

  mutate(across(
    c(H4ED2, H4EC1, H4EC5), #missing data accumulation EDUCATION, INCOME, FIN BURDEN
    ~replace(., . %in% c(96,97,98,99,9999996,9999998), NA)
  )) %>%      

mutate(across(
    c(H4DA4,H4DA5, H4DA6, H4DA7, H4DA8),     #missing data accumulation ALL EXCERCISE
    ~replace(., . %in% c(8,9,96,97,98,99), NA)
  )) %>%      

mutate(across(
    c(H4TO3, H4LM11),     #missing data accumulation SMOKING STAT, EMPLOYMENT
    ~replace(., . %in% c(6,8,9,96,97,98,99), NA)
  )) %>%      

mutate(across(
    c(H4TO35),     #missing data accumulation ALCOHOL STAT
    ~replace(., . %in% c(7,8,9,96,98,99), NA)
  ))

summary(wave4_sub)

##      AID               BIO_SEX4        H4IR4           H4ED2       
##  Length:5114        Min.   :1.00   Min.   :1.000   Min.   : 1.000  
##  Class :character   1st Qu.:1.00   1st Qu.:1.000   1st Qu.: 4.000  
##  Mode  :character   Median :2.00   Median :1.000   Median : 6.000  
##                     Mean   :1.54   Mean   :1.351   Mean   : 5.706  
##                     3rd Qu.:2.00   3rd Qu.:2.000   3rd Qu.: 7.000  
##                     Max.   :2.00   Max.   :4.000   Max.   :13.000  
##                                    NA's   :5       NA's   :1       
##      H4EC1         H4SE6            H4TO3           H4TO35     
##  Min.   : 1    Min.   :0.0000   Min.   :0.000   Min.   : 0.00  
##  1st Qu.: 7    1st Qu.:1.0000   1st Qu.:1.000   1st Qu.: 2.00  
##  Median : 9    Median :1.0000   Median :1.000   Median : 4.00  
##  Mean   : 8    Mean   :0.9446   Mean   :2.905   Mean   :21.75  
##  3rd Qu.:10    3rd Qu.:1.0000   3rd Qu.:7.000   3rd Qu.: 5.00  
##  Max.   :12    Max.   :1.0000   Max.   :7.000   Max.   :97.00  
##  NA's   :353   NA's   :24       NA's   :4       NA's   :10     
##      H4DA4            H4DA5            H4DA6            H4DA7       
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.2769   Mean   :0.7008   Mean   :0.8736   Mean   :0.3166  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :7.0000   Max.   :7.0000   Max.   :7.0000   Max.   :7.0000  
##  NA's   :3        NA's   :3        NA's   :3        NA's   :3       
##      H4DA8           H4SP5          H4SP6           H4EC5        
##  Min.   :0.000   Min.   :0.00   Min.   :0.000   Min.   :      0  
##  1st Qu.:0.000   1st Qu.:0.00   1st Qu.:0.000   1st Qu.: 130000  
##  Median :1.000   Median :1.00   Median :1.000   Median :9999997  
##  Mean   :1.975   Mean   :1.14   Mean   :1.314   Mean   :6142953  
##  3rd Qu.:3.000   3rd Qu.:2.00   3rd Qu.:2.000   3rd Qu.:9999997  
##  Max.   :7.000   Max.   :5.00   Max.   :5.000   Max.   :9999997  
##  NA's   :3       NA's   :1      NA's   :1       NA's   :145      
##      H4LM11          H4ID5H      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000  
##  Median :1.000   Median :0.0000  
##  Mean   :1.803   Mean   :0.1617  
##  3rd Qu.:1.000   3rd Qu.:0.0000  
##  Max.   :7.000   Max.   :1.0000  
##  NA's   :3       NA's   :1

# WAVE 5 SUBSET RECODING

# Recoding the variables as needed

wave5_sub <- wave5_sub %>%      
  mutate(across(
    c(H5OD2A, H5SE1, H5ID16, H5ID6G),      #missing data accumulation ALL GENERAL
    ~replace(., . %in% c(6,7,8,9,96,97,98,99), NA)
  )) %>%      

  mutate(across(
    c(H5OD11, H5EC1, H5EC5A), #missing data accumulation EDUCATION, INCOME, FIN BURDEN
    ~replace(., . %in% c(96,97,98,99,9999996,9999998), NA)
  )) %>%      

mutate(across(
    c(H5ID25,H5ID26,H5ID27,H5ID28,H5ID29),     #missing data accumulation ALL EXCERCISE
    ~replace(., . %in% c(8,9,96,97,98,99), NA)
  )) %>%      

mutate(across(
    c(H5TO1, H5LM5),     #missing data accumulation SMOKING STAT, EMPLOYMENT
    ~replace(., . %in% c(6,8,9,96,97,98,99), NA)
  )) %>%      

mutate(across(
    c(H5TO12),     #missing data accumulation ALCOHOL STAT
    ~replace(., . %in% c(7,8,9,96,98,99), NA)
  ))

summary(wave5_sub)

##      AID                H5OD2A          H5OD4A           H5OD4B      
##  Length:4196        Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  Class :character   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Mode  :character   Median :2.000   Median :1.0000   Median :0.0000  
##                     Mean   :1.571   Mean   :0.6797   Mean   :0.2175  
##                     3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##                     Max.   :2.000   Max.   :1.0000   Max.   :1.0000  
##                                     NA's   :12       NA's   :12      
##      H5OD4C            H5OD4D            H5OD4E             H5OD4F       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.000000   Median :0.00000  
##  Mean   :0.09704   Mean   :0.03728   Mean   :0.009082   Mean   :0.02892  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.000000   Max.   :1.00000  
##  NA's   :12        NA's   :12        NA's   :12         NA's   :12       
##      H5OD4G             H5OD11           H5EC1           H5SE1       
##  Min.   :0.000000   Min.   : 2.000   Min.   : 1.00   Min.   :0.0000  
##  1st Qu.:0.000000   1st Qu.: 6.000   1st Qu.: 5.00   1st Qu.:1.0000  
##  Median :0.000000   Median : 9.000   Median : 8.00   Median :1.0000  
##  Mean   :0.006214   Mean   : 8.156   Mean   : 7.17   Mean   :0.9565  
##  3rd Qu.:0.000000   3rd Qu.:10.000   3rd Qu.: 9.00   3rd Qu.:1.0000  
##  Max.   :1.000000   Max.   :16.000   Max.   :13.00   Max.   :1.0000  
##  NA's   :12         NA's   :5        NA's   :71      NA's   :33      
##      H5TO1            H5TO12           H5ID25           H5ID26      
##  Min.   :0.0000   Min.   : 0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 2.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median : 3.000   Median :0.0000   Median :0.0000  
##  Mean   :0.4294   Mean   : 9.616   Mean   :0.6541   Mean   :0.9596  
##  3rd Qu.:1.0000   3rd Qu.: 4.000   3rd Qu.:0.0000   3rd Qu.:2.0000  
##  Max.   :1.0000   Max.   :97.000   Max.   :7.0000   Max.   :7.0000  
##  NA's   :11                        NA's   :143      NA's   :138     
##      H5ID27           H5ID28           H5ID29         H5ID16     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00   1st Qu.:1.000  
##  Median :0.0000   Median :0.0000   Median :2.00   Median :2.000  
##  Mean   :0.6539   Mean   :0.2696   Mean   :2.34   Mean   :1.727  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:4.00   3rd Qu.:3.000  
##  Max.   :7.0000   Max.   :7.0000   Max.   :7.00   Max.   :4.000  
##  NA's   :125      NA's   :134      NA's   :130    NA's   :6      
##      H5EC5A          H5LM5           H5ID6G      
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :5.000   Median :1.000   Median :0.0000  
##  Mean   :3.927   Mean   :1.178   Mean   :0.2516  
##  3rd Qu.:6.000   3rd Qu.:1.000   3rd Qu.:1.0000  
##  Max.   :9.000   Max.   :3.000   Max.   :1.0000  
##  NA's   :64      NA's   :10      NA's   :14

# WAVE 4, SORTING CATEGORIES;
wave4_sub <- wave4_sub %>%
  mutate(
    H4TO3  = replace(H4TO3,  H4TO3  == 7, 0),   # smoking: 7 - 0
    H4TO35 = replace(H4TO35, H4TO35 == 97, 0),  # alcohol: 97 - 0
    H4LM11 = replace(H4LM11, H4LM11 == 7, 0),    # employment: 7 - 0
    H4EC5  = replace(H4EC5,  H4EC5  == 9999997, 0) # house mortgage 9999997 - 0
  )

# WAVE 5, SORTING CATEGORIES;
wave5_sub <- wave5_sub %>%
  mutate(
    H5TO1  = replace(H5TO1,  H5TO1  == 7, 0),   # smoking: 7 - 0
    H5TO12 = replace(H5TO12, H5TO12 == 97, 0),  # alcohol: 97 - 0
    H5LM5 = replace(H5LM5, H5LM5 == 7, 0)    # employment: 7 - 0
  )

 # RECODE AND ALCOHOL

table(wave4_sub$H4TO3, useNA = "ifany")

## 
##    0    1 <NA> 
## 2798 2312    4

table(wave5_sub$H5TO1, useNA = "ifany")

## 
##    0    1 <NA> 
## 2388 1797   11

#WAVE 4 AND 5, MERGING VARIABLES;

wave4_sub <- wave4_sub %>%
  mutate(
    Sleep_trouble = ifelse(H4SP5 >= 1 | H4SP6 >= 1, 1, 0)
  )

wave4_sub <- wave4_sub %>%
  mutate(
    Exercise = ifelse(H4DA4 >= 1 | H4DA5 >= 1 | H4DA6 >= 1 | H4DA7 >= 1 | H4DA8 >= 1, 1, 0))


wave5_sub <- wave5_sub %>%
  mutate(
    Exercise = ifelse(H5ID25 >= 1 | H5ID26 >= 1 | H5ID27 >= 1 | H5ID28 >= 1 | H5ID29 >= 1, 1, 0))

#MAKING NEW CATEGORY;

wave4_sub <- wave4_sub %>%
  mutate(
    Mortgage = case_when(
      H4EC5 == 0 ~ 0,
      H4EC5 >= 20 & H4EC5 < 100000 ~ 1,
      H4EC5 >= 100000 ~ 2,
      TRUE ~ NA_real_))

wave4_sub <- wave4_sub %>%
  mutate(Alcohol = case_when(
    H4TO35 == 0 ~ 0,
    H4TO35 %in% 1:6 ~ 1,
    TRUE ~ NA_real_))

wave4_sub <- wave4_sub %>%
  mutate(Sex = case_when(
    BIO_SEX4 == 1 ~ 1,
    BIO_SEX4 == 2 ~ 2,
    TRUE ~ NA_real_))

wave4_sub <- wave4_sub %>%
  mutate(Race  = case_when(
    H4IR4  == 1 ~ 1,
    H4IR4  == 2 ~ 2,
    H4IR4  == 3 ~ 3,
    H4IR4  == 4 ~ 4,
    TRUE ~ NA_real_))

wave4_sub <- wave4_sub %>%
  mutate(Sexually_active = case_when(
    H4SE6 == 0 ~ 0,
    H4SE6 == 1 ~ 1,
    TRUE ~ NA_real_))

wave4_sub <- wave4_sub %>%
  mutate(Employment = case_when(
    H4LM11 == 0 ~ 0,
    H4LM11 == 1 ~ 1,
    TRUE ~ NA_real_))

wave4_sub <- wave4_sub %>%
  mutate(
    Depression = case_when(
      H4ID5H == 1 ~ 1,
      H4ID5H == 0 ~ 0,
      TRUE ~ NA_real_))

wave4_sub <- wave4_sub %>%
  mutate(Smoking = case_when(
    H4TO3 == 0 ~ 0,
    H4TO3 == 1 ~ 1,
    TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(Sex = case_when(
    H5OD2A == 1 ~ 1,
    H5OD2A == 2 ~ 2,
    TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(
    Mortgage = case_when(
      H5EC5A == 1 ~ 0,
      H5EC5A >= 2 & H5EC5A <= 5 ~ 1,
      H5EC5A >= 6 & H5EC5A <= 9 ~ 2,
      TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(Sexually_active = case_when(
    H5SE1 == 0 ~ 0,
    H5SE1 == 1 ~ 1,
    TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(Smoking = case_when(
    H5TO1 == 0 ~ 0,
    H5TO1 == 1 ~ 1,
    TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(Alcohol = case_when(
    H5TO12 == 0 ~ 0,
    H5TO12 %in% 1:6 ~ 1,
    TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(Sleep_trouble = case_when(
    H5ID16 == 0 ~ 0,
    H5ID16 %in% 1:4 ~ 1,
    TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(Employment = case_when(
    H5LM5 == 2:3 ~ 0,
    H5LM5 %in% 1 ~ 1,
    TRUE ~ NA_real_))

wave5_sub <- wave5_sub %>%
  mutate(Depression = case_when(
    H5ID6G == 0 ~ 0,
    H5ID6G == 1 ~ 1,
    TRUE ~ NA_real_))

AVARAGE INCOME According to National Average Wage Index. (n.d.). Social Security. https://www.ssa.gov/oact/cola/AWI.html, the average annual income of USA in 2024 is 69,846.57. To align with this number, I choose to divide the yearly income category as close to the average number possible, which is 75K. so the 2 categories are below US average and above US average.

# RECATEGORY OF YEARLY INCOME;

wave4_sub <- wave4_sub %>%
  mutate(
    Income_cat = case_when(
      H4EC1 >= 1 & H4EC1 <= 9 ~ 0,
      H4EC1 >= 10 & H4EC1 <= 12 ~ 1,
      TRUE ~ NA_real_))


wave5_sub <- wave5_sub %>%
  mutate(
    Income_cat = case_when(
      H5EC1 >= 1 & H5EC1 <= 9 ~ 0,
      H5EC1 >= 10 & H5EC1 <= 12 ~ 1,
      TRUE ~ NA_real_))

#RECATEGORY OF EDUCATION

wave4_sub <- wave4_sub %>%
  mutate(
    Education_cat = case_when(
      H4ED2 >= 1 & H4ED2 <= 2 ~ 0,
      H4ED2 >= 4 & H4ED2 <= 7 ~ 1,
      H4ED2 >= 8 & H4ED2 <= 13 ~ 2,
      TRUE ~ NA_real_))


wave5_sub <- wave5_sub %>%
  mutate(
    Education_cat = case_when(
      H5OD11 >= 2 & H5OD11 <= 4 ~ 0,
      H5OD11 >= 5 & H5OD11 <= 10 ~ 1,
      H5OD11 >= 11 & H5OD11 <= 16 ~ 2,
      TRUE ~ NA_real_))

# MERGING AD RECODING RACE VARIABLE, WAVE 5;

wave5_sub <- wave5_sub %>%
  mutate(
    Race = case_when(
      H5OD4A == 1 ~ 1,                              # White
      H5OD4B == 1 ~ 2,                              # Black
      H5OD4F == 1 ~ 3,                              # American Indian / Alaska Native
      H5OD4D == 1 | H5OD4E == 1 ~ 4,                 # Asian or Pacific Islander
      TRUE ~ NA_real_                               # Other or missing
      )
  )

table(wave5_sub$race, useNA = "ifany")

## Warning: Unknown or uninitialised column: `race`.

## < table of extent 0 >

# KEEPING THE NEWLY MADE AND NEWLY CODED VARIABLES ONLY,

wave4_work <- wave4_sub %>%
  select(
    Sex,
    Race,
    Education_cat,
    Income_cat,
    Sexually_active,
    Smoking,
    Alcohol,
    Exercise,
    Sleep_trouble,
    Mortgage,
    Employment,
    Depression
  )

wave5_work <- wave5_sub %>%
  select(
    Sex,
    Race,
    Education_cat,
    Income_cat,
    Sexually_active,
    Smoking,
    Alcohol,
    Exercise,
    Sleep_trouble,
    Mortgage,
    Employment,
    Depression
  )

# Labeling the categories
wave4_work <- wave4_work %>%
  mutate(
    Sex = factor(Sex, levels=c(1,2), labels=c("Male","Female")),
    Race = factor(Race, levels=c(1,2,3,4),
                  labels=c("White","Black","American Indian/Alaska Native","Asian/Pacific Islander")),
    Education_cat = factor(Education_cat,
                           levels=c(0,1,2),
                           labels=c("High school or less","Some/Completed college","Graduate or higher")),
    Income_cat = factor(Income_cat, levels=c(0,1),
                        labels=c("≤$74,999","≥$75,000")),
    Sexually_active = factor(Sexually_active, levels=c(0,1),
                             labels=c("No","Yes")),
    Smoking = factor(Smoking, levels=c(0,1),
                     labels=c("Non-smoker","Smoker")),
    Alcohol = factor(Alcohol, levels=c(0,1),
                     labels=c("No","Yes")),
    Exercise = factor(Exercise, levels=c(0,1),
                      labels=c("Inactive","Active")),
    Sleep_trouble = factor(Sleep_trouble, levels=c(0,1),
                           labels=c("No","Yes")),
    Mortgage = factor(Mortgage, levels=c(0,1,2),
                      labels=c("No debt","Moderate debt","High debt")),
    Employment = factor(Employment, levels=c(0,1),
                        labels=c("Not employed","Employed")),
    Depression = factor(Depression, levels=c(0,1),
                        labels=c("No","Yes"))
  )

wave5_work <- wave5_work %>%
 mutate(
    Sex = factor(Sex, levels=c(1,2), labels=c("Male","Female")),
    Race = factor(Race, levels=c(1,2,3,4),
                  labels=c("White","Black","American Indian/Alaska Native","Asian/Pacific Islander")),
    Education_cat = factor(Education_cat,
                           levels=c(0,1,2),
                           labels=c("High school or less","Some/Completed college","Graduate or higher")),
    Income_cat = factor(Income_cat, levels=c(0,1),
                        labels=c("≤$74,999","≥$75,000")),
    Sexually_active = factor(Sexually_active, levels=c(0,1),
                             labels=c("No","Yes")),
    Smoking = factor(Smoking, levels=c(0,1),
                     labels=c("Non-smoker","Smoker")),
    Alcohol = factor(Alcohol, levels=c(0,1),
                     labels=c("No","Yes")),
    Exercise = factor(Exercise, levels=c(0,1),
                      labels=c("Inactive","Active")),
    Sleep_trouble = factor(Sleep_trouble, levels=c(0,1),
                           labels=c("No","Yes")),
    Mortgage = factor(Mortgage, levels=c(0,1,2),
                      labels=c("No debt","Moderate debt","High debt")),
    Employment = factor(Employment, levels=c(0,1),
                        labels=c("Not employed","Employed")),
    Depression = factor(Depression, levels=c(0,1),
                        labels=c("No","Yes"))
  )

Descriptive Statistics

# Create a summary table of the WAVE 4 WORK data

tbl_w4 <- wave4_work %>%
tbl_summary(
  type = list(
    Income_cat ~ "categorical",
    Sexually_active ~ "categorical",
    Smoking ~ "categorical",
    Alcohol ~ "categorical",
    Exercise ~ "categorical",
    Sleep_trouble ~ "categorical",
    Employment ~ "categorical",
    Depression ~ "categorical"
  ),
  missing = "ifany"
)


# Create a summary table of the WAVE 5 WORK data

tbl_w5 <- wave5_work %>%
tbl_summary(
  type = list(
    Income_cat ~ "categorical",
    Sexually_active ~ "categorical",
    Smoking ~ "categorical",
    Alcohol ~ "categorical",
    Exercise ~ "categorical",
    Sleep_trouble ~ "categorical",
    Employment ~ "categorical",
    Depression ~ "categorical"
  ),
  missing = "ifany"
)


tbl_merged <- tbl_merge(
  tbls = list(tbl_w4, tbl_w5),
  tab_spanner = c("**Wave 4**", "**Wave 5**")
)

## The number rows in the tables to be merged do not match, which may result in
## rows appearing out of order.
## ℹ See `tbl_merge()` (`?gtsummary::tbl_merge()`) help file for details. Use
##   `quiet=TRUE` to silence message.

tbl_merged <- modify_caption(
  tbl_merged,
  "**Table 1. Descriptive Characteristics of the Study Sample in Add Health Wave IV and Wave V**"
)

tbl_merged

**Table 1. Descriptive Characteristics of the Study Sample in Add Health Wave IV and Wave V**
Characteristic	Wave 4	Wave 5
Characteristic	N = 5,114¹	N = 4,196¹
Sex
Male	2,353 (46%)	1,802 (43%)
Female	2,761 (54%)	2,394 (57%)
Race
White	3,671 (72%)	2,844 (73%)
Black	1,240 (24%)	863 (22%)
American Indian/Alaska Native	41 (0.8%)	38 (1.0%)
Asian/Pacific Islander	157 (3.1%)	135 (3.5%)
Unknown	5	316
Education_cat
High school or less	399 (9.3%)	826 (20%)
Some/Completed college	3,223 (75%)	2,529 (60%)
Graduate or higher	656 (15%)	836 (20%)
Unknown	836	5
Income_cat
≤$74,999	3,340 (70%)	3,147 (78%)
≥$75,000	1,421 (30%)	871 (22%)
Unknown	353	178
Sexually_active
No	282 (5.5%)	181 (4.3%)
Yes	4,808 (94%)	3,982 (96%)
Unknown	24	33
Smoking
Non-smoker	2,798 (55%)	2,388 (57%)
Smoker	2,312 (45%)	1,797 (43%)
Unknown	4	11
Alcohol
No	1,435 (28%)	651 (16%)
Yes	3,669 (72%)	3,545 (84%)
Unknown	10
Exercise
Inactive	1,226 (24%)	846 (21%)
Active	3,885 (76%)	3,220 (79%)
Unknown	3	130
Sleep_trouble
No	1,603 (31%)	992 (24%)
Yes	3,509 (69%)	3,198 (76%)
Unknown	2	6
Mortgage
No debt	3,253 (65%)	1,409 (34%)
Moderate debt	710 (14%)	1,037 (25%)
High debt	1,006 (20%)	1,686 (41%)
Unknown	145	64
Employment
Not employed	1,750 (34%)	348 (9.1%)
Employed	3,361 (66%)	3,479 (91%)
Unknown	3	369
Depression
No	4,286 (84%)	3,130 (75%)
Yes	827 (16%)	1,052 (25%)
Unknown	1	14
¹ n (%)

library(officer)
library(flextable)

## 
## Attaching package: 'flextable'

## The following object is masked from 'package:gtsummary':
## 
##     continuous_summary

## The following objects are masked from 'package:plotly':
## 
##     highlight, style

## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote

## The following object is masked from 'package:purrr':
## 
##     compose

ft_merge <- as_flex_table(tbl_merged)

save_as_docx(
  "Wave IV and V Merged Table" = ft_merge,
  path = "Merged_Table.docx"
)

getwd()

## [1] "C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/Project/New folder"

# in this step I took help from Chatgpt. I asked it if there is any code by which i can import the table directly as a word document. It provided me with the code and also suggested me to install the necessary packages called officer and flextable. These packages were not present in the package library initially. I used the install option from the package tab and installed them manually. The path this function is created for the word document containing the table, goes to automatically where the rmd file is saved.

Looking for Distribution

table(wave4_work$Depression, useNA = "ifany")

## 
##   No  Yes <NA> 
## 4286  827    1

summary(wave4_work$Depression)

##   No  Yes NA's 
## 4286  827    1

ggplot(wave4_work, aes(x = Depression)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of Depression Diagnosis (Wave IV)",
    x = "Depression Diagnosis",
    y = "Count"
  ) +
  theme_minimal()

ggplot(wave5_work, aes(x = Depression)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of Depression Diagnosis (Wave V)",
    x = "Depression Diagnosis",
    y = "Count"
  ) +
  theme_minimal()

ggplot(wave4_work, aes(x = Employment, fill = Depression)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Depression by Employment Status (Wave IV)",
    x = "Employment",
    y = "Proportion"
  ) +
  theme_minimal()

ggplot(wave5_work, aes(x = Employment, fill = Depression)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Depression by Employment Status (Wave V)",
    x = "Employment",
    y = "Proportion"
  ) +
  theme_minimal()

ggplot(wave4_work, aes(x = Sleep_trouble, fill = Depression)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Depression by Sleep Trouble (Wave IV)",
    x = "Sleep Trouble",
    y = "Proportion"
  ) +
  theme_minimal()

ggplot(wave5_work, aes(x = Sleep_trouble, fill = Depression)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Depression by Sleep Trouble (Wave V)",
    x = "Sleep Trouble",
    y = "Proportion"
  ) +
  theme_minimal()

Overall Summary

The Add Health Wave IV and Wave V datasets were imported into R for analysis. Relevant variables related to depression and potential risk factors were extracted and combined into analytic datasets for each wave. Variables were selected based on the study objective and included sociodemographic, behavioral, and environmental characteristics. The outcome variable for this study was depression diagnosis, based on the survey question asking whether a doctor, nurse, or other health care provider had ever told the respondent that they had depression. In Wave IV the variable identifier was H4ID5H and in Wave V it was H5ID6G. This variable was recoded into a binary variable indicating the presence or absence of depression.

Several covariates were also selected and recoded to create consistent categories across both waves. These included sex, race, education level, household income, sexual activity, smoking status, alcohol use, physical activity, sleep trouble, employment status, and financial debt. Multi-category variables were collapsed into meaningful groups to simplify interpretation and maintain comparability across waves. Non-substantive responses such as “refused,” “don’t know,” or other invalid responses were treated as missing values.

After cleaning and recoding the variables, descriptive statistics were generated to summarize the characteristics of the analytic sample. Table 1 was created using the gtsummary package in R to report frequencies and percentages for all categorical variables. Exploratory data analysis plots were then generated to examine the distribution of the outcome variable and its relationship with key predictors.

How Do Depression Risk Factors Evolve Over Time

Tahia Sufyani

2026-03-13