#prepare workspace and load packages to be used

rm(list=ls(all=T))
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.1.0
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.95 loaded
library(haven)
library(epitools)

#load and inspect data Dataset was retrived from IPUMS, produced November 8, 2023, and is titled “Codebook for an Integrated Health Interview Series Data Extract”. You can find the full data codebook at https://live.nhis.datadownload.ipums.org/web/extracts/nhis/2136603/nhis_00011 This data originates from a survey study done at the University of Minnesota focusing on mental health status indicating variables, and also including socioeconomic status variables such as poverty level, receiving welfare, etc.

MH <- read_rds("/Users/kaitlynskeen/Desktop/EPID 7500/IntrotoR_Rproject/data/NHIS_MentalH.rds")
colnames(MH)
##  [1] "YEAR"        "STRATA"      "PSU"         "HHWEIGHT"    "PERNUM"     
##  [6] "PERWEIGHT"   "SAMPWEIGHT"  "FWEIGHT"     "ASTATFLG"    "CSTATFLG"   
## [11] "AGE"         "SEX"         "SEXORIEN"    "MARST"       "RACENEW"    
## [16] "EDUCREC2"    "EDUC"        "POORYN"      "INCFAM97ON2" "WELFMO"     
## [21] "GOTWELF"     "OWNERSHIP"   "HEALTH"      "BMI"         "BMICAT"     
## [26] "PREGNANTNOW" "HYSTEV"      "BWGTGRAM"    "BWGTTOTOZ"   "ARTHGLUPEV" 
## [31] "AUTISMEV"    "BLIND"       "CANCEREV"    "CHEARTDIEV"  "CONGHARTEV" 
## [36] "DIABETICEV"  "FHEADYR"     "HEARTCONEV"  "HEPATEV"     "HYPERTENEV" 
## [41] "ALC1YR"      "ALCLIFE"     "ALCAMT"      "SMOKAGEREG"  "CIGSDAY1"   
## [46] "SMOKFREQNOW" "CSQTRYYR"    "AEFFORT"     "AHOPELESS"   "ANERVOUS"   
## [51] "ARESTLESS"   "ASAD"        "AWORTHLESS"  "WORFREQ"     "WORRX"      
## [56] "WORFEELEVL"  "WRYSTDLIV"   "WRYMEDCST"   "WRYHOUS"     "WRYCCPAY"   
## [61] "WRYHCCST"    "WRYRET"      "WRYCOLL"     "WRYBILLS"    "DEPFREQ"    
## [66] "DEPRX"       "DEPFEELEVL"  "FTODMHI"     "MTODMHI"     "TODDEPRES"  
## [71] "EMODIFF"     "GETALONGAD"  "GOODATTEN"   "UNHAPPY"
head(MH)
## # A tibble: 6 × 74
##    YEAR STRATA    PSU      HHWEIGHT PERNUM PERWEIGHT SAMPWEIGHT FWEIGHT ASTATFLG
##   <dbl> <dbl+lbl> <dbl+lb>    <dbl>  <dbl>     <dbl>      <dbl>   <dbl> <int+lb>
## 1  2018 7103      19           3338      1      3539       3915    3539 1 [Samp…
## 2  2018 7137      38           5188      1      5656          0    5656 3 [Not …
## 3  2018 7137      38           5188      2      5848          0    5656 2 [Samp…
## 4  2018 7137      38           5188      3      5886       6055    5656 0 [NIU] 
## 5  2018 7106      22           5259      1      6009      16978    5415 1 [Samp…
## 6  2018 7106      22           5259      2      5738          0    5415 3 [Not …
## # ℹ 65 more variables: CSTATFLG <int+lbl>, AGE <dbl+lbl>, SEX <int+lbl>,
## #   SEXORIEN <int+lbl>, MARST <int+lbl>, RACENEW <int+lbl>, EDUCREC2 <int+lbl>,
## #   EDUC <int+lbl>, POORYN <int+lbl>, INCFAM97ON2 <int+lbl>, WELFMO <int+lbl>,
## #   GOTWELF <int+lbl>, OWNERSHIP <int+lbl>, HEALTH <int+lbl>, BMI <dbl+lbl>,
## #   BMICAT <int+lbl>, PREGNANTNOW <int+lbl>, HYSTEV <int+lbl>,
## #   BWGTGRAM <dbl+lbl>, BWGTTOTOZ <dbl+lbl>, ARTHGLUPEV <int+lbl>,
## #   AUTISMEV <int+lbl>, BLIND <int+lbl>, CANCEREV <int+lbl>, …
nrow(MH)
## [1] 72831
ncol(MH)
## [1] 74

There are 74 different variables, and 72831 observations where in this case each observation/row represents an individual adult person surveyed)

Using this dataset, I want to explore potential connections between chronic illness and mental health status. The following variables will be used: SEX (Sex, 1=male, 2=female) POORYN (Above or below poverty threshold 1=above, 2=below) CANCEREV (Ever told had cancer, 1=no, 2=yes) CHEARTDIEV (Ever told had coronary heart disease, 1=no, 2=yes) CONGHARTEV (Ever told had congenital heart disease, 1=no, 2=yes) DIABETICEV (Ever told had diabetes, 1=no, 2=yes) HEARTCONEV (Ever told had heart condition/disease, 1=no, 2=yes) HEPATEV (Ever had hepatitis, 1=no, 2=yes) WORRX (Take medication for worried, nervous, or anxious feeings, 1=no, 2=yes) DEPRX (Take medication for depression, 1=no, 2=yes)

My first research question I want to describe is ‘what is the prevalence of chronic illness in this group sampled’. I want to explore differences among sex to see if there are any significant differences.

To do this, I will take a count of each variable which indicates having a chronic illness (ARTHGLUPEV, CANCEREV, CHEARTDIEV, CONGHARTEV, DIABETICEV, HEARTCONEV, HEPATEV). I will then simply get a count of the number of ‘yes’ answers to those questions and divide that number by the total number sampled to determine prevalence of chronic illnesses.

MH1 <- MH %>%
  select(SEX, ARTHGLUPEV, CANCEREV, CHEARTDIEV, CONGHARTEV, DIABETICEV, 
         HEARTCONEV, 
         HEPATEV)
#create "CHRONICILL" variable
MH1 <- MH1 %>%
  mutate(across(where(is.labelled), zap_labels))
MH1 <- MH1 %>%
  mutate(
    CHRONICILL = if_else(
      if_any(
        c(ARTHGLUPEV, CANCEREV, CHEARTDIEV, CONGHARTEV,
          DIABETICEV, HEARTCONEV, HEPATEV),
        ~ as.numeric(.) == 2
      ),
      1L, 0L
    )
  )
#now calculate prevalence of chronic illness among sex
MH1 %>%
  group_by(SEX) %>% 
  summarise(
    n = n(),
    cases = sum(CHRONICILL == 1, na.rm = TRUE),
    prevalence = cases / n
  )
## # A tibble: 2 × 4
##     SEX     n cases prevalence
##   <int> <int> <int>      <dbl>
## 1     1 35549  4765      0.134
## 2     2 37282  6440      0.173
#now make a plot
#first add sex label to data so plot is easier to read
MH1 <- MH1 %>%
  mutate(
    SEX_LABEL = case_when(
      SEX == 1 ~ "Male",
      SEX == 2 ~ "Female",
      TRUE ~ "Other"
    )
  )
plot1_data <- MH1 %>%
  group_by(SEX_LABEL) %>% 
  summarise(
    prevalence = mean(CHRONICILL == 1, na.rm = TRUE) * 100
  )
ggplot(plot1_data, aes(x = SEX_LABEL, y = prevalence)) +
  geom_col(fill = "lightblue") +
  geom_text(aes(label = round(prevalence, 1)), vjust = -0.5) +
  labs(
    title = "Prevalence of Chronic Illnesses by Sex",
    x = "Sex",
    y = "Prevalence (%)"
  ) +
  theme_minimal()

#conduct a chi square test of significance
chitable <- table(MH1$SEX, MH1$CHRONICILL)
chitable
##    
##         0     1
##   1 30784  4765
##   2 30842  6440
chisq.test(chitable)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  chitable
## X-squared = 209.03, df = 1, p-value < 2.2e-16

Calculating the prevalence of chronic illness by sex shows that there is a higher prevalence of chronic illness in females. The next step is to run a chi square test to determine if there is a significant association between chronic illness and sex group. Since I am looking at a difference between only two variables (male vs female), a chi square test is appropriate. After running the chi square test, the results indicate there is a significant association between sex and having a chronic illness.

My next research question I want to explore is the link between having a chronic illness and taking mental health medications for anxiety and/or depression.

To do this, I will filter my data to select mental health meds and recreate my CHRONICILL variable created in the first section.

The next step I will take is to do the same process for mental health medication using these variables (WORRX, DEPRX). If the answer is yes, I will code a new variable ‘MHMEDS’ for the answer. I will then use these variables to determine how often individuals with a chronic illness also take mental health medications. I want to do so by calculating odds ratios since not many individuals surveyed reported they take mental health medications.

#filter data to select variables to be used
MH2 <- MH %>%
  select(ARTHGLUPEV, CANCEREV, CHEARTDIEV, CONGHARTEV, DIABETICEV, 
         HEARTCONEV, HEPATEV, WORRX, DEPRX)
#recreate CHRONICILL variable
MH2 <- MH2 %>%
  mutate(across(where(is.labelled), zap_labels))
MH2 <- MH2 %>%
  mutate(
    CHRONICILL = if_else(
      if_any(
        c(ARTHGLUPEV, CANCEREV, CHEARTDIEV, CONGHARTEV,
          DIABETICEV, HEARTCONEV, HEPATEV),
        ~ as.numeric(.) == 2
      ),
      1L, 0L
    )
  )
#create MHMEDS variable
MH2 <- MH2 %>%
  mutate(across(where(is.labelled), zap_labels))
MH2 <- MH2 %>%
  mutate(
    MHMEDS = if_else(
      if_any(
        c(WORRX, DEPRX),
        ~ as.numeric(.) == 2
      ),
      1L, 0L
    )
  )
##create a plot to visualize
#put in a table
MHmed_table <- table(MH2$MHMEDS, MH2$CHRONICILL)
MHmed_table
##    
##         0     1
##   0 60335  9003
##   1  1291  2202
MH2_plot <- as.data.frame(MHmed_table)
colnames(MH2_plot) <- c("MHMEDS", "CHRONICILL", "Count")
ggplot(MH2_plot, aes(x = MHMEDS, y = Count, fill = CHRONICILL)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Proportion of Chronic Illness by Mental Health Medication Use",
    x = "Mental Health Medication (0=No, 1=Yes)",
    y = "Proportion",
    fill = "Chronic Illness"
  ) +
  theme_minimal()

#calculate odds ratio to further analyze association
oddsratio(MHmed_table)
## $data
##        
##             0     1 Total
##   0     60335  9003 69338
##   1      1291  2202  3493
##   Total 61626 11205 72831
## 
## $measure
##    odds ratio with 95% C.I.
##     estimate    lower    upper
##   0  1.00000       NA       NA
##   1 11.42948 10.63545 12.28764
## 
## $p.value
##    two-sided
##     midp.exact fisher.exact chi.square
##   0         NA           NA         NA
##   1          0            0          0
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "median-unbiased estimate & mid-p exact CI"

The plot visualization shows that there is an association with taking mental health meds when having a chronic illness. I wanted to get odds ratios to analyze the association further. The results of the odds ratio calculation shows that there is a very strong association, with those who take mental health medications having 11.4 times higher chance of having a chronic illness!! As the confidence intervals do not include 1, this is not due to random chance.

My final research question that I want to explore is looking into if having a chronic illness and taking mental health medications are associated with poverty. To do this, I will recreate the previous CHRONICILL and MHMEDS variables, as well as use the POORYN variable.

#filter data to select variables to be used
MH3 <- MH %>%
  select(POORYN, ARTHGLUPEV, CANCEREV, CHEARTDIEV, CONGHARTEV, DIABETICEV, 
         HEARTCONEV, HEPATEV, WORRX, DEPRX)
#recreate CHRONICILL
MH3 <- MH3 %>%
  mutate(across(where(is.labelled), zap_labels))
MH3 <- MH3 %>%
  mutate(
    CHRONICILL = if_else(
      if_any(
        c(ARTHGLUPEV, CANCEREV, CHEARTDIEV, CONGHARTEV,
          DIABETICEV, HEARTCONEV, HEPATEV),
        ~ as.numeric(.) == 2
      ),
      1L, 0L
    )
  )
#recreate MHMEDS
MH3 <- MH3 %>%
  mutate(across(where(is.labelled), zap_labels))
MH3 <- MH3 %>%
  mutate(
    MHMEDS = if_else(
      if_any(
        c(WORRX, DEPRX),
        ~ as.numeric(.) == 2
      ),
      1L, 0L
    )
  )
#recode POORYN
MH3 <- MH3 %>%
  mutate(POOR = if_else(POORYN == 2, 1, 0))
#now create a three way contingency table to see associations
MHtable3 <- table(MH3$CHRONICILL, MH3$MHMEDS, MH3$POOR)
MHtable3
## , ,  = 0
## 
##    
##         0     1
##   0 54343  1118
##   1  8010  1845
## 
## , ,  = 1
## 
##    
##         0     1
##   0  5992   173
##   1   993   357
#logistic regression
regmodel <- glm(CHRONICILL ~ MHMEDS * POOR, 
             data = MH3, family = binomial)
summary(regmodel)
## 
## Call:
## glm(formula = CHRONICILL ~ MHMEDS * POOR, family = binomial, 
##     data = MH3)
## 
## Coefficients:
##             Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -1.91463    0.01197 -159.972  < 2e-16 ***
## MHMEDS       2.41556    0.03975   60.776  < 2e-16 ***
## POOR         0.11718    0.03629    3.229  0.00124 ** 
## MHMEDS:POOR  0.10633    0.10647    0.999  0.31793    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62537  on 72830  degrees of freedom
## Residual deviance: 58128  on 72827  degrees of freedom
## AIC: 58136
## 
## Number of Fisher Scoring iterations: 4
#visualize the association
ggplot(MH3, aes(x = factor(MHMEDS), fill = factor(CHRONICILL))) +
  geom_bar(position = "fill") +
  facet_wrap(~ POOR) +
  labs(x = "Mental Health Med Use",
       y = "Proportion",
       fill = "Chronic Illness",
       title = "Chronic Illness by Med Use, Stratified by Poverty") +
  scale_y_continuous(labels = scales::percent)

To assess the potential association, I want to do a logistic regression to determine if poverty level alters the relationship between chronic illness and mental health meds. The results of this test reveal that there is not a significant difference in using mental health meds when having a chronic illness when controling for poverty level. However, it is interesting to note that when holding mental health med use constant, there is around a 12% higher odds of having a chronic illness for those under the poverty level compared to those above poverty level. sessionInfo()