EPI553_HW03_Azamatova

#Part 0: Data Acquisition and Preparation ***

##Setup and Data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)
library(here)

## here() starts at C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/hw03

library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(broom)
library(ggeffects)
library(gtsummary)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

1 Loading Survey BRFSS 2023 Data

# Load raw HCW_database data
BRFSS_2023 <- read_xpt(
  "C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/hw03/LLCP2023.XPT")

# Display dataset dimensions
dim(BRFSS_2023)

## [1] 433323    350

##Select the nine variables listed in the table above. Use select() with backtick notation for names beginning with underscores.

BRFSS_raw <- BRFSS_2023 %>%
  select(PHYSHLTH, MENTHLTH,`_BMI5`,SEXVAR,
         EXERANY2,`_AGEG5YR`,`_INCOMG1`,EDUCA,`_SMOKER3`)

head(BRFSS_raw, 10)

## # A tibble: 10 × 9
##    PHYSHLTH MENTHLTH `_BMI5` SEXVAR EXERANY2 `_AGEG5YR` `_INCOMG1` EDUCA
##       <dbl>    <dbl>   <dbl>  <dbl>    <dbl>      <dbl>      <dbl> <dbl>
##  1       88       88    3047      2        2         13          9     5
##  2       88       88    2856      2        1         13          9     5
##  3        6        2    2231      2        1         13          1     4
##  4        2       88    2744      2        1         12          9     5
##  5       88       88    2585      2        1         12          5     5
##  6        2        3    3018      2        1          9          5     5
##  7        8       77    2441      1        2         13          4     4
##  8        1       88    2727      2        2         12          9     5
##  9        5       88    3347      2        2         13          4     5
## 10       88       88    2296      1        1         12          5     4
## # ℹ 1 more variable: `_SMOKER3` <dbl>

###Select and Recode all variables. Store all categorical variables as labeled factors

brfss_clean <- BRFSS_raw %>%
  mutate(
    # PHYSHLTH:
    physhlth_days = case_when(
      PHYSHLTH == 88                    ~ 0,
      PHYSHLTH == 77                    ~ NA_real_,
      PHYSHLTH == 99                    ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30   ~ as.numeric(PHYSHLTH),
      TRUE                             ~ NA_real_),
    #MENTHLTH 
    menthlth_days = case_when(
      MENTHLTH == 88                    ~ 0,
      MENTHLTH == 77                    ~ NA_real_,
      MENTHLTH == 99                    ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30    ~ as.numeric(MENTHLTH),
      TRUE                             ~ NA_real_),
    
    #BMI
    bmi = case_when(
  `_BMI5` == 9999 ~ NA_real_, TRUE ~ `_BMI5` / 100),
    
     sex = factor(case_when(
      SEXVAR == 1 ~ "Male",
      SEXVAR == 2 ~ "Female",
      TRUE ~ NA_character_), 
      levels = c("Male", "Female")),

#EXERANY2: 1 = “Yes”, 2 = “No”; values 7 and 9 should be recoded to NA.  
    exercise = factor(case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      EXERANY2 == 7 ~ NA_character_,
      EXERANY2 == 9 ~ NA_character_, 
      TRUE          ~ NA_character_ ), 
      levels = c("No", "Yes")),
    
# 1 = “18-24”, 2 = “25-29”, …, 13 = “80+”; value 14 should be recoded to NA.
    age_group = factor(case_when(
      `_AGEG5YR` == 1  ~ "18-24", 
      `_AGEG5YR` == 2  ~ "25-29", 
      `_AGEG5YR` == 3  ~ "30-34", 
      `_AGEG5YR` == 4  ~ "35-39", 
      `_AGEG5YR` == 5  ~ "40-44",
      `_AGEG5YR` == 6  ~ "45-49", 
      `_AGEG5YR` == 7  ~ "50-54",
      `_AGEG5YR` == 8  ~ "55-59",
      `_AGEG5YR` == 9  ~ "60-64", 
      `_AGEG5YR` == 10 ~ "65-69",
      `_AGEG5YR` == 11 ~ "70-74",
      `_AGEG5YR` == 12 ~ "75-79", 
      `_AGEG5YR` == 13 ~ "80+",
      `_AGEG5YR` == 14 ~ NA_character_,  TRUE ~NA_character_),
      levels = c("18-24", "25-29", "30-34", 
                 "35-39", "40-44", "45-49", 
                 "50-54", "55-59", "60-64", 
                 "65-69", "70-74", "75-79", "80+")),
    
  #_INCOMG1: 1 = “Less than $15,000” through 7 = “$200,000 or more”; value 9 should be recoded to NA.
      income = factor(case_when(
        `_INCOMG1`== 1 ~ "Less than $15,000",
        `_INCOMG1`== 2 ~ "$15,000 to < $25,000",       
        `_INCOMG1`== 3 ~ "$25,000 to < $35,000",
        `_INCOMG1`== 4 ~ "$35,000 to < $50,000",
        `_INCOMG1`== 5 ~ "$50,000 to < $100,000",
        `_INCOMG1`== 6 ~ "$100,000 to < $200,000", 
        `_INCOMG1`== 7 ~  "$200,000 or more",       
        `_INCOMG1`== 9 ~ NA_character_, TRUE ~NA_character_), 
        levels = c("Less than $15,000","$15,000 to < $25,000",
                  "$25,000 to < $35,000","$35,000 to < $50,000",
                  "$50,000 to < $100,000", "$100,000 to < $200,000",
                  "$200,000 or more")),

#EDUCA:  
    education = factor(case_when(
      EDUCA %in% c(1, 2)     ~ "Less than HS",
      EDUCA == 3             ~ "HS diploma or GED",
      EDUCA == 4             ~ "Some college or technical school",
      EDUCA == 5             ~ "College graduate",
      EDUCA == 6             ~ "Graduate or professional degree",
      EDUCA == 9             ~ NA_character_,
      TRUE                   ~ NA_character_), 
      levels = c("Less than HS", "HS diploma or GED", "Some college or technical school", 
                 "College graduate", "Graduate or professional degree")),
      
#_SMOKER3: 1 = “Current daily smoker”, 2 = “Current some-day smoker”, 3 = “Former smoker”, 4 = “Never smoker”; value 9 should be recoded NA
   
   smoking = factor(case_when(
     `_SMOKER3` == 1 ~ "Current daily smoker",
     `_SMOKER3` == 2 ~ "Current some-day smoker",
     `_SMOKER3` == 3 ~ "Former smoker",
     `_SMOKER3` == 4 ~ "Never smoker",
     `_SMOKER3` == 9 ~ NA_character_, TRUE ~ NA_character_),
     levels = c("Never smoker", "Former smoker","Current some-day smoker","Current daily smoker"))
      )
  
  brfss_clean %>%
  select(menthlth_days, physhlth_days, income, sex) %>%
  summarise(across(everything(), list(
    n_missing  = ~ sum(is.na(.)),
    pct_missing = ~ round(mean(is.na(.)) * 100, 1)
  ))) %>%
  pivot_longer(
    everything(),
    names_to      = c("Variable", ".value"),
    names_pattern = "(.+)_(n_missing|pct_missing)"
  ) %>%
  arrange(desc(pct_missing)) %>%
  kable(
    caption   = "Missing Observations BRFSS 2023",
    col.names = c("Variable", "N Missing", "% Missing")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Missing Observations BRFSS 2023
Variable	N Missing	% Missing
income	86623	20.0
physhlth_days	10785	2.5
menthlth_days	8108	1.9
sex	0	0.0

##Create the analytic dataset & Report the final analytic n.

brfss_clean |>
 filter(
    !is.na(physhlth_days),
    !is.na(menthlth_days),
    !is.na(bmi),
    !is.na(sex),
    !is.na(exercise),
    !is.na(age_group),
    !is.na(income),
    !is.na(education),
    !is.na(smoking) )

## # A tibble: 310,768 × 18
##    PHYSHLTH MENTHLTH `_BMI5` SEXVAR EXERANY2 `_AGEG5YR` `_INCOMG1` EDUCA
##       <dbl>    <dbl>   <dbl>  <dbl>    <dbl>      <dbl>      <dbl> <dbl>
##  1        6        2    2231      2        1         13          1     4
##  2       88       88    2585      2        1         12          5     5
##  3        2        3    3018      2        1          9          5     5
##  4        5       88    3347      2        2         13          4     5
##  5       88       88    2296      1        1         12          5     4
##  6       88       88    4184      1        1          8          5     5
##  7       88       88    2965      2        1         10          6     6
##  8       88       88    2375      1        2         12          6     5
##  9       88       88    2597      2        2         12          6     5
## 10        4       88    2819      2        1         13          2     4
## # ℹ 310,758 more rows
## # ℹ 10 more variables: `_SMOKER3` <dbl>, physhlth_days <dbl>,
## #   menthlth_days <dbl>, bmi <dbl>, sex <fct>, exercise <fct>, age_group <fct>,
## #   income <fct>, education <fct>, smoking <fct>

# Reproducible random sample
set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
 age_group, income, education, smoking) |>
 slice_sample(n = 8000)

# Save for lab activity
saveRDS(brfss_analytic,
  "C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/HW03/brfss_clean.rds")

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_analytic), ncol(brfss_analytic))) |>
  kable(caption = "Analytic Dataset Dimensions") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Dimensions
Metric	Value
Observations	8000
Variables	18

##Produce a descriptive statistics table using gtsummary::tbl_summary(), stratified by sex

brfss_analytic |>
  select(menthlth_days, physhlth_days, bmi, sex, exercise,
 age_group, income, education, smoking) |>
  tbl_summary(
    by=sex,
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      bmi           ~ "Body Mass Index",
      exercise      ~ "Any physical activity (past 30 days)",
      age_group     ~ "Age Group",
      income        ~ "Annual household income level",
      education     ~ "Education level",
      smoking    ~ "Smoking status"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2020 (n = 5,000)**") |>
  as_flex_table()

**Table 1. Descriptive Statistics — BRFSS 2020 (n = 5,000)**
Characteristic	N	Male N = 3,9361	Female N = 4,0641
Mentally unhealthy days (past 30)	8,000	3.5 (7.5)	5.1 (8.6)
Physically unhealthy days (past 30)	8,000	4.0 (8.4)	4.9 (8.9)
Body Mass Index	8,000	28.7 (6.0)	28.7 (7.0)
Any physical activity (past 30 days)	8,000	3,146 (80%)	3,094 (76%)
Age Group	8,000
18-24		235 (6.0%)	171 (4.2%)
25-29		219 (5.6%)	189 (4.7%)
30-34		253 (6.4%)	210 (5.2%)
35-39		263 (6.7%)	302 (7.4%)
40-44		290 (7.4%)	292 (7.2%)
45-49		266 (6.8%)	252 (6.2%)
50-54		305 (7.7%)	303 (7.5%)
55-59		308 (7.8%)	352 (8.7%)
60-64		408 (10%)	379 (9.3%)
65-69		418 (11%)	483 (12%)
70-74		382 (9.7%)	426 (10%)
75-79		325 (8.3%)	338 (8.3%)
80+		264 (6.7%)	367 (9.0%)
Annual household income level	8,000
Less than $15,000		160 (4.1%)	247 (6.1%)
$15,000 to < $25,000		271 (6.9%)	370 (9.1%)
$25,000 to < $35,000		376 (9.6%)	495 (12%)
$35,000 to < $50,000		482 (12%)	585 (14%)
$50,000 to < $100,000		1,251 (32%)	1,260 (31%)
$100,000 to < $200,000		996 (25%)	869 (21%)
$200,000 or more		400 (10%)	238 (5.9%)
Education level	8,000
Less than HS		75 (1.9%)	49 (1.2%)
HS diploma or GED		130 (3.3%)	122 (3.0%)
Some college or technical school		950 (24%)	877 (22%)
College graduate		1,018 (26%)	1,120 (28%)
Graduate or professional degree		1,763 (45%)	1,896 (47%)
Smoking status	8,000
Never smoker		2,239 (57%)	2,573 (63%)
Former smoker		1,207 (31%)	1,055 (26%)
Current some-day smoker		151 (3.8%)	117 (2.9%)
Current daily smoker		339 (8.6%)	319 (7.8%)
1Mean (SD); n (%)

#Part 1: Multiple Linear Regression (25 Points)

Research question: What is the independent association of each predictor with the number of mentally unhealthy days in the past 30 days?

##Step 1:

Fit a multiple linear regression model with menthlth_days as the outcome and the following predictors: physhlth_days, bmi, sex, exercise, age_group, income, education, and smoking. Display the full summary() output.

model_mlr <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +age_group +
                + income +education + smoking, data=brfss_analytic)
summary(model_mlr)

## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + +income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.080  -3.865  -1.597   0.712  30.471 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                6.71372    0.92549   7.254 4.42e-13
## physhlth_days                              0.26558    0.01007  26.384  < 2e-16
## bmi                                        0.03338    0.01321   2.527 0.011510
## sexFemale                                  1.39038    0.16750   8.301  < 2e-16
## exerciseYes                               -0.65116    0.21472  -3.033 0.002432
## age_group25-29                            -1.05660    0.51938  -2.034 0.041950
## age_group30-34                            -1.09395    0.50646  -2.160 0.030803
## age_group35-39                            -1.81103    0.48851  -3.707 0.000211
## age_group40-44                            -2.89307    0.48749  -5.935 3.07e-09
## age_group45-49                            -3.05618    0.49769  -6.141 8.61e-10
## age_group50-54                            -3.51674    0.48323  -7.277 3.72e-13
## age_group55-59                            -4.49597    0.47555  -9.454  < 2e-16
## age_group60-64                            -4.52215    0.45848  -9.863  < 2e-16
## age_group65-69                            -5.57795    0.45019 -12.390  < 2e-16
## age_group70-74                            -6.02536    0.45741 -13.173  < 2e-16
## age_group75-79                            -6.28656    0.47484 -13.239  < 2e-16
## age_group80+                              -6.81968    0.47684 -14.302  < 2e-16
## income$15,000 to < $25,000                -1.63703    0.46899  -3.491 0.000485
## income$25,000 to < $35,000                -2.07637    0.44797  -4.635 3.63e-06
## income$35,000 to < $50,000                -2.54685    0.43819  -5.812 6.40e-09
## income$50,000 to < $100,000               -3.05048    0.41069  -7.428 1.22e-13
## income$100,000 to < $200,000              -3.49984    0.42923  -8.154 4.07e-16
## income$200,000 or more                    -3.78409    0.50036  -7.563 4.38e-14
## educationHS diploma or GED                 0.07991    0.81066   0.099 0.921484
## educationSome college or technical school  1.07679    0.68973   1.561 0.118520
## educationCollege graduate                  1.79091    0.69119   2.591 0.009585
## educationGraduate or professional degree   1.73781    0.69250   2.509 0.012111
## smokingFormer smoker                       0.94710    0.19329   4.900 9.77e-07
## smokingCurrent some-day smoker             1.35011    0.46820   2.884 0.003942
## smokingCurrent daily smoker                2.93681    0.32162   9.131  < 2e-16
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ** 
## age_group25-29                            *  
## age_group30-34                            *  
## age_group35-39                            ***
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000 to < $25,000                ***
## income$25,000 to < $35,000                ***
## income$35,000 to < $50,000                ***
## income$50,000 to < $100,000               ***
## income$100,000 to < $200,000              ***
## income$200,000 or more                    ***
## educationHS diploma or GED                   
## educationSome college or technical school    
## educationCollege graduate                 ** 
## educationGraduate or professional degree  *  
## smokingFormer smoker                      ***
## smokingCurrent some-day smoker            ** 
## smokingCurrent daily smoker               ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.352 on 7970 degrees of freedom
## Multiple R-squared:  0.1853, Adjusted R-squared:  0.1824 
## F-statistic: 62.52 on 29 and 7970 DF,  p-value: < 2.2e-16

##Step 2: Write the fitted regression equation including all terms, with coefficients rounded to three decimal places.

\[\widehat{\text{menthlth_days}} = 6.714 + 0.266\times\text{physhlth\_days} + 0.033\times\text{bmi} + 1.390\times\text{Female} - 0.651\times\text{Ex=Yes} - 1.057\times\text{age25-29} - 1.094\times\text{age30-34} - 1.811\times\text{age35-39} - 2.893\times\text{age40-44} - 3.056\times\text{age45-49} - 3.517\times\text{age50-54} - 4.496\times\text{age55-59} - 4.522\times\text{age60-64} - 5.578\times\text{age65-69} - 6.025\times\text{age70-74} - 6.287\times\text{age75-79} - 6.820\times\text{age80+} - 1.637\times\text{inc2} - 2.076\times\text{inc3} - 2.547\times\text{inc4} - 3.050\times\text{inc5} - 3.500\times\text{inc6} - 3.784\times\text{inc7} + 0.080\times\text{HS} + 1.077\times\text{SomeCol} + 1.791\times\text{CollGrad} + 1.738\times\text{GradProf} + 2.937\times\text{SmkDaily} + 1.350\times\text{SmkSome} + 0.947\times\text{SmkForm}\] >Reference categories: Male, No exercise, Age 18–24, Income < $15,000, Less than HS, Never smoker.

##Step 3: Interpret the following coefficients in plain language. For each, state the direction, magnitude, and meaning in terms of mentally unhealthy days, holding all other variables constant: physhlth_days (continuous predictor) >For every additional +1 day of Days of poor physical health the days of poor mental health increases by 0.266 on average among participants, holding all other variables constant.

bmi (continuous predictor) > For every additional +1 increase of BMI unit $(kg/m^2)$ the Days of poor mental health increases by 0.033 on average among participants, holding all other variables constant.

sex: Female vs. Male (reference) >In comparison with Male participants, Female participants on average have 1.390 more days of poor mental healthe, holding all other variables in constant.

exercise: Yes vs. No (reference) > In comparison with people who do not exercise at all, people who exercise on average have 0.651 (a half a day) lesser days of poor mental health, holding all other variables in constant.

• One age_group coefficient of your choice > In comparison with young adults from 18 to 24 years of age, older adults from 60 to 64 years old on average have 4.522 lesser days of poor mental health, holding all other variables in constant.

• Two income coefficients of your choice, compared to the reference category(Less than $15,000) >In comparison with people with yearly income less than $15,000 , people who earn more than $200,000 a year on average have 3.784 lesser days of poor mental health.

##Step 4: Report and interpret each of the following model-level statistics:

####Hypothesis testing: \[H_0: \beta_1 = \beta_2 = \cdots = \beta_p = 0 \quad \text{(none of the predictors matter)}\] \[H_A: \text{At least one } \beta_j \neq 0\]

glance(model_mlr) %>%
  select(r.squared, adj.r.squared, sigma, statistic, p.value, df, df.residual, nobs) %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  pivot_longer(everything(), names_to = "Statistic", values_to = "Value") %>%
  mutate(Statistic = dplyr::recode(Statistic,
    "r.squared"     = "R²",
    "adj.r.squared" = "Adjusted R²",
    "sigma"         = "Residual Std. Error (Root MSE)",
    "statistic"     = "F-statistic",
    "p.value"       = "p-value (overall F-test)",
    "df"            = "Model df (p)",
    "df.residual"   = "Residual df (n − p − 1)",
    "nobs"          = "n (observations)"
  )) %>%
  kable(caption = "Table 3. Overall Model Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 3. Overall Model Summary
Statistic	Value
R²	0.1853
Adjusted R²	0.1824
Residual Std. Error (Root MSE)	7.3516
F-statistic	62.5234
p-value (overall F-test)	0.0000
Model df (p)	29.0000
Residual df (n − p − 1)	7970.0000
n (observations)	8000.0000

Interpretation: 1) Without adjustment all predictors in the model explain only 18.53% of days of mental health variability. And adjusted predictors in the model explain 18.24% of days of mental health variability. 2) On average, our model’s predictions are off by about 7.35 days of poor mental health.

#Part 2: Tests of Hypotheses (20 Points)

##Step 1: Obtain Type III (partial) sums of squares for all predictors using car::Anova(model, type = “III”). Display the full output. Identify which predictors have statistically significant partial associations at α = 0.05.

# Type III using car::Anova()
anova_type3 <- Anova(model_mlr, type = "III") 
 anova_type3 %>%
  as.data.frame() %>%
  rownames_to_column("Source") %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(
    caption = "Table 4. Type III (Partial) Sums of Squares - car::Anova()",
    col.names = c("Source", "Sum of Sq", "df", "F value", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  footnote(general = "Each F-test uses the MSE from the full model. Each variable is tested given ALL other variables.")

Table 4. Type III (Partial) Sums of Squares - car::Anova()
Source	Sum of Sq	df	F value	p-value
(Intercept)	2844.1420	1	52.6240	0.0000
physhlth_days	37622.7952	1	696.1198	0.0000
bmi	345.2408	1	6.3879	0.0115
sex	3723.8662	1	68.9012	0.0000
exercise	497.0434	1	9.1966	0.0024
age_group	30092.1774	12	46.3986	0.0000
income	4733.8943	6	14.5982	0.0000
education	1265.1504	4	5.8521	0.0001
smoking	5101.1076	3	31.4613	0.0000
Residuals	430750.0872	7970	NA	NA
Note:
Each F-test uses the MSE from the full model. Each variable is tested given ALL other variables.

##Step 2 (Chunk test for income): Test whether income collectively improves the modelsignificantly, after accounting for all other predictors:

State H0 and HA for this test clearly.

\[H_1: \beta^* = 0\ \text{(Income doesn't add to the model, given the other variables)}\]

\[H_1: \beta^* \neq 0\]

# Reduced model (excludes sleep_hrs)
m_no_income <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +age_group +education + smoking, data=brfss_analytic)

# Compare using anova() - this performs the partial F-test
anova(m_no_income, model_mlr) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (no income categories)", "Full (with income)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 5. Partial F-Test: Does income add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 5. Partial F-Test: Does income add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced (no income categories)	7976	435484.0	NA	NA	NA	NA
Full (with income)	7970	430750.1	6	4733.894	14.5982	0

Interpretation: F-statistic = 14.60, p-value <0.001, df= 6 because without the reference 6 income categories. Conclusion: Since the p-value < 0.05, we reject $H_0$ and conclude that income adds statistically significant information to the prediction of mental health days, given that physhlth_days,bmi, sex, exercise,age_group,education, and smoking are already in the model.

###Step 3 (Chunk test for education): Repeat the same procedure for education as a group of predictors.

# Reduced model (excludes sleep_hrs)
m_no_education <- lm(menthlth_days ~ physhlth_days + bmi + sex + exercise +age_group + income + smoking, data=brfss_analytic)

# Compare using anova() - this performs the partial F-test
anova(m_no_education, model_mlr) %>%
  as.data.frame() %>%
  rownames_to_column("Model") %>%
  mutate(
    Model = c("Reduced (no education categories)", "Full (with education)"),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption = "Table 6. Partial F-Test: Does education add to the model?",
    col.names = c("Model", "Res. df", "RSS", "df", "Sum of Sq", "F", "p-value")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 6. Partial F-Test: Does education add to the model?
Model	Res. df	RSS	df	Sum of Sq	F	p-value
Reduced (no education categories)	7974	432015.2	NA	NA	NA	NA
Full (with education)	7970	430750.1	4	1265.15	5.8521	1e-04

Interpretation: F-statistic = 5.85, p-value <0.001, df= 4 because without the reference 4 education categories. Conclusion: Since the p-value < 0.05, we reject $H_0$ and conclude that education adds statistically significant information to the prediction of mental health days, given that physhlth_days,bmi, sex, exercise,age_group, income, and smoking are already in the model.

###Step 4: Write a 3 to 5 sentence synthesis interpreting the Type III results and your chunk test findings. Which predictors make the strongest independent contributions? What do the chunk tests add beyond the individual t-tests from summary()?

#Part3: Interaction Analysis

Background: Effect modification occurs when the association between a predictor and an outcome differs across levels of another variable. You will test whether the association between current smoking and mentally unhealthy days differs between men and women. ##Step 1: Create a binary smoking variable called smoker_current: >• “Current smoker” includes current daily smokers and current some-day smokers. >• “Non-smoker” includes former smokers and never smokers. Use “Non-smoker” as the reference category.

brfss_new <- brfss_analytic %>%
  mutate(
    smoker_current = factor(
      ifelse(smoking %in% c("Current daily smoker", "Current some-day smoker"), 1, 0),
      levels = c(0, 1),
      labels = c("Non-smoker", "Current Smoker")
    ))

brfss_new |>
  select(menthlth_days, smoker_current) |>
  tbl_summary(
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)" ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |>
  add_n() |>
  bold_labels() |>
  italicize_levels() |>
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2020 (n = 5,000)**") |>
  as_flex_table()

**Table 1. Descriptive Statistics — BRFSS 2020 (n = 5,000)**
Characteristic	N	N = 8,0001
Mentally unhealthy days (past 30)	8,000	4.3 (8.1)
smoker_current	8,000
Non-smoker		7,074 (88%)
Current Smoker		926 (12%)
1Mean (SD); n (%)

##Step 2: Fit two models: >• Model A (main effects only): regress menthlth_days on physhlth_days, bmi, sex, smoker_current, exercise, age_group, income, and education. >• Model B (with interaction): same as Model A, but use sex * smoker_current in the formula to include the interaction term.

modelA <- lm(menthlth_days ~physhlth_days + bmi + sex + smoker_current + exercise+ age_group+ income +education, data=brfss_new)

modelB <- lm(menthlth_days ~physhlth_days + bmi +  sex*smoker_current + exercise+ age_group+ income +education, data=brfss_new)

tidy(modelB, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Interaction Model: sex*smoker_current + rest of predictors → Mental Health Days",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Interaction Model: sex*smoker_current + rest of predictors → Mental Health Days
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	6.8994	0.9286	7.4301	0.0000	5.0791	8.7196
physhlth_days	0.2686	0.0101	26.6788	0.0000	0.2489	0.2883
bmi	0.0331	0.0132	2.5017	0.0124	0.0072	0.0590
sexFemale	1.1855	0.1775	6.6784	0.0000	0.8376	1.5335
smoker_currentCurrent Smoker	1.5208	0.3654	4.1621	0.0000	0.8045	2.2371
exerciseYes	-0.6728	0.2150	-3.1301	0.0018	-1.0942	-0.2515
age_group25-29	-0.9202	0.5196	-1.7709	0.0766	-1.9388	0.0984
age_group30-34	-0.8924	0.5059	-1.7640	0.0778	-1.8841	0.0993
age_group35-39	-1.5929	0.4875	-3.2673	0.0011	-2.5485	-0.6372
age_group40-44	-2.6286	0.4856	-5.4127	0.0000	-3.5806	-1.6766
age_group45-49	-2.8425	0.4969	-5.7209	0.0000	-3.8165	-1.8686
age_group50-54	-3.2778	0.4820	-6.8012	0.0000	-4.2226	-2.3331
age_group55-59	-4.2499	0.4740	-8.9652	0.0000	-5.1791	-3.3206
age_group60-64	-4.2640	0.4567	-9.3364	0.0000	-5.1593	-3.3688
age_group65-69	-5.2506	0.4466	-11.7563	0.0000	-6.1261	-4.3751
age_group70-74	-5.7111	0.4544	-12.5686	0.0000	-6.6018	-4.8203
age_group75-79	-5.9076	0.4702	-12.5646	0.0000	-6.8292	-4.9859
age_group80+	-6.4995	0.4736	-13.7239	0.0000	-7.4278	-5.5711
income$15,000 to < $25,000	-1.6357	0.4700	-3.4804	0.0005	-2.5570	-0.7144
income$25,000 to < $35,000	-2.0746	0.4486	-4.6243	0.0000	-2.9540	-1.1952
income$35,000 to < $50,000	-2.5455	0.4392	-5.7964	0.0000	-3.4064	-1.6847
income$50,000 to < $100,000	-3.0430	0.4116	-7.3935	0.0000	-3.8498	-2.2362
income$100,000 to < $200,000	-3.5097	0.4300	-8.1623	0.0000	-4.3526	-2.6668
income$200,000 or more	-3.8405	0.5010	-7.6651	0.0000	-4.8226	-2.8583
educationHS diploma or GED	0.1256	0.8124	0.1546	0.8772	-1.4669	1.7180
educationSome college or technical school	1.1179	0.6912	1.6172	0.1059	-0.2371	2.4729
educationCollege graduate	1.8179	0.6928	2.6239	0.0087	0.4598	3.1760
educationGraduate or professional degree	1.6691	0.6943	2.4040	0.0162	0.3081	3.0300
sexFemale:smoker_currentCurrent Smoker	1.2833	0.5171	2.4819	0.0131	0.2697	2.2968

Interpretation: This model produces 1 interaction term compared to the reference category (“smoker_current: Non-smoker”). The main effect slope for sex is 1.185, meaning that in comparisong with Male, Female have 1.185 more days of poor mental health. The main effect slofe for current smoker is 1.52, meaning that in comparison with non-smokers, current smokers have 1.52 more dayf of poor mental health. Therefore current smoker-females experience additional 1.283 days of poor mentally health, p = 0.0131 the association is statistically significant. This pattern makes substantive sense: Females who smoke may experience more poor mental health days, or vice versa women with poor mental health may have nicotine addiction.

##Step 3: Test whether the interaction is statistically significant: ###Use anova(model_A, model_B) to compare the two models.

anova(modelA, modelB) |>
  tidy() |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(caption = "Test for Coincidence: Does Smoker current Affect the Sex-Mental Health Relationship at All?") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Test for Coincidence: Does Smoker current Affect the Sex-Mental Health Relationship at All?
term	df.residual	rss	df	sumsq	statistic	p.value
menthlth_days ~ physhlth_days + bmi + sex + smoker_current + exercise + age_group + income + education	7972	432508.9	NA	NA	NA	NA
menthlth_days ~ physhlth_days + bmi + sex * smoker_current + exercise + age_group + income + education	7971	432174.9	1	333.9749	6.1598	0.0131

###19.State H0 and HA for this test.

To test whether smoker_currect $\times$ sex is significant overall, we use a partial F-test comparing the model with and without the interaction terms:

\[H_0: \beta_ = 0 \quad \text{(sex–smoking interaction is zero, slopes are equal acros ssex no interaction)}\]

\[H_A: \beta \neq 0 \quad \text{(slopes differ, interaction is present)}\]

###20.Report the F-statistic, degrees of freedom, and p-value. > F-statistic = t^2, as t-test for sexFemale:smoker_currentCurrent Smoker = 2.4819, then F-statistic is 6.16, df intercet=1, df_residual= 7971 , df=7970, p=0.0131

###21.State your conclusion at α = 0.05. >As p < 0.05, the smoker_cureent x sex interaction significantly improves model fit, meaning the association between smoking and mentally unhealthy days differs by sex. Based on the output above, we reject H_0 (F = 6.16, df = 1 and 7970, p = 0.013), and conclude that the sex x smoking interaction is statistically significant at α = 0.05.

##Step 4: Interpret the interaction using the coefficients from Model B: 22.Report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language. 23.Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient). Interpret in plain language.

# Fit separate models for males and females
mod_male <- lm(menthlth_days ~physhlth_days + bmi + smoker_current + exercise+ 
                 age_group+ income +education, data=brfss_new |> filter(sex == "Male"))
mod_female <- lm(menthlth_days ~physhlth_days + bmi + smoker_current + exercise+ 
                   age_group+ income +education, data=brfss_new|> filter(sex == "Female"))

# Compare coefficients
bind_rows(
  tidy(mod_male, conf.int = TRUE) |> mutate(Stratum = "Male"),
  tidy(mod_female, conf.int = TRUE) |> mutate(Stratum = "Female")
) |>
  filter(term == "smoker_currentCurrent Smoker") |>
  select(Stratum, estimate, std.error, conf.low, conf.high, p.value) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Stratified Analysis: Smoker_current → Mental Health Days, by Sex",
    col.names = c("Stratum", "Estimate", "SE", "CI Lower", "CI Upper", "p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Stratified Analysis: Smoker_current → Mental Health Days, by Sex
Stratum	Estimate	SE	CI Lower	CI Upper	p-value
Male	1.4125	0.3517	0.7229	2.1021	1e-04
Female	2.9140	0.4164	2.0976	3.7304	0e+00

coefs <- coef(modelB)

eff_men <- round(coefs["smoker_currentCurrent Smoker"], 3)

eff_women <- round(
  coefs["smoker_currentCurrent Smoker"] +
  coefs["sexFemale:smoker_currentCurrent Smoker"], 3)

tibble(
  Group    = c("Men (reference)", "Women"),
  Estimate = c(eff_men, eff_women)
) |>
  kable(
    caption   = "Estimated Effect of Current Smoking on Mentally Unhealthy Days by Sex (Model B)",
    col.names = c("Group", "Estimated Difference (days)")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Estimated Effect of Current Smoking on Mentally Unhealthy Days by Sex (Model B)
Group	Estimated Difference (days)
Men (reference)	1.521
Women	2.804

##Step 5.

Visualize the interaction using ggeffects::ggpredict(model_B, terms = c(“smoker_current”, “sex”)). The plot should show predicted mentally unhealthy days for each combination of smoking status and sex, with labeled axes, a descriptive title, and a legend identifying men and women.

ggpredict(modelB, terms = c("smoker_current", "sex"))%>%

plot() +
  geom_line(linewidth = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, color = NA) +
  labs(
    title    = "Predicted Mental Health Days by Current Smoking status and Sex",
    subtitle = "From interaction model: smoker_current * sex",
    x        = "Current smoking status",
    y        = "Predicted Mentally Unhealthy Days",
    color    = "Sex",
    fill     = "Sex"
  ) +
  theme_minimal(base_size = 13)

## Ignoring unknown labels:
## • linetype : "sex"
## • shape : "sex"

##Step 6

Write 3 to 5 sentences interpreting the interaction for a general public health audience. Do not include coefficient values or p-values. Focus on the substantive finding and its implications

Overall, women report on average 1.2 more mentally unhealthy days per month than men, and current smokers report on average 1.5 more mentally unhealthy days than non-smokers. However, these associations are not uniform across sex. Women who currently smoke experience greater mental health burden compared to non-smoking women, while among men the difference between smokers and non-smokers is comparatively smaller. These findings suggest that the mental health consequences of smoking fall disproportionately on women. Accordingly, smoking cessation programs targeting women should extend beyond physical health messaging and address the mental health benefits of quitting. But this is cross-sectional data, meaning we could not extablish temporality of factors. Vice-versa women who experience more mentally unhealthy days might pick up on smoking as unhealthy coping mechanism. Further researchers need to focus on social, hormonal, and stress factors among women while studying smoking risk factor and mental health.

#Part 4: Reflection (15 Points) Write a focused reflection of 250 to 400 words in continuous prose (not as a bulleted list) addressing all three topics below.

What do the results suggest about the determinants of mental health among U.S. adults? >The BRFSS 2023 survey data suggests that more days of poor mental health days, higher BMI, female sex and any smoking habit current of former are associated with increased days of poor mental health. Whereas. Whereas any physical activity in the past month, the older age groups (>=25 y.o) and higher incomes (>15,000 $) are associated with less days of poor mental health, suggesting the preventive effect. Interestingly, College graduates and people with professional degree have more days of poor mental health in comparison with people who have less than High Scholl education level. All these results are statistically significant (p<0.05, p<0.001). We have found out that

Which predictors were most strongly associated? Which were not statistically significant? > For example, participants 80 years and older reported on average 6.8 less days of poor mental health than young adults from 18 to 24. And people with yearly income $200,000 or more reported 3.8 less days of poor mental health than people with yearly income < $15,000 in the past months. But women reported on average more 1.4 days of poor mental health than men in the past months.Also current daily smokers reported on average almost 3 more days of poor mental health than non-smokers. It was also interesting to find out that women who currently smoke have 1.3 days more of poor mentally health per months in addition. Need to note that this is cross-sectional data, meaning we can not extablish temporality and causation of factors, meaning that poor mental health days and all predictors were measured at the same point of time. Without adjustment all predictors in the model explain only 18.53% of days of mental health variability. And adjusted predictors in the model explain 18.24% of days of mental health variability. We also calculated the adjusted R^2 because the regular R^2 never decreases when adding predictors, even if those predictors are completely useless. Therefore we need to adjust R^2 for all predictors in the model. In this study we calculated t-tests for all predictors, and then perdormed chunk test for education and income. A chunk test tells whether a variable is needed in the model at all.Since the p-value < 0.05, we conclude that education and income adds statistically significant information to the prediction of mental health days, given that physhlth_days,bmi, sex, exercise,age_group, income, and smoking are already in the model.

I used Claude for Part1, Step 2 to write the fitted regression equation including all terms, because the one that I have made initially wouldn’t be presented after publishing, even if it seemed fine in rmd file. I verified it by knitting and publishing. Also I used claude for Part 3, Step 4: Interpret the interaction using the coefficients from Model B. Here is my prompt: I ran a regression model (Model B) with an interaction between current smoking and sex. Here is the output: modelB <- lm(menthlth_days ~physhlth_days + bmi + sexsmoker_current + exercise+ age_group+ income +education, data=brfss_new) how do I report the estimated effect of current smoking on mentally unhealthy days among men (the coefficient for smoker_current). Interpret in plain language. AND Compute and report the estimated effect among women (smoker_current coefficient plus the interaction coefficient).It gave me example of chunk and then after changing the names of varibles for terms in “Interaction Model: sexsmoker_current + rest of predictors → Mental Health Days” output it worked.

EPI553_HW03_Azamatova_Aigerim

Azamatova Aigerim

2026-04-05

1 Loading Survey BRFSS 2023 Data