Part 0: Data Acquisition and Preparation

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(ggeffects)
brfss_raw <- read_xpt("LLCP2023.XPT")
dim(brfss_raw)
## [1] 433323    350

The 2023 BRFSS dataset was successfully imported using read_xpt(). The dataset contains 433,323 observations and 350 variables.

brfss_sub <- brfss_raw %>%
  select(
    MENTHLTH,
    PHYSHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`
  )
brfss_clean <- brfss_sub %>%
  mutate(
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH %in% 1:30 ~ as.numeric(MENTHLTH)
    ),
    
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH %in% 1:30 ~ as.numeric(PHYSHLTH)
    ),
    
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    ),
    
    sex = factor(
      SEXVAR,
      levels = c(1, 2),
      labels = c("Male", "Female")
    ),
    
    exercise = factor(case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      EXERANY2 %in% c(7, 9) ~ NA_character_
    )),
    
    age_group = factor(case_when(
      `_AGEG5YR` %in% 1:13 ~ as.character(`_AGEG5YR`),
      `_AGEG5YR` == 14 ~ NA_character_
    ),
    levels = as.character(1:13),
    labels = c(
      "18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
      "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80+"
    )),
    
    income = factor(case_when(
      `_INCOMG1` %in% 1:7 ~ as.character(`_INCOMG1`),
      `_INCOMG1` == 9 ~ NA_character_
    ),
    levels = as.character(1:7),
    labels = c(
      "Less than $15,000",
      "$15,000-$24,999",
      "$25,000-$34,999",
      "$35,000-$49,999",
      "$50,000-$99,999",
      "$100,000-$199,999",
      "$200,000 or more"
    )),
    
    education = factor(case_when(
      EDUCA %in% c(1, 2) ~ "Less than high school",
      EDUCA == 3 ~ "High school diploma or GED",
      EDUCA == 4 ~ "Some college or technical school",
      EDUCA == 5 ~ "College graduate",
      EDUCA == 6 ~ "Graduate or professional degree",
      EDUCA == 9 ~ NA_character_
    )),
    
    smoking = factor(case_when(
      `_SMOKER3` == 1 ~ "Current daily smoker",
      `_SMOKER3` == 2 ~ "Current some-day smoker",
      `_SMOKER3` == 3 ~ "Former smoker",
      `_SMOKER3` == 4 ~ "Never smoker",
      `_SMOKER3` == 9 ~ NA_character_
    ))
  ) %>%
  select(
    menthlth_days, physhlth_days, bmi, sex, exercise,
    age_group, income, education, smoking
  )

All nine variables were recoded according to the assignment instructions. Continuous variables were cleaned and categorical variables were converted to factors with appropriate labels.

missing_report <- brfss_clean %>%
  summarise(
    menthlth_n_miss = sum(is.na(menthlth_days)),
    menthlth_pct_miss = mean(is.na(menthlth_days)) * 100,
    
    physhlth_n_miss = sum(is.na(physhlth_days)),
    physhlth_pct_miss = mean(is.na(physhlth_days)) * 100,
    
    bmi_n_miss = sum(is.na(bmi)),
    bmi_pct_miss = mean(is.na(bmi)) * 100,
    
    smoking_n_miss = sum(is.na(smoking)),
    smoking_pct_miss = mean(is.na(smoking)) * 100
  )

missing_report
## # A tibble: 1 × 8
##   menthlth_n_miss menthlth_pct_miss physhlth_n_miss physhlth_pct_miss bmi_n_miss
##             <int>             <dbl>           <int>             <dbl>      <int>
## 1            8108              1.87           10785              2.49      40535
## # ℹ 3 more variables: bmi_pct_miss <dbl>, smoking_n_miss <int>,
## #   smoking_pct_miss <dbl>

Before creating the analytic dataset, I assessed missingness for key variables. There was some missing data across mentally unhealthy days, physical health days, BMI, and smoking. These missing values were handled by removing incomplete observations in the analytic sample.

set.seed(1220)

brfss_analytic <- brfss_clean %>%
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise, smoking) %>%
  slice_sample(n = 8000)

nrow(brfss_analytic)
## [1] 8000
names(brfss_analytic)
## [1] "menthlth_days" "physhlth_days" "bmi"           "sex"          
## [5] "exercise"      "age_group"     "income"        "education"    
## [9] "smoking"

After removing observations with missing values on the analysis variables and setting a seed for reproducibility, a random analytic sample of 8,000 respondents was selected.

tbl_summary(brfss_analytic, by = sex)
Characteristic Male
N = 3,956
1
Female
N = 4,044
1
menthlth_days 0 (0, 3) 0 (0, 5)
physhlth_days 0 (0, 3) 0 (0, 5)
bmi 28 (25, 32) 27 (23, 32)
exercise 3,118 (79%) 3,003 (74%)
age_group

    18-24 277 (7.1%) 192 (4.8%)
    25-29 229 (5.8%) 207 (5.2%)
    30-34 238 (6.1%) 196 (4.9%)
    35-39 285 (7.3%) 257 (6.4%)
    40-44 264 (6.7%) 252 (6.3%)
    45-49 274 (7.0%) 281 (7.0%)
    50-54 268 (6.8%) 299 (7.5%)
    55-59 323 (8.3%) 323 (8.1%)
    60-64 345 (8.8%) 389 (9.7%)
    65-69 434 (11%) 443 (11%)
    70-74 381 (9.7%) 448 (11%)
    75-79 295 (7.5%) 332 (8.3%)
    80+ 302 (7.7%) 389 (9.7%)
    Unknown 41 36
income

    Less than $15,000 141 (4.1%) 211 (6.3%)
    $15,000-$24,999 243 (7.1%) 353 (10%)
    $25,000-$34,999 315 (9.2%) 390 (12%)
    $35,000-$49,999 440 (13%) 507 (15%)
    $50,000-$99,999 1,068 (31%) 1,018 (30%)
    $100,000-$199,999 896 (26%) 691 (20%)
    $200,000 or more 319 (9.3%) 206 (6.1%)
    Unknown 534 668
education

    College graduate 971 (25%) 1,134 (28%)
    Graduate or professional degree 1,753 (44%) 1,757 (44%)
    High school diploma or GED 130 (3.3%) 145 (3.6%)
    Less than high school 79 (2.0%) 71 (1.8%)
    Some college or technical school 1,012 (26%) 931 (23%)
    Unknown 11 6
smoking

    Current daily smoker 328 (8.3%) 312 (7.7%)
    Current some-day smoker 143 (3.6%) 138 (3.4%)
    Former smoker 1,233 (31%) 984 (24%)
    Never smoker 2,252 (57%) 2,610 (65%)
1 Median (Q1, Q3); n (%)

Table 1 presents the descriptive characteristics of the analytic sample stratified by sex.

Part 1: Multiple Linear Regression

model_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_analytic
)

summary(model_full)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex + exercise + 
##     age_group + income + education + smoking, data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.5285  -4.0855  -1.8014   0.7326  31.1855 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               10.41381    0.81846  12.724  < 2e-16
## physhlth_days                              0.25166    0.01126  22.342  < 2e-16
## bmi                                        0.03406    0.01484   2.295 0.021766
## sexFemale                                  1.57494    0.19117   8.238  < 2e-16
## exerciseYes                               -0.80025    0.24126  -3.317 0.000915
## age_group25-29                            -0.60493    0.57604  -1.050 0.293690
## age_group30-34                            -1.34121    0.57389  -2.337 0.019467
## age_group35-39                            -1.38508    0.54370  -2.548 0.010871
## age_group40-44                            -2.31874    0.55246  -4.197 2.74e-05
## age_group45-49                            -3.20491    0.55166  -5.810 6.55e-09
## age_group50-54                            -3.11723    0.54420  -5.728 1.06e-08
## age_group55-59                            -4.05950    0.53368  -7.607 3.20e-14
## age_group60-64                            -4.82970    0.52129  -9.265  < 2e-16
## age_group65-69                            -5.46558    0.50572 -10.808  < 2e-16
## age_group70-74                            -5.89672    0.51326 -11.489  < 2e-16
## age_group75-79                            -6.18662    0.54514 -11.349  < 2e-16
## age_group80+                              -6.36616    0.54056 -11.777  < 2e-16
## income$15,000-$24,999                     -1.35299    0.52349  -2.585 0.009771
## income$25,000-$34,999                     -1.75831    0.51337  -3.425 0.000618
## income$35,000-$49,999                     -2.28505    0.49673  -4.600 4.30e-06
## income$50,000-$99,999                     -2.73663    0.47013  -5.821 6.12e-09
## income$100,000-$199,999                   -3.12692    0.49218  -6.353 2.25e-10
## income$200,000 or more                    -3.57507    0.57632  -6.203 5.86e-10
## educationGraduate or professional degree  -0.15323    0.24286  -0.631 0.528110
## educationHigh school diploma or GED       -0.71677    0.56879  -1.260 0.207656
## educationLess than high school            -1.87172    0.74728  -2.505 0.012279
## educationSome college or technical school -0.33269    0.26897  -1.237 0.216157
## smokingCurrent some-day smoker            -0.84977    0.59834  -1.420 0.155592
## smokingFormer smoker                      -1.20145    0.38179  -3.147 0.001657
## smokingNever smoker                       -2.33667    0.36530  -6.397 1.70e-10
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## exerciseYes                               ***
## age_group25-29                               
## age_group30-34                            *  
## age_group35-39                            *  
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000-$24,999                     ** 
## income$25,000-$34,999                     ***
## income$35,000-$49,999                     ***
## income$50,000-$99,999                     ***
## income$100,000-$199,999                   ***
## income$200,000 or more                    ***
## educationGraduate or professional degree     
## educationHigh school diploma or GED          
## educationLess than high school            *  
## educationSome college or technical school    
## smokingCurrent some-day smoker               
## smokingFormer smoker                      ** 
## smokingNever smoker                       ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.707 on 6717 degrees of freedom
##   (1253 observations deleted due to missingness)
## Multiple R-squared:  0.1659, Adjusted R-squared:  0.1623 
## F-statistic: 46.08 on 29 and 6717 DF,  p-value: < 2.2e-16

Regression Equation

The fitted multiple linear regression model can be written as:

menthlth_days = β₀ + β₁(physhlth_days) + β₂(bmi) + β₃(sexFemale) + β₄(exerciseYes) + … + ε

Using estimated coefficients (rounded to 3 decimal places):

menthlth_days = 10.414 + 0.256(physhlth_days) + 0.034(bmi) + 1.575(sexFemale) − 0.800(exerciseYes) + … + ε

glance(model_full)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df  logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>   <dbl>  <dbl>  <dbl>
## 1     0.166         0.162  7.71      46.1 1.28e-238    29 -23337. 46736. 46947.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
sigma(model_full)
## [1] 7.707032

The model R-squared was 0.165, meaning that about 16.5% of the variation in mentally unhealthy days was explained by the predictors.

The adjusted R-squared was 0.162.

The residual standard error was approximately 7.71, indicating that predicted values were off by about 7.71 mentally unhealthy days on average.

The overall F-test was 46.08 with 29 and 6717 degrees of freedom (p < 0.001), indicating that the model as a whole was statistically significant.

tidy(model_full)
## # A tibble: 30 × 5
##    term           estimate std.error statistic   p.value
##    <chr>             <dbl>     <dbl>     <dbl>     <dbl>
##  1 (Intercept)     10.4       0.818      12.7  1.15e- 36
##  2 physhlth_days    0.252     0.0113     22.3  1.02e-106
##  3 bmi              0.0341    0.0148      2.29 2.18e-  2
##  4 sexFemale        1.57      0.191       8.24 2.08e- 16
##  5 exerciseYes     -0.800     0.241      -3.32 9.15e-  4
##  6 age_group25-29  -0.605     0.576      -1.05 2.94e-  1
##  7 age_group30-34  -1.34      0.574      -2.34 1.95e-  2
##  8 age_group35-39  -1.39      0.544      -2.55 1.09e-  2
##  9 age_group40-44  -2.32      0.552      -4.20 2.74e-  5
## 10 age_group45-49  -3.20      0.552      -5.81 6.55e-  9
## # ℹ 20 more rows

Holding all other variables constant, each additional physically unhealthy day was associated with an average increase of 0.26 mentally unhealthy days.

Holding all else constant, a one-unit increase in BMI was associated with 0.03 more mentally unhealthy days.

Holding all other variables constant, females reported 1.57 more mentally unhealthy days on average compared with males.

Respondents who reported any exercise had 0.80 fewer mentally unhealthy days on average compared with those who did not exercise.

Part 2: Tests of Hypotheses

Anova(model_full, type = "III")
## Anova Table (Type III tests)
## 
## Response: menthlth_days
##               Sum Sq   Df  F value    Pr(>F)    
## (Intercept)     9616    1 161.8925 < 2.2e-16 ***
## physhlth_days  29650    1 499.1802 < 2.2e-16 ***
## bmi              313    1   5.2668  0.021766 *  
## sex             4031    1  67.8684 < 2.2e-16 ***
## exercise         653    1  11.0019  0.000915 ***
## age_group      25927   12  36.3740 < 2.2e-16 ***
## income          3447    6   9.6721 1.258e-10 ***
## education        452    4   1.9027  0.107064    
## smoking         3426    3  19.2258 2.075e-12 ***
## Residuals     398979 6717                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Using Type III partial sums of squares, I evaluated the independent contribution of each predictor after adjusting for all other variables in the model.

Predictors with p-values less than 0.05 were considered to have statistically significant partial associations with mentally unhealthy days.

brfss_hyp <- brfss_analytic %>%
  drop_na(menthlth_days, physhlth_days, bmi, sex, exercise,
          age_group, income, education, smoking)

model_full <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + education + smoking,
  data = brfss_hyp
)

model_no_income <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + education + smoking,
  data = brfss_hyp
)

anova(model_no_income, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     education + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   6723 402426                                  
## 2   6717 398979  6      3447 9.6721 1.258e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H0: All income coefficients = 0
HA: At least one income coefficient ≠ 0

The comparison between the reduced model (excluding income) and the full model shows a statistically significant difference (p < 0.001).

This indicates that income significantly improves model fit and should be retained in the model.

The comparison between the reduced model (excluding education) and the full model is not statistically significant (p = 0.107).

This suggests that education does not significantly improve model fit and could be removed from the model.

model_no_education <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + exercise +
    age_group + income + smoking,
  data = brfss_hyp
)

anova(model_no_education, model_full)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + smoking
## Model 2: menthlth_days ~ physhlth_days + bmi + sex + exercise + age_group + 
##     income + education + smoking
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   6721 399431                           
## 2   6717 398979  4    452.06 1.9027 0.1071

The null hypothesis for the education chunk test was that all education coefficients were equal to zero. The alternative hypothesis was that at least one education coefficient was not equal to zero.

The partial F-test comparing the reduced model (without education) to the full model resulted in an F-statistic of 1.90 with 4 and 6717 degrees of freedom (p = 0.107).

At α = 0.05, I failed to reject the null hypothesis, indicating that education does not improve model fit as a group after adjusting for the other variables.

Part 3: Interaction Analysis

brfss_analytic <- brfss_analytic %>%
  mutate(
    smoker_current = factor(
      case_when(
        smoking %in% c("Current daily smoker", "Current some-day smoker") ~ "Current smoker",
        smoking %in% c("Former smoker", "Never smoker") ~ "Non-smoker"
      ),
      levels = c("Non-smoker", "Current smoker")
    )
  )

table(brfss_analytic$smoker_current, useNA = "ifany")
## 
##     Non-smoker Current smoker 
##           7079            921

The four-category smoking variable was collapsed into a binary variable, smoker_current. Current daily smokers and current some-day smokers were coded as current smokers, while former smokers and never smokers were coded as non-smokers. Non-smoker was used as the reference category.

model_A <- lm(
  menthlth_days ~ physhlth_days + bmi + sex + smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

model_B <- lm(
  menthlth_days ~ physhlth_days + bmi + sex * smoker_current +
    exercise + age_group + income + education,
  data = brfss_analytic
)

summary(model_B)
## 
## Call:
## lm(formula = menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education, data = brfss_analytic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.7193  -4.0653  -1.7907   0.5594  30.6336 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                                8.27317    0.77275  10.706  < 2e-16
## physhlth_days                              0.25397    0.01127  22.537  < 2e-16
## bmi                                        0.03476    0.01486   2.339 0.019343
## sexFemale                                  1.30893    0.20258   6.461 1.11e-10
## smoker_currentCurrent smoker               0.96148    0.41707   2.305 0.021180
## exerciseYes                               -0.83293    0.24153  -3.449 0.000567
## age_group25-29                            -0.45666    0.57622  -0.793 0.428088
## age_group30-34                            -1.17300    0.57379  -2.044 0.040962
## age_group35-39                            -1.12215    0.54224  -2.069 0.038541
## age_group40-44                            -2.01938    0.54993  -3.672 0.000242
## age_group45-49                            -2.91126    0.54962  -5.297 1.22e-07
## age_group50-54                            -2.79361    0.54158  -5.158 2.56e-07
## age_group55-59                            -3.78191    0.53154  -7.115 1.24e-12
## age_group60-64                            -4.54549    0.51910  -8.757  < 2e-16
## age_group65-69                            -5.15970    0.50279 -10.262  < 2e-16
## age_group70-74                            -5.52022    0.50909 -10.843  < 2e-16
## age_group75-79                            -5.79764    0.54066 -10.723  < 2e-16
## age_group80+                              -5.99566    0.53654 -11.175  < 2e-16
## income$15,000-$24,999                     -1.31893    0.52407  -2.517 0.011868
## income$25,000-$34,999                     -1.71870    0.51408  -3.343 0.000833
## income$35,000-$49,999                     -2.21506    0.49733  -4.454 8.57e-06
## income$50,000-$99,999                     -2.65879    0.47067  -5.649 1.68e-08
## income$100,000-$199,999                   -3.07511    0.49280  -6.240 4.64e-10
## income$200,000 or more                    -3.57454    0.57723  -6.193 6.27e-10
## educationGraduate or professional degree  -0.26206    0.24226  -1.082 0.279403
## educationHigh school diploma or GED       -0.58424    0.56903  -1.027 0.304590
## educationLess than high school            -1.92177    0.74838  -2.568 0.010253
## educationSome college or technical school -0.29661    0.26936  -1.101 0.270869
## sexFemale:smoker_currentCurrent smoker     1.44209    0.58378   2.470 0.013526
##                                              
## (Intercept)                               ***
## physhlth_days                             ***
## bmi                                       *  
## sexFemale                                 ***
## smoker_currentCurrent smoker              *  
## exerciseYes                               ***
## age_group25-29                               
## age_group30-34                            *  
## age_group35-39                            *  
## age_group40-44                            ***
## age_group45-49                            ***
## age_group50-54                            ***
## age_group55-59                            ***
## age_group60-64                            ***
## age_group65-69                            ***
## age_group70-74                            ***
## age_group75-79                            ***
## age_group80+                              ***
## income$15,000-$24,999                     *  
## income$25,000-$34,999                     ***
## income$35,000-$49,999                     ***
## income$50,000-$99,999                     ***
## income$100,000-$199,999                   ***
## income$200,000 or more                    ***
## educationGraduate or professional degree     
## educationHigh school diploma or GED          
## educationLess than high school            *  
## educationSome college or technical school    
## sexFemale:smoker_currentCurrent smoker    *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.719 on 6718 degrees of freedom
##   (1253 observations deleted due to missingness)
## Multiple R-squared:  0.1632, Adjusted R-squared:  0.1597 
## F-statistic: 46.79 on 28 and 6718 DF,  p-value: < 2.2e-16

Model A included only main effects, while Model B added the interaction term between sex and current smoking using sex * smoker_current

anova(model_A, model_B)
## Analysis of Variance Table
## 
## Model 1: menthlth_days ~ physhlth_days + bmi + sex + smoker_current + 
##     exercise + age_group + income + education
## Model 2: menthlth_days ~ physhlth_days + bmi + sex * smoker_current + 
##     exercise + age_group + income + education
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   6719 400648                              
## 2   6718 400285  1    363.59 6.1022 0.01353 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The null hypothesis was that there is no interaction between sex and current smoking, meaning that the association between current smoking and mentally unhealthy days is the same for males and females.

The alternative hypothesis was that the interaction term is not equal to zero.

The partial F-test comparing Model A and Model B resulted in an F-statistic of 6.10 with 1 and 6718 degrees of freedom (p = 0.0135).

At α = 0.05, I rejected the null hypothesis, indicating that the association between current smoking and mentally unhealthy days differs by sex.

coef(model_B)
##                               (Intercept) 
##                                8.27317047 
##                             physhlth_days 
##                                0.25396580 
##                                       bmi 
##                                0.03476085 
##                                 sexFemale 
##                                1.30893325 
##              smoker_currentCurrent smoker 
##                                0.96148125 
##                               exerciseYes 
##                               -0.83292911 
##                            age_group25-29 
##                               -0.45666216 
##                            age_group30-34 
##                               -1.17299902 
##                            age_group35-39 
##                               -1.12214553 
##                            age_group40-44 
##                               -2.01938410 
##                            age_group45-49 
##                               -2.91126327 
##                            age_group50-54 
##                               -2.79360556 
##                            age_group55-59 
##                               -3.78190967 
##                            age_group60-64 
##                               -4.54549146 
##                            age_group65-69 
##                               -5.15969531 
##                            age_group70-74 
##                               -5.52022406 
##                            age_group75-79 
##                               -5.79763994 
##                              age_group80+ 
##                               -5.99565838 
##                     income$15,000-$24,999 
##                               -1.31893291 
##                     income$25,000-$34,999 
##                               -1.71869954 
##                     income$35,000-$49,999 
##                               -2.21506000 
##                     income$50,000-$99,999 
##                               -2.65878512 
##                   income$100,000-$199,999 
##                               -3.07510879 
##                    income$200,000 or more 
##                               -3.57453693 
##  educationGraduate or professional degree 
##                               -0.26206215 
##       educationHigh school diploma or GED 
##                               -0.58423626 
##            educationLess than high school 
##                               -1.92177070 
## educationSome college or technical school 
##                               -0.29660903 
##    sexFemale:smoker_currentCurrent smoker 
##                                1.44209240
coefs <- coef(model_B)

effect_men <- coefs["smoker_currentCurrent smoker"]
effect_women <- coefs["smoker_currentCurrent smoker"] + coefs["sexFemale:smoker_currentCurrent smoker"]

effect_men
## smoker_currentCurrent smoker 
##                    0.9614813
effect_women
## smoker_currentCurrent smoker 
##                     2.403574
effect_women - effect_men
## smoker_currentCurrent smoker 
##                     1.442092

Among males, current smoking was associated with 0.96 more mentally unhealthy days, holding all other variables constant.

Among females, the estimated effect of current smoking was 2.40 more mentally unhealthy days, holding all other variables constant.

Together, these estimates suggest that the association between smoking and mentally unhealthy days was stronger among females by about 1.44 days.

plot_dat <- ggpredict(model_B, terms = c("smoker_current", "sex"))

plot(plot_dat) +
  labs(
    title = "Predicted mentally unhealthy days by smoking status and sex",
    x = "Smoking status",
    y = "Predicted mentally unhealthy days",
    color = "Sex"
  )
## Ignoring unknown labels:
## • linetype : "sex"
## • shape : "sex"

The interaction plot shows the predicted number of mentally unhealthy days for each combination of smoking status and sex.

The relationship between smoking and mental health appeared to differ somewhat by sex. Current smokers reported more mentally unhealthy days than non-smokers in both groups, but the size of that difference was greater among females.

This suggests that the mental health burden associated with smoking may not be the same for everyone. From a public health perspective, smoking-related mental health interventions may need to account for differences across population groups.

Part 4: Reflection

This analysis showed that mentally unhealthy days were associated with several behavioral and demographic factors, particularly physical health, smoking, sex, age group, and income. Individuals with more physically unhealthy days and current smokers reported more mentally unhealthy days, while exercise and higher income were associated with fewer mentally unhealthy days.

A key limitation of this analysis is that the BRFSS is cross-sectional, meaning that causal relationships cannot be established. Additionally, unmeasured confounding variables such as stress, access to healthcare, or social support may influence the observed associations.

Adjusted R-squared is important because it accounts for the number of predictors in the model and prevents overestimating model fit. Chunk tests are useful because they assess whether groups of variables (like income or education) contribute to the model overall, rather than relying only on individual coefficient tests.

I used AI tools to assist with code troubleshooting and structuring interpretations, but I verified all results and interpretations independently.