Part 0: Data Preparation

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))
brfss_raw <- read_xpt("LLCP2023.XPT")

dim(brfss_raw)
## [1] 433323    350

The raw 2023 BRFSS dataset was imported using read_xpt(). The dataset contains 433,323 observations and 350 variables.

For this assignment, I selected all 11 available candidate predictors from the updated assignment table after SLEPTIM1 was removed: mental health days, BMI, sex, exercise, depression history, age group, income, education, smoking status, general health, and marital status. I chose these predictors because physical health burden can be related to mental health, body weight, health behaviors, general health status, and social or demographic factors. This gives a broad set of individual, behavioral, and health-related predictors for the maximum model.

brfss_sub <- brfss_raw %>%
  select(
    PHYSHLTH, MENTHLTH, `_BMI5`, SEXVAR, EXERANY2, ADDEPEV3,
    `_AGEG5YR`, `_INCOMG1`, EDUCA, `_SMOKER3`, GENHLTH, MARITAL
  )
brfss_clean <- brfss_sub %>%
  mutate(
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH %in% 1:30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    ),
    
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH %in% 1:30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    ),
    
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    ),
    
    sex = factor(
      case_when(
        SEXVAR == 1 ~ "Male",
        SEXVAR == 2 ~ "Female",
        TRUE ~ NA_character_
      ),
      levels = c("Male", "Female")
    ),
    
    exercise = factor(
      case_when(
        EXERANY2 == 1 ~ "Yes",
        EXERANY2 == 2 ~ "No",
        EXERANY2 %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("No", "Yes")
    ),
    
    depression = factor(
      case_when(
        ADDEPEV3 == 1 ~ "Yes",
        ADDEPEV3 == 2 ~ "No",
        ADDEPEV3 %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("No", "Yes")
    ),
    
    age_group = factor(
      case_when(
        `_AGEG5YR` %in% 1:3 ~ "18-34",
        `_AGEG5YR` %in% 4:6 ~ "35-49",
        `_AGEG5YR` %in% 7:9 ~ "50-64",
        `_AGEG5YR` %in% 10:13 ~ "65+",
        `_AGEG5YR` == 14 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("18-34", "35-49", "50-64", "65+")
    ),
    
    income = factor(
      case_when(
        `_INCOMG1` %in% 1:2 ~ "Less than $25K",
        `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
        `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
        `_INCOMG1` == 7 ~ "$100K+",
        `_INCOMG1` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")
    ),
    
    education = factor(
      case_when(
        EDUCA %in% 1:3 ~ "Less than HS",
        EDUCA == 4 ~ "HS/GED",
        EDUCA == 5 ~ "Some college",
        EDUCA == 6 ~ "College graduate",
        EDUCA == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Less than HS", "HS/GED", "Some college", "College graduate")
    ),
    
    smoking = factor(
      case_when(
        `_SMOKER3` %in% 1:2 ~ "Current",
        `_SMOKER3` == 3 ~ "Former",
        `_SMOKER3` == 4 ~ "Never",
        `_SMOKER3` == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Never", "Former", "Current")
    ),
    
    gen_health = factor(
      case_when(
        GENHLTH %in% 1:2 ~ "Excellent/Very Good",
        GENHLTH == 3 ~ "Good",
        GENHLTH %in% 4:5 ~ "Fair/Poor",
        GENHLTH %in% c(7, 9) ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Excellent/Very Good", "Good", "Fair/Poor")
    ),
    
    marital = factor(
      case_when(
        MARITAL %in% c(1, 6) ~ "Married/Partnered",
        MARITAL %in% 2:4 ~ "Previously married",
        MARITAL == 5 ~ "Never married",
        MARITAL == 9 ~ NA_character_,
        TRUE ~ NA_character_
      ),
      levels = c("Married/Partnered", "Previously married", "Never married")
    )
  ) %>%
  select(
    physhlth_days, menthlth_days, bmi, sex, exercise, depression,
    age_group, income, education, smoking, gen_health, marital
  )
missing_report <- brfss_clean %>%
  summarise(
    across(
      c(physhlth_days, menthlth_days, bmi, depression, gen_health),
      list(
        n_missing = ~sum(is.na(.)),
        pct_missing = ~mean(is.na(.)) * 100
      ),
      .names = "{.col}_{.fn}"
    )
  ) %>%
  pivot_longer(
    everything(),
    names_to = c("variable", ".value"),
    names_pattern = "(.*)_(n_missing|pct_missing)"
  )

missing_report %>%
  kable(digits = 2, caption = "Missingness Before Complete-Case Exclusion") %>%
  kable_styling(full_width = FALSE)
Missingness Before Complete-Case Exclusion
variable n_missing pct_missing
physhlth_days 10785 2.49
menthlth_days 8108 1.87
bmi 40535 9.35
depression 2587 0.60
gen_health 1262 0.29

Before removing missing data, 10,785 observations were missing on physical health days, which was 2.49% of the raw dataset. Mental health days had 8,108 missing observations, or 1.87%, while BMI had 40,535 missing observations, or 9.35%. Depression history and general health had smaller amounts of missing data.

set.seed(1220)

brfss_analytic <- brfss_clean %>%
  drop_na() %>%
  slice_sample(n = 8000)

nrow(brfss_analytic)
## [1] 8000

After complete-case exclusion and random sampling, the final analytic sample included 8,000 respondents.

brfss_analytic %>%
  tbl_summary(
    statistic = list(all_continuous() ~ "{median} ({p25}, {p75})")
  )
Characteristic N = 8,0001
physhlth_days 0 (0, 4)
menthlth_days 0 (0, 5)
bmi 27 (24, 32)
sex
    Male 3,971 (50%)
    Female 4,029 (50%)
exercise 6,157 (77%)
depression 1,776 (22%)
age_group
    18-34 1,294 (16%)
    35-49 1,657 (21%)
    50-64 2,109 (26%)
    65+ 2,940 (37%)
income
    Less than $25K 1,090 (14%)
    $25K-$49K 1,931 (24%)
    $50K-$99K 4,297 (54%)
    $100K+ 682 (8.5%)
education
    Less than HS 391 (4.9%)
    HS/GED 1,887 (24%)
    Some college 2,115 (26%)
    College graduate 3,607 (45%)
smoking
    Never 4,808 (60%)
    Former 2,230 (28%)
    Current 962 (12%)
gen_health
    Excellent/Very Good 3,926 (49%)
    Good 2,648 (33%)
    Fair/Poor 1,426 (18%)
marital
    Married/Partnered 4,582 (57%)
    Previously married 2,050 (26%)
    Never married 1,368 (17%)
1 Median (Q1, Q3); n (%)

The analytic sample had a median of 0 physically unhealthy days and 0 mentally unhealthy days. About half of the sample was male and half was female. Most respondents reported exercising in the past 30 days, and the largest age group was adults 65 years and older.

Part 1: Model Selection for Linear Regression

Step 1: Maximum Model

max_mod <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic
)

summary(max_mod)
## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health + 
##     marital, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.20132    0.62746   3.508 0.000453 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## income$25K-$49K           -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50K-$99K           -1.52385    0.27607  -5.520 3.50e-08 ***
## income$100K+              -1.31580    0.39821  -3.304 0.000956 ***
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## smokingFormer              0.43777    0.18776   2.332 0.019749 *  
## smokingCurrent             0.46414    0.26617   1.744 0.081241 .  
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16
max_fit <- tibble(
  r_squared = glance(max_mod)$r.squared,
  adj_r_squared = glance(max_mod)$adj.r.squared,
  AIC = AIC(max_mod),
  BIC = BIC(max_mod)
)

max_fit %>%
  kable(digits = 3, caption = "Maximum Linear Model Fit Statistics") %>%
  kable_styling(full_width = FALSE)
Maximum Linear Model Fit Statistics
r_squared adj_r_squared AIC BIC
0.326 0.324 54115.05 54268.77

The maximum linear regression model explained about 32.6% of the variation in physically unhealthy days (R-squared = 0.326). The adjusted R-squared was 0.324, meaning the model still explained about 32.4% of the variation after accounting for the number of predictors. The AIC was 54115.05 and the BIC was 54268.77. Overall, the full model explains a meaningful amount of variation in physical health burden, although a large amount of variation remains unexplained.

Step 2: Best Subsets Regression

best_sub <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic,
  nvmax = length(coef(max_mod)) - 1
)

best_sum <- summary(best_sub)

best_table <- tibble(
  model_size = seq_along(best_sum$adjr2),
  adj_r_squared = best_sum$adjr2,
  bic = best_sum$bic
)

best_table %>%
  kable(digits = 3, caption = "Best Subsets Criteria by Model Size") %>%
  kable_styling(full_width = FALSE)
Best Subsets Criteria by Model Size
model_size adj_r_squared bic
1 0.263 -2420.699
2 0.296 -2778.124
3 0.307 -2905.972
4 0.313 -2966.096
5 0.316 -2989.725
6 0.318 -3007.984
7 0.320 -3016.833
8 0.321 -3021.258
9 0.321 -3021.914
10 0.322 -3021.173
11 0.323 -3018.289
12 0.323 -3016.992
13 0.324 -3013.964
14 0.324 -3008.540
15 0.324 -3002.541
16 0.324 -2995.632
17 0.324 -2988.571
18 0.324 -2981.413
19 0.324 -2974.060
20 0.324 -2965.611
ggplot(best_table, aes(x = model_size, y = adj_r_squared)) +
  geom_point() +
  geom_line() +
  labs(
    title = "Adjusted R-squared by Model Size",
    x = "Number of Predictors",
    y = "Adjusted R-squared"
  )

ggplot(best_table, aes(x = model_size, y = bic)) +
  geom_point() +
  geom_line() +
  labs(
    title = "BIC by Model Size",
    x = "Number of Predictors",
    y = "BIC"
  )

best_adjr2_size <- which.max(best_sum$adjr2)
best_bic_size <- which.min(best_sum$bic)

best_adjr2_size
## [1] 19
best_bic_size
## [1] 9
coef(best_sub, id = best_adjr2_size)
##               (Intercept)             menthlth_days                       bmi 
##                2.42326705                0.18384599               -0.01869124 
##                 sexFemale               exerciseYes             depressionYes 
##                0.23712114               -1.92576399                0.77637429 
##            age_group35-49            age_group50-64              age_group65+ 
##                0.76294842                1.39939197                1.73028710 
##           income$25K-$49K           income$50K-$99K              income$100K+ 
##               -1.07158786               -1.50068492               -1.29329261 
##     educationSome college educationCollege graduate             smokingFormer 
##                0.75452784                0.80071745                0.43731142 
##            smokingCurrent            gen_healthGood       gen_healthFair/Poor 
##                0.46323888                1.40757016               10.05466396 
## maritalPreviously married      maritalNever married 
##               -0.26397878               -0.40454371
coef(best_sub, id = best_bic_size)
##         (Intercept)       menthlth_days         exerciseYes       depressionYes 
##           1.4036767           0.1881763          -1.8794631           0.8954395 
##      age_group35-49      age_group50-64        age_group65+     income$50K-$99K 
##           1.0035144           1.6583700           2.0548414          -0.5080191 
##      gen_healthGood gen_healthFair/Poor 
##           1.3542065          10.0640869

The best adjusted R-squared model occurred at model size 19, while the best BIC model occurred at model size 9. The two criteria did not agree exactly. Adjusted R-squared favored a larger model because it rewards improved explanatory power while only lightly penalizing complexity. BIC favored the simpler 9-predictor model because it penalizes extra predictors more strongly. I would be cautious about relying only on best subsets here because regsubsets() can select some dummy variables from a factor while excluding other levels of the same factor, which makes interpretation less clean.

Step 3: Stepwise Selection Methods

null_mod <- lm(physhlth_days ~ 1, data = brfss_analytic)

back_mod <- step(max_mod, direction = "backward", trace = FALSE)

fwd_mod <- step(
  null_mod,
  scope = formula(max_mod),
  direction = "forward",
  trace = FALSE
)

both_mod <- step(
  null_mod,
  scope = formula(max_mod),
  direction = "both",
  trace = FALSE
)

formula(back_mod)
## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
formula(fwd_mod)
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
formula(both_mod)
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
step_compare <- tibble(
  Method = c("Backward", "Forward", "Stepwise"),
  Formula = c(
    paste(deparse(formula(back_mod)), collapse = " "),
    paste(deparse(formula(fwd_mod)), collapse = " "),
    paste(deparse(formula(both_mod)), collapse = " ")
  ),
  Adj_R2 = c(
    glance(back_mod)$adj.r.squared,
    glance(fwd_mod)$adj.r.squared,
    glance(both_mod)$adj.r.squared
  ),
  AIC = c(AIC(back_mod), AIC(fwd_mod), AIC(both_mod)),
  BIC = c(BIC(back_mod), BIC(fwd_mod), BIC(both_mod))
)

step_compare %>%
  kable(digits = 3, caption = "Comparison of Stepwise Selection Methods") %>%
  kable_styling(full_width = FALSE)
Comparison of Stepwise Selection Methods
Method Formula Adj_R2 AIC BIC
Backward physhlth_days ~ menthlth_days + sex + exercise + depression + age_group + income + education + smoking + gen_health 0.324 54114.57 54247.32
Forward physhlth_days ~ gen_health + menthlth_days + exercise + age_group + income + depression + education + smoking + sex 0.324 54114.57 54247.32
Stepwise physhlth_days ~ gen_health + menthlth_days + exercise + age_group + income + depression + education + smoking + sex 0.324 54114.57 54247.32

Backward elimination, forward selection, and stepwise selection all produced the same final model. The final model retained general health, mental health days, exercise, age group, income, depression history, education, smoking, and sex. BMI and marital status were excluded by the stepwise methods, suggesting they did not improve the AIC enough after accounting for the other predictors. The three methods had the same adjusted R-squared of about 0.324, the same AIC of 54114.57, and the same BIC of 54247.32. Since the three methods agreed, I felt more confident using this as the final linear model.

Step 4: Final Model Selection and Interpretation

final_lm <- both_mod

tidy(final_lm, conf.int = TRUE) %>%
  kable(digits = 3, caption = "Final Linear Regression Model") %>%
  kable_styling(full_width = FALSE)
Final Linear Regression Model
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 1.385 0.487 2.843 0.004 0.430 2.341
gen_healthGood 1.364 0.184 7.416 0.000 1.003 1.724
gen_healthFair/Poor 10.001 0.249 40.225 0.000 9.514 10.488
menthlth_days 0.184 0.011 16.175 0.000 0.161 0.206
exerciseYes -1.905 0.203 -9.379 0.000 -2.303 -1.507
age_group35-49 0.837 0.270 3.098 0.002 0.308 1.367
age_group50-64 1.478 0.258 5.721 0.000 0.971 1.984
age_group65+ 1.837 0.251 7.310 0.000 1.344 2.329
income$25K-$49K -1.032 0.275 -3.756 0.000 -1.571 -0.494
income$50K-$99K -1.388 0.266 -5.224 0.000 -1.909 -0.867
income$100K+ -1.112 0.385 -2.890 0.004 -1.867 -0.358
depressionYes 0.747 0.215 3.477 0.001 0.326 1.168
educationHS/GED 0.258 0.401 0.643 0.520 -0.528 1.043
educationSome college 0.964 0.402 2.394 0.017 0.175 1.753
educationCollege graduate 1.031 0.404 2.550 0.011 0.239 1.824
smokingFormer 0.440 0.188 2.344 0.019 0.072 0.808
smokingCurrent 0.478 0.265 1.804 0.071 -0.041 0.997
sexFemale 0.233 0.163 1.426 0.154 -0.087 0.553
glance(final_lm)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df  logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>   <dbl>  <dbl>  <dbl>
## 1     0.325         0.324  7.11      226.       0    17 -27038. 54115. 54247.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

I selected the stepwise model as the final linear model because it balanced model fit and parsimony using AIC while handling categorical predictors as complete factor terms. This model was also supported by backward elimination and forward selection, since all three stepwise methods selected the same final set of predictors.

In the final model, respondents reporting Fair/Poor general health had about 10.00 more physically unhealthy days than those reporting Excellent/Very Good health, holding all other variables constant. Each additional mentally unhealthy day was associated with about 0.18 more physically unhealthy days, holding all other variables constant. Respondents who reported exercise had about 1.91 fewer physically unhealthy days than those who did not exercise, holding the other variables constant.

Income also showed a protective pattern. Compared with respondents earning less than $25K, those earning $50K-$99K had about 1.39 fewer physically unhealthy days, holding all other variables constant. Adults aged 65 and older had about 1.84 more physically unhealthy days compared with adults aged 18-34, adjusting for the other predictors.

Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(final_lm)

par(mfrow = c(1, 1))

The Residuals vs. Fitted plot shows some patterning rather than a completely random cloud, suggesting that the linearity and equal variance assumptions may not be perfectly satisfied. This is not surprising because the outcome is a count of unhealthy days bounded between 0 and 30.

The Normal Q-Q plot shows clear deviation from the straight line, especially in the tails. This suggests the residuals are not perfectly normally distributed. Again, this makes sense because many respondents report 0 physically unhealthy days, while a smaller group reports many days.

The Scale-Location plot shows changing spread across the fitted values, which suggests some non-constant variance. The Residuals vs. Leverage plot does not show obvious extreme points with very large Cook’s distance, so there does not appear to be one small set of observations dominating the model.

Overall, the LINE assumptions are only partially satisfied. The strongest concerns are non-normal residuals and non-constant variance, which are expected because physically unhealthy days is bounded and skewed. The model is still useful as an approximate descriptive model, but these limitations should be acknowledged.

Part 2: Logistic Regression

Step 1: Create the Binary Outcome

brfss_analytic <- brfss_analytic %>%
  mutate(
    frequent_distress = factor(
      if_else(physhlth_days >= 14, "Yes", "No"),
      levels = c("No", "Yes")
    )
  )

distress_prev <- brfss_analytic %>%
  count(frequent_distress) %>%
  mutate(percent = 100 * n / sum(n))

distress_prev %>%
  kable(digits = 1, caption = "Prevalence of Frequent Physical Distress") %>%
  kable_styling(full_width = FALSE)
Prevalence of Frequent Physical Distress
frequent_distress n percent
No 6948 86.8
Yes 1052 13.2

Frequent physical distress was defined as reporting 14 or more physically unhealthy days in the past 30 days. In this sample, 1,052 respondents, or 13.2%, had frequent physical distress, while 6,948 respondents, or 86.8%, did not. The outcome is imbalanced because the majority of respondents are in the “No” category, but there are still enough “Yes” cases to fit a logistic regression model.

Step 2: Simple Logistic Regression

mod_simple <- glm(
  frequent_distress ~ gen_health,
  data = brfss_analytic,
  family = binomial
)

summary(mod_simple)
## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.39831    0.09021  -37.67   <2e-16 ***
## gen_healthGood       1.14179    0.11198   10.20   <2e-16 ***
## gen_healthFair/Poor  3.28880    0.10465   31.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4754.1  on 7997  degrees of freedom
## AIC: 4760.1
## 
## Number of Fisher Scoring iterations: 6
tidy(mod_simple, conf.int = TRUE, exponentiate = TRUE) %>%
  kable(digits = 3, caption = "Simple Logistic Regression Odds Ratios") %>%
  kable_styling(full_width = FALSE)
Simple Logistic Regression Odds Ratios
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.033 0.090 -37.672 0 0.028 0.040
gen_healthGood 3.132 0.112 10.197 0 2.521 3.911
gen_healthFair/Poor 26.811 0.105 31.428 0 21.915 33.038

I chose general health as the simple logistic regression predictor because it is directly related to physical health burden and was one of the strongest predictors in the linear regression model.

In the simple logistic regression model, respondents with Good general health had 3.13 times the odds of frequent physical distress compared with those reporting Excellent/Very Good health. The 95% confidence interval was 2.52 to 3.91, which does not include 1.0, so this association was statistically significant at alpha = 0.05.

Respondents with Fair/Poor general health had 26.81 times the odds of frequent physical distress compared with those reporting Excellent/Very Good health. The 95% confidence interval was 21.92 to 33.04, which also excludes 1.0. This shows a very strong unadjusted association between general health status and frequent physical distress.

Step 3: Multiple Logistic Regression

final_terms <- attr(terms(final_lm), "term.labels")

log_formula <- as.formula(
  paste("frequent_distress ~", paste(final_terms, collapse = " + "))
)

mod_logistic <- glm(
  log_formula,
  data = brfss_analytic,
  family = binomial
)

summary(mod_logistic)
## 
## Call:
## glm(formula = log_formula, family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.869370   0.237416 -16.298  < 2e-16 ***
## gen_healthGood             0.883594   0.115593   7.644 2.11e-14 ***
## gen_healthFair/Poor        2.676571   0.113511  23.580  < 2e-16 ***
## menthlth_days              0.044803   0.004351  10.298  < 2e-16 ***
## exerciseYes               -0.577926   0.085063  -6.794 1.09e-11 ***
## age_group35-49             0.522042   0.157941   3.305 0.000949 ***
## age_group50-64             0.775775   0.146727   5.287 1.24e-07 ***
## age_group65+               0.980003   0.144115   6.800 1.05e-11 ***
## income$25K-$49K           -0.190466   0.110542  -1.723 0.084887 .  
## income$50K-$99K           -0.439184   0.111295  -3.946 7.94e-05 ***
## income$100K+              -0.451308   0.220972  -2.042 0.041115 *  
## depressionYes              0.258162   0.095637   2.699 0.006946 ** 
## educationHS/GED            0.034581   0.164741   0.210 0.833738    
## educationSome college      0.315308   0.164722   1.914 0.055597 .  
## educationCollege graduate  0.266869   0.171000   1.561 0.118610    
## smokingFormer              0.199548   0.089583   2.228 0.025913 *  
## smokingCurrent             0.205214   0.115062   1.784 0.074504 .  
## sexFemale                  0.167499   0.080816   2.073 0.038208 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4423.9  on 7982  degrees of freedom
## AIC: 4459.9
## 
## Number of Fisher Scoring iterations: 6
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) %>%
  kable(digits = 3, caption = "Multiple Logistic Regression Adjusted Odds Ratios") %>%
  kable_styling(full_width = FALSE)
Multiple Logistic Regression Adjusted Odds Ratios
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 0.021 0.237 -16.298 0.000 0.013 0.033
gen_healthGood 2.420 0.116 7.644 0.000 1.933 3.042
gen_healthFair/Poor 14.535 0.114 23.580 0.000 11.670 18.215
menthlth_days 1.046 0.004 10.298 0.000 1.037 1.055
exerciseYes 0.561 0.085 -6.794 0.000 0.475 0.663
age_group35-49 1.685 0.158 3.305 0.001 1.240 2.304
age_group50-64 2.172 0.147 5.287 0.000 1.635 2.908
age_group65+ 2.664 0.144 6.800 0.000 2.017 3.551
income$25K-$49K 0.827 0.111 -1.723 0.085 0.666 1.027
income$50K-$99K 0.645 0.111 -3.946 0.000 0.519 0.802
income$100K+ 0.637 0.221 -2.042 0.041 0.408 0.971
depressionYes 1.295 0.096 2.699 0.007 1.072 1.560
educationHS/GED 1.035 0.165 0.210 0.834 0.751 1.434
educationSome college 1.371 0.165 1.914 0.056 0.995 1.899
educationCollege graduate 1.306 0.171 1.561 0.119 0.937 1.832
smokingFormer 1.221 0.090 2.228 0.026 1.024 1.455
smokingCurrent 1.228 0.115 1.784 0.075 0.979 1.536
sexFemale 1.182 0.081 2.073 0.038 1.009 1.386

The multiple logistic regression model used the same predictors as the final linear regression model. Adjusted odds ratios above 1 indicate higher odds of frequent physical distress, while adjusted odds ratios below 1 indicate lower odds, holding all other variables constant.

In the adjusted logistic regression model, respondents with Fair/Poor general health had 14.54 times the odds of frequent physical distress compared with those reporting Excellent/Very Good health, holding all other variables constant. Respondents with Good general health had 2.42 times the odds compared with the Excellent/Very Good group.

Each additional mentally unhealthy day was associated with 1.05 times the odds of frequent physical distress, holding all other variables constant. Respondents who reported exercise had lower odds of frequent physical distress than those who did not exercise (aOR = 0.56), adjusting for the other predictors.

Age and income also showed important patterns. Adults aged 65+ had 2.66 times the odds of frequent physical distress compared with adults aged 18-34, holding other variables constant. Respondents earning $50K-$99K had lower odds of frequent physical distress than those earning less than $25K (aOR = 0.65), adjusting for the other predictors.

Step 4: Likelihood Ratio Test

# Test whether general health improves the logistic regression model as a group.

mod_reduced <- glm(
  frequent_distress ~ menthlth_days + exercise + age_group + income +
    depression + education + smoking + sex,
  data = brfss_analytic,
  family = binomial
)

anova(mod_reduced, mod_logistic, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + exercise + age_group + income + 
##     depression + education + smoking + sex
## Model 2: frequent_distress ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7984     5171.9                          
## 2      7982     4423.9  2   748.04 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

H0: General health does not improve the logistic regression model.
HA: General health improves the logistic regression model.

The likelihood ratio test compared the reduced model without general health to the full logistic model with general health included. The test was statistically significant, with a deviance difference of 748.04 on 2 degrees of freedom and p < 0.001. Therefore, I rejected the null hypothesis and concluded that general health significantly improves the logistic regression model.

Step 5: ROC Curve and AUC

roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))

plot(
  roc_obj,
  main = "ROC Curve: Frequent Physical Distress Model",
  print.auc = TRUE
)

auc(roc_obj)
## Area under the curve: 0.8555
ci.auc(roc_obj)
## 95% CI: 0.8427-0.8683 (DeLong)

The logistic regression model had an AUC of 0.856, with a 95% confidence interval from 0.843 to 0.868. This indicates excellent discrimination. In other words, the model does a strong job distinguishing respondents with frequent physical distress from respondents without frequent physical distress.

Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

hl <- hoslem.test(
  as.numeric(brfss_analytic$frequent_distress) - 1,
  fitted(mod_logistic),
  g = 10
)

hl
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  as.numeric(brfss_analytic$frequent_distress) - 1, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509

H0: The logistic regression model fits the data well.
HA: The logistic regression model does not fit the data well.

The Hosmer-Lemeshow test was not statistically significant, with X-squared = 6.87, 8 degrees of freedom, and p = 0.551. Since the p-value was greater than 0.05, I failed to reject the null hypothesis. This suggests that there is not strong evidence of poor model fit.

The Hosmer-Lemeshow test complements the ROC/AUC results because it evaluates calibration, while ROC/AUC evaluates discrimination. In this case, the AUC suggested excellent discrimination, and the Hosmer-Lemeshow test did not show evidence of poor calibration.

Part 3: Reflection

This analysis examined factors associated with physical health burden among U.S. adults using both linear and logistic regression. One of the strongest predictors in both models was general health status. In the linear model, respondents reporting Fair/Poor general health had about 10 more physically unhealthy days than those reporting Excellent/Very Good health. In the logistic model, Fair/Poor general health was also strongly associated with frequent physical distress, with an adjusted odds ratio of about 14.54. Mental health days, exercise, income, depression history, and age group were also important predictors. Exercise appeared protective in both models, while worse mental health and worse general health were associated with greater physical health burden.

The linear and logistic models answer related but different questions. The linear regression model estimates average differences in the number of physically unhealthy days, which is useful when the outcome is treated as a continuous measure. The logistic regression model estimates the odds of frequent physical distress, which is useful when the goal is to identify people with high physical health burden using the 14-day threshold. Linear regression was evaluated using R-squared, adjusted R-squared, AIC, BIC, and diagnostic plots. Logistic regression was evaluated using adjusted odds ratios, a likelihood ratio test, ROC/AUC, and the Hosmer-Lemeshow test.

A major limitation is that BRFSS is cross-sectional, so these results should be interpreted as associations rather than causal effects. Potential unmeasured confounders include chronic disease history, access to healthcare, disability status, employment status, pain severity, neighborhood environment, and social support.

For AI transparency, I used ChatGPT only for debugging and checking my code structure. The main problem I encountered was making sure my HW4 analytic dataset was created correctly after selecting and recoding all required variables. In a previous assignment, I had used drop_na() on only some variables, which changed the analytic sample, so I wanted to make sure I did not repeat that mistake. My prompt to the AI tool was: “My assignment says to remove all rows with missing values using drop_na() after recoding the variables. Should I use drop_na() on all variables in brfss_clean, and how can I check that my sample size is correct?” The AI suggested applying drop_na() after all variables were selected and recoded, then using set.seed(1220) before slice_sample(n = 8000).

I verified the suggestion by comparing it to the HW4 instructions, which specifically said to remove all rows with any missing values using drop_na() and then draw a random sample of 8,000. I also checked my output in RStudio to confirm that nrow(brfss_analytic) returned 8000, that all categorical variables displayed meaningful labels instead of numeric codes, and that the final models used the same variables from my cleaned analytic dataset. I also confirmed that SLEPTIM1 was not included because the updated assignment instructions removed it from the predictor pool.