1 Part 0: Data Preparation

1.1 Step 1: Load Packages and Import Data

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)

brfss_raw <- read_xpt("/Users/karoekor/Downloads/data files/LLCP2023.XPT ")

cat("Raw dataset dimensions:\n")

## Raw dataset dimensions:

cat("Rows:", nrow(brfss_raw), "\n")

## Rows: 433323

cat("Columns:", ncol(brfss_raw), "\n")

## Columns: 350

brfss_selected <- brfss_raw |>
  select(
    PHYSHLTH,    
    MENTHLTH,    
    `_BMI5`,     
    SEXVAR,      
    EXERANY2,    
    ADDEPEV3,    
    `_AGEG5YR`,  
    `_INCOMG1`,  
    EDUCA,       
    `_SMOKER3`,  
    GENHLTH,    
    MARITAL     
  )

cat("Selected dataset dimensions:\n")

## Selected dataset dimensions:

cat("Rows:", nrow(brfss_selected), "\n")

## Rows: 433323

cat("Columns:", ncol(brfss_selected), "\n")

## Columns: 12

Predictor Justification: I selected all 11 available predictors for the maximum model to capture individual (sex, age, education, marital status), behavioral (exercise, smoking, mental health), and health-related (BMI, depression, general health, income) factors associated with physical health burden. Both continuous predictors (MENTHLTH and _BMI5), multiple binary predictors (sex, exercise, depression), and multiple categorical predictors (age group, income, education, smoking, general health, marital status) are included, satisfying the assignment requirements. These variables align with well-established social determinants of health and behavioral risk factors documented in the epidemiological literature.

1.2 Step 2: Recode Variables

brfss_clean <- brfss_selected |>
  mutate(
    # --- Outcome ---
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    ),
    
    # --- Continuous predictors ---
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    ),
    
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    ),
    
    # --- Binary predictors ---
    sex = case_when(
      SEXVAR == 1 ~ "Male",
      SEXVAR == 2 ~ "Female",
      TRUE ~ NA_character_
    ),
    sex = factor(sex, levels = c("Male", "Female")),
    
    exercise = case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      EXERANY2 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    exercise = factor(exercise, levels = c("No", "Yes")),
    
    depression = case_when(
      ADDEPEV3 == 1 ~ "Yes",
      ADDEPEV3 == 2 ~ "No",
      ADDEPEV3 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    depression = factor(depression, levels = c("No", "Yes")),
    
    # --- Categorical predictors ---
    age_group = case_when(
      `_AGEG5YR` %in% 1:3  ~ "18-34",
      `_AGEG5YR` %in% 4:6  ~ "35-49",
      `_AGEG5YR` %in% 7:9  ~ "50-64",
      `_AGEG5YR` %in% 10:13 ~ "65+",
      `_AGEG5YR` == 14     ~ NA_character_,
      TRUE ~ NA_character_
    ),
    age_group = factor(age_group, levels = c("18-34", "35-49", "50-64", "65+")),
    
    income = case_when(
      `_INCOMG1` %in% 1:2 ~ "Less than $25K",
      `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
      `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
      `_INCOMG1` == 7     ~ "$100K+",
      `_INCOMG1` == 9     ~ NA_character_,
      TRUE ~ NA_character_
    ),
    income = factor(income, levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),
    
    education = case_when(
      EDUCA %in% 1:3 ~ "Less than HS",
      EDUCA == 4     ~ "HS/GED",
      EDUCA == 5     ~ "Some college",
      EDUCA == 6     ~ "College graduate",
      EDUCA == 9     ~ NA_character_,
      TRUE ~ NA_character_
    ),
    education = factor(education, 
                       levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),
    
    smoking = case_when(
      `_SMOKER3` %in% 1:2 ~ "Current",
      `_SMOKER3` == 3     ~ "Former",
      `_SMOKER3` == 4     ~ "Never",
      `_SMOKER3` == 9     ~ NA_character_,
      TRUE ~ NA_character_
    ),
    smoking = factor(smoking, levels = c("Never", "Current", "Former")),
    
    gen_health = case_when(
      GENHLTH %in% 1:2 ~ "Excellent/Very Good",
      GENHLTH == 3     ~ "Good",
      GENHLTH %in% 4:5 ~ "Fair/Poor",
      GENHLTH %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),
    gen_health = factor(gen_health, 
                        levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
    
    marital = case_when(
      MARITAL %in% c(1, 6) ~ "Married/Partnered",
      MARITAL %in% 2:4     ~ "Previously married",
      MARITAL == 5         ~ "Never married",
      MARITAL == 9         ~ NA_character_,
      TRUE ~ NA_character_
    ),
    marital = factor(marital, 
                     levels = c("Married/Partnered", "Previously married", "Never married"))
  ) |>
  select(physhlth_days, menthlth_days, bmi, sex, exercise, depression,
         age_group, income, education, smoking, gen_health, marital)

1.3 Step 3: Create Analytic Dataset

miss_summary <- brfss_clean |>
  summarise(
    physhlth_n_miss   = sum(is.na(physhlth_days)),
    physhlth_pct_miss = mean(is.na(physhlth_days)) * 100,
    menthlth_n_miss   = sum(is.na(menthlth_days)),
    menthlth_pct_miss = mean(is.na(menthlth_days)) * 100,
    bmi_n_miss        = sum(is.na(bmi)),
    bmi_pct_miss      = mean(is.na(bmi)) * 100,
    income_n_miss     = sum(is.na(income)),
    income_pct_miss   = mean(is.na(income)) * 100
  )

miss_summary |>
  pivot_longer(everything(),
               names_to = c("variable", ".value"),
               names_pattern = "(.+)_(n_miss|pct_miss)") |>
  rename(`Missing N` = n_miss, `Missing %` = pct_miss) |>
  mutate(`Missing %` = round(`Missing %`, 2)) |>
  kable(caption = "Missingness Before Removal") |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Missingness Before Removal
variable	Missing N	Missing %
physhlth	10785	2.49
menthlth	8108	1.87
bmi	40535	9.35
income	86623	19.99

set.seed(1220)
brfss_analytic <- brfss_clean |>
  drop_na() |>
  slice_sample(n = 8000)

cat("Final analytic sample size:", nrow(brfss_analytic), "\n")

## Final analytic sample size: 8000

# Descriptive summary table 
brfss_analytic |>
  tbl_summary(
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits    = all_continuous() ~ 2,
    label     = list(
      physhlth_days ~ "Physically unhealthy days (past 30)",
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      bmi           ~ "Body Mass Index",
      sex           ~ "Sex assigned at birth",
      exercise      ~ "Any exercise in past 30 days",
      depression    ~ "Ever told depressive disorder",
      age_group     ~ "Age group",
      income        ~ "Annual household income",
      education     ~ "Education level",
      smoking       ~ "Smoking status",
      gen_health    ~ "General health status",
      marital       ~ "Marital status"
    )
  ) |>
  bold_labels() |>
  modify_caption("**Table 1. Descriptive Summary of Analytic Sample (n = 8,000)**")

**Table 1. Descriptive Summary of Analytic Sample (n = 8,000)**
Characteristic	N = 8,000¹
Physically unhealthy days (past 30)	4.34 (8.65)
Mentally unhealthy days (past 30)	4.34 (8.18)
Body Mass Index	28.67 (6.50)
Sex assigned at birth
Male	3,971 (50%)
Female	4,029 (50%)
Any exercise in past 30 days	6,157 (77%)
Ever told depressive disorder	1,776 (22%)
Age group
18-34	1,294 (16%)
35-49	1,657 (21%)
50-64	2,109 (26%)
65+	2,940 (37%)
Annual household income
Less than $25K	1,090 (14%)
$25K-$49K	1,931 (24%)
$50K-$99K	4,297 (54%)
$100K+	682 (8.5%)
Education level
Less than HS	391 (4.9%)
HS/GED	1,887 (24%)
Some college	2,115 (26%)
College graduate	3,607 (45%)
Smoking status
Never	4,808 (60%)
Current	962 (12%)
Former	2,230 (28%)
General health status
Excellent/Very Good	3,926 (49%)
Good	2,648 (33%)
Fair/Poor	1,426 (18%)
Marital status
Married/Partnered	4,582 (57%)
Previously married	2,050 (26%)
Never married	1,368 (17%)
¹ Mean (SD); n (%)

2 Part 1: Model Selection for Linear Regression

2.1 Step 1: Fit the Maximum Model

mod_max <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic
)

summary(mod_max)

## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health + 
##     marital, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.20132    0.62746   3.508 0.000453 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## income$25K-$49K           -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50K-$99K           -1.52385    0.27607  -5.520 3.50e-08 ***
## income$100K+              -1.31580    0.39821  -3.304 0.000956 ***
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## smokingCurrent             0.46414    0.26617   1.744 0.081241 .  
## smokingFormer              0.43777    0.18776   2.332 0.019749 *  
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16

r2      <- summary(mod_max)$r.squared
adj_r2  <- summary(mod_max)$adj.r.squared
aic_val <- AIC(mod_max)
bic_val <- BIC(mod_max)

tibble(
  Metric = c("R-squared", "Adjusted R-squared", "AIC", "BIC"),
  Value  = round(c(r2, adj_r2, aic_val, bic_val), 3)
) |>
  kable(caption = "Maximum Model Fit Criteria") |>
  kable_styling(bootstrap_options = c("striped", "hover"))

Maximum Model Fit Criteria
Metric	Value
R-squared	0.326
Adjusted R-squared	0.324
AIC	54115.053
BIC	54268.772

Interpretation: The maximum model explains 32.6% of the variance in physically unhealthy days (R² = 0.326). The Adjusted R² (0.324) penalizes for the number of predictors and provides a more conservative estimate of explanatory power. AIC and BIC will be used to compare this model against candidate subsets.

2.2 Step 2: Best Subsets Regression

# Run best subsets regression
mod_best <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data  = brfss_analytic,
  nvmax = 25,   # upper bound on number of dummy variables/params
  really.big = TRUE
)

best_summary <- summary(mod_best)

n_models <- length(best_summary$adjr2)

criteria_df <- tibble(
  `Model Size` = 1:n_models,
  `Adjusted R-squared` = best_summary$adjr2,
  BIC = best_summary$bic
)

# Plot Adjusted R-squared
p1 <- ggplot(criteria_df, aes(x = `Model Size`, y = `Adjusted R-squared`)) +
  geom_line(color = "skyblue", linewidth = 1) +
  geom_point(color = "skyblue", size = 2) +
  geom_point(
    data = criteria_df |> filter(`Adjusted R-squared` == max(`Adjusted R-squared`)),
    aes(x = `Model Size`, y = `Adjusted R-squared`),
    color = "hotpink", size = 4
  ) +
  labs(title = "Best Subsets: Adjusted R-squared by Model Size",
       x = "Number of Predictors",
       y = "Adjusted R-squared") +
  theme_classic()

# Plot BIC
p2 <- ggplot(criteria_df, aes(x = `Model Size`, y = BIC)) +
  geom_line(color = "darkorange", linewidth = 1) +
  geom_point(color = "darkorange", size = 2) +
  geom_point(
    data = criteria_df |> filter(BIC == min(BIC)),
    aes(x = `Model Size`, y = BIC),
    color = "red", size = 4
  ) +
  labs(title = "Best Subsets: BIC by Model Size",
       x = "Number of Predictors",
       y = "BIC") +
  theme_classic()

gridExtra::grid.arrange(p1, p2, ncol = 2)

best_adjr2_size <- which.max(best_summary$adjr2)
best_bic_size   <- which.min(best_summary$bic)

cat("Model size maximizing Adjusted R-squared:", best_adjr2_size, "\n")

## Model size maximizing Adjusted R-squared: 19

cat("Adjusted R-squared:", round(max(best_summary$adjr2), 4), "\n\n")

## Adjusted R-squared: 0.3242

cat("Model size minimizing BIC:", best_bic_size, "\n")

## Model size minimizing BIC: 9

cat("BIC:", round(min(best_summary$bic), 2), "\n")

## BIC: -3021.91

Note on best subsets limitation: regsubsets() treats each dummy variable from a factor as a separate predictor. This means that individual levels of categorical variables (e.g., income categories) may be selectively included or excluded, which can be statistically problematic in practice, we should include or exclude all levels of a factor together. The stepwise method in Step 3 handle factors as complete units and are therefore preferable for factor variables.

Interpretation: The Adjusted R-squared criterion favors a model with 19 parameters, while BIC — which penalizes model complexity more heavily — favors a smaller model with 9 parameters. BIC consistently selects more parsimonious models than Adjusted R-squared; when the two disagree, BIC’s stronger penalty for complexity leads it to prefer simpler models with fewer predictors.

2.3 Step 3: Stepwise Selection Methods

mod_null <- lm(physhlth_days ~ 1, data = brfss_analytic)

# Backward elimination: start from maximum model
mod_backward <- step(mod_max, direction = "backward", trace = 0)
cat("=== Backward Elimination - Final Variables ===\n")

## === Backward Elimination - Final Variables ===

print(formula(mod_backward))

## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health

mod_forward <- step(
  mod_null,
  scope     = formula(mod_max),
  direction = "forward",
  trace     = 0
)
cat("=== Forward Selection - Final Variables ===\n")

## === Forward Selection - Final Variables ===

print(formula(mod_forward))

## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex

mod_stepwise <- step(
  mod_null,
  scope     = formula(mod_max),
  direction = "both",
  trace     = 0
)
cat("=== Stepwise Selection - Final Variables ===\n")

## === Stepwise Selection - Final Variables ===

print(formula(mod_stepwise))

## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex

vars_backward <- attr(terms(mod_backward), "term.labels")
vars_forward  <- attr(terms(mod_forward),  "term.labels")
vars_stepwise <- attr(terms(mod_stepwise), "term.labels")

cat("Variables in BACKWARD only:", 
    paste(setdiff(vars_backward, union(vars_forward, vars_stepwise)), collapse = ", "), "\n")

## Variables in BACKWARD only:

cat("Variables in FORWARD only:", 
    paste(setdiff(vars_forward, union(vars_backward, vars_stepwise)), collapse = ", "), "\n")

## Variables in FORWARD only:

cat("Variables in STEPWISE only:", 
    paste(setdiff(vars_stepwise, union(vars_backward, vars_forward)), collapse = ", "), "\n\n")

## Variables in STEPWISE only:

cat("Variables retained by ALL THREE methods:\n",
    paste(Reduce(intersect, list(vars_backward, vars_forward, vars_stepwise)), collapse = ", "), "\n\n")

## Variables retained by ALL THREE methods:
##  menthlth_days, sex, exercise, depression, age_group, income, education, smoking, gen_health

cat("Variables excluded by ALL THREE methods:\n",
    paste(setdiff(attr(terms(mod_max), "term.labels"),
                  union(vars_backward, union(vars_forward, vars_stepwise))), 
          collapse = ", "), "\n")

## Variables excluded by ALL THREE methods:
##  bmi, marital

Comparison of stepwise methods: All three selection methods (backward elimination, forward selection, and stepwise) converged on identical or nearly identical final models, which is typical when AIC is used as the selection criterion and the sample size is large. Variables retained by all three methods represent the strongest and most robust predictors of physically unhealthy days. Any variables excluded by all three methods contributed little incremental explanatory power beyond the retained predictors, suggesting they are not strongly associated with the outcome after adjusting for other variables in the model. The agreement among methods increases confidence in the selected predictors.

2.4 Step 4: Final Model Selection and Interpretation

The final model is guided by the stepwise selection results (confirmed by backward elimination and forward selection), which selected predictors using AIC as the criterion. AIC balances model fit against complexity and is preferred over Adjusted R-squared for final model selection. Because all three stepwise methods agreed on the same predictors, we are confident in this choice.

mod_final <- mod_stepwise

tidy(mod_final, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value",
                  "95% CI Lower", "95% CI Upper"),
    caption   = "Final Linear Regression Model: Physically Unhealthy Days"
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Final Linear Regression Model: Physically Unhealthy Days
Term	Estimate	Std. Error	t-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	1.3853	0.4874	2.8425	0.0045	0.4300	2.3407
gen_healthGood	1.3636	0.1839	7.4161	0.0000	1.0032	1.7241
gen_healthFair/Poor	10.0009	0.2486	40.2245	0.0000	9.5136	10.4883
menthlth_days	0.1836	0.0114	16.1748	0.0000	0.1614	0.2059
exerciseYes	-1.9048	0.2031	-9.3793	0.0000	-2.3029	-1.5067
age_group35-49	0.8374	0.2703	3.0980	0.0020	0.3075	1.3672
age_group50-64	1.4778	0.2583	5.7206	0.0000	0.9714	1.9841
age_group65+	1.8366	0.2513	7.3099	0.0000	1.3441	2.3292
income$25K-$49K	-1.0324	0.2749	-3.7557	0.0002	-1.5713	-0.4936
income$50K-$99K	-1.3884	0.2658	-5.2239	0.0000	-1.9094	-0.8674
income$100K+	-1.1122	0.3849	-2.8897	0.0039	-1.8666	-0.3577
depressionYes	0.7471	0.2149	3.4769	0.0005	0.3259	1.1683
educationHS/GED	0.2576	0.4006	0.6431	0.5202	-0.5276	1.0428
educationSome college	0.9636	0.4025	2.3943	0.0167	0.1747	1.7525
educationCollege graduate	1.0310	0.4043	2.5502	0.0108	0.2385	1.8236
smokingCurrent	0.4776	0.2647	1.8038	0.0713	-0.0414	0.9965
smokingFormer	0.4399	0.1876	2.3442	0.0191	0.0720	0.8077
sexFemale	0.2329	0.1633	1.4262	0.1539	-0.0872	0.5530

Interpretations:

menthlth_days (continuous): For every additional mentally unhealthy day in the past 30, the expected number of physically unhealthy days increases by 0.184 days, holding all other variables constant. This strong positive association reflects the well-documented mind-body relationship.
gen_healthFair/Poor (categorical): Compared to individuals reporting Excellent/Very Good health (reference category), those reporting Fair/Poor general health have, on average, 10 more physically unhealthy days per month, holding all other variables constant. This is the largest categorical effect in the model.
bmi (continuous): For every one-unit increase in BMI (kg/m²), the expected number of physically unhealthy days increases by NA days, holding all other variables constant. Higher BMI is associated with greater physical health burden.

2.5 Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(mod_final, 
     main = "",
     sub  = "")

par(mfrow = c(1, 1))

Diagnostic Plot Interpretations:

Residuals vs. Fitted: The residuals show a non-random pattern with a slight curve and fan shape, suggesting mild non-linearity and heteroscedasticity. The variance of residuals appears to increase at higher fitted values, indicating non-constant variance.
Normal Q-Q: The residuals deviate substantially from the diagonal reference line in both tails, indicating that the residuals are not normally distributed. There is notable right-skew, which is expected given that the outcome (days of poor health) is bounded between 0 and 30 and has many zeros.
Scale-Location: The smoothed line is not flat — it rises with fitted values — confirming heteroscedasticity (non-constant variance). The spread of standardized residuals increases at higher fitted values.
Residuals vs. Leverage: A few observations have relatively high leverage, but none appear to have extreme Cook’s distances that would suggest highly influential points that dramatically alter the regression coefficients.

Conclusion: The LINE assumptions are not fully satisfied in this model. The primary violations are non-normality of residuals (right skew due to the bounded, zero-inflated outcome) and heteroscedasticity. Linearity is approximately met for most predictors. These violations are common with health survey data where the outcome is a count variable bounded at 0 and 30; a Poisson or zero-inflated regression model might be more appropriate, but the linear model remains useful for interpretation given the large sample size (which mitigates concerns about non-normality via the central limit theorem).

3 Part 2: Logistic Regression

3.1 Step 1: Create the Binary Outcome

brfss_analytic <- brfss_analytic |>
  mutate(
    frequent_distress = case_when(
      physhlth_days >= 14 ~ "Yes",
      physhlth_days <  14 ~ "No",
      TRUE ~ NA_character_
    ),
    frequent_distress = factor(frequent_distress, levels = c("No", "Yes"))
  )

# Prevalence table
freq_table <- brfss_analytic |>
  count(frequent_distress) |>
  mutate(Percentage = round(n / sum(n) * 100, 1))

freq_table |>
  kable(
    col.names = c("Frequent Physical Distress", "Count", "Percentage (%)"),
    caption   = "Prevalence of Frequent Physical Distress (≥14 days in past 30)"
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"))

Prevalence of Frequent Physical Distress (≥14 days in past 30)
Frequent Physical Distress	Count	Percentage (%)
No	6948	86.9
Yes	1052	13.2

Comment on balance: The outcome is imbalanced, with 13.2% of respondents classified as having frequent physical distress. This degree of imbalance is common for health burden outcomes in population surveys; while it does not invalidate logistic regression, it means that simple accuracy metrics can be misleading, and ROC/AUC is a more appropriate measure of model discrimination.

3.2 Step 2: Simple (Unadjusted) Logistic Regression

I chose gen_health as the predictor for simple logistic regression because general health status had the largest coefficient in the final linear model and conceptually captures a person’s overall physical health burden. Individuals reporting Fair/Poor general health are expected to have substantially higher odds of frequent physical distress.

mod_simple <- glm(
  frequent_distress ~ gen_health,
  data   = brfss_analytic,
  family = binomial
)

summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.39831    0.09021  -37.67   <2e-16 ***
## gen_healthGood       1.14179    0.11198   10.20   <2e-16 ***
## gen_healthFair/Poor  3.28880    0.10465   31.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4754.1  on 7997  degrees of freedom
## AIC: 4760.1
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios and 95% CI
or_simple  <- exp(coef(mod_simple))
ci_simple  <- exp(confint(mod_simple))

or_table <- tibble(
  Term     = names(or_simple),
  OR       = round(or_simple, 3),
  `CI Lower` = round(ci_simple[, 1], 3),
  `CI Upper` = round(ci_simple[, 2], 3)
)

or_table |>
  kable(caption = "Simple Logistic Regression: Odds Ratios for gen_health") |>
  kable_styling(bootstrap_options = c("striped", "hover"))

Simple Logistic Regression: Odds Ratios for gen_health
Term	OR	CI Lower	CI Upper
(Intercept)	0.033	0.028	0.040
gen_healthGood	3.132	2.521	3.911
gen_healthFair/Poor	26.811	21.915	33.038

Interpretation:

Compared to individuals reporting Excellent/Very Good general health (reference), those reporting Good general health have 3.13 times the odds of frequent physical distress (95% CI: 2.52 – 3.91), making all other variables constant.
Compared to individuals reporting Excellent/Very Good general health, those reporting Fair/Poor general health have 26.81 times the odds of frequent physical distress (95% CI: 21.91 – 33.04).

Both associations are statistically significant at α = 0.05, as the 95% confidence intervals exclude 1.0 and the Wald test p-values are well below 0.05.

3.3 Step 3: Multiple Logistic Regression

mod_logistic <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + bmi + sex + 
##     exercise + depression + age_group + income + education + 
##     smoking + gen_health + marital, family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.639413   0.291867 -12.469  < 2e-16 ***
## menthlth_days              0.044787   0.004358  10.277  < 2e-16 ***
## bmi                       -0.004822   0.005556  -0.868  0.38546    
## sexFemale                  0.176389   0.081382   2.167  0.03020 *  
## exerciseYes               -0.586264   0.085768  -6.835 8.17e-12 ***
## depressionYes              0.267888   0.095954   2.792  0.00524 ** 
## age_group35-49             0.512672   0.162401   3.157  0.00160 ** 
## age_group50-64             0.767269   0.154816   4.956 7.20e-07 ***
## age_group65+               0.964882   0.155220   6.216 5.09e-10 ***
## income$25K-$49K           -0.215375   0.111712  -1.928  0.05386 .  
## income$50K-$99K           -0.501752   0.118036  -4.251 2.13e-05 ***
## income$100K+              -0.541456   0.227454  -2.381  0.01729 *  
## educationHS/GED            0.049334   0.165069   0.299  0.76504    
## educationSome college      0.331979   0.165058   2.011  0.04430 *  
## educationCollege graduate  0.283640   0.171335   1.655  0.09783 .  
## smokingCurrent             0.208048   0.116374   1.788  0.07381 .  
## smokingFormer              0.200388   0.089714   2.234  0.02551 *  
## gen_healthGood             0.895044   0.116270   7.698 1.38e-14 ***
## gen_healthFair/Poor        2.696530   0.115309  23.385  < 2e-16 ***
## maritalPreviously married -0.138611   0.096205  -1.441  0.14965    
## maritalNever married      -0.143542   0.127789  -1.123  0.26132    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4420.7  on 7979  degrees of freedom
## AIC: 4462.7
## 
## Number of Fisher Scoring iterations: 6

# Adjusted odds ratios table
tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 3))) |>
  kable(
    col.names = c("Term", "Adj. OR", "Std. Error", "z-statistic",
                  "p-value", "CI Lower", "CI Upper"),
    caption   = "Multiple Logistic Regression: Adjusted Odds Ratios for Frequent Physical Distress"
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Multiple Logistic Regression: Adjusted Odds Ratios for Frequent Physical Distress
Term	Adj. OR	Std. Error	z-statistic	p-value	CI Lower	CI Upper
(Intercept)	0.026	0.292	-12.469	0.000	0.015	0.046
menthlth_days	1.046	0.004	10.277	0.000	1.037	1.055
bmi	0.995	0.006	-0.868	0.385	0.984	1.006
sexFemale	1.193	0.081	2.167	0.030	1.017	1.400
exerciseYes	0.556	0.086	-6.835	0.000	0.470	0.658
depressionYes	1.307	0.096	2.792	0.005	1.082	1.576
age_group35-49	1.670	0.162	3.157	0.002	1.217	2.302
age_group50-64	2.154	0.155	4.956	0.000	1.596	2.929
age_group65+	2.624	0.155	6.216	0.000	1.944	3.573
income$25K-$49K	0.806	0.112	-1.928	0.054	0.648	1.004
income$50K-$99K	0.605	0.118	-4.251	0.000	0.481	0.763
income$100K+	0.582	0.227	-2.381	0.017	0.368	0.899
educationHS/GED	1.051	0.165	0.299	0.765	0.762	1.456
educationSome college	1.394	0.165	2.011	0.044	1.011	1.932
educationCollege graduate	1.328	0.171	1.655	0.098	0.952	1.864
smokingCurrent	1.231	0.116	1.788	0.074	0.979	1.545
smokingFormer	1.222	0.090	2.234	0.026	1.024	1.456
gen_healthGood	2.447	0.116	7.698	0.000	1.952	3.081
gen_healthFair/Poor	14.828	0.115	23.385	0.000	11.863	18.648
maritalPreviously married	0.871	0.096	-1.441	0.150	0.720	1.050
maritalNever married	0.866	0.128	-1.123	0.261	0.673	1.111

Adjusted Odds Ratio Interpretations:

logistic_ors <- tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE)

menthlth_days (continuous): For every additional mentally unhealthy day in the past 30, the odds of frequent physical distress are multiplied by 1.046 (95% CI: 1.037 – 1.055), holding all other variables constant.
gen_healthFair/Poor (categorical): Compared to individuals with Excellent/Very Good general health, those with Fair/Poor general health have 14.83 times the adjusted odds of frequent physical distress (95% CI: 11.86 – 18.65), holding all other variables constant.
depressionYes (binary): Individuals who have ever been told they have a depressive disorder have 1.31 times the adjusted odds of frequent physical distress compared to those without depression (95% CI: 1.08 – 1.58), holding all other variables constant.

3.4 Step 4: Likelihood Ratio Test

I will test whether the income predictors as a group significantly improve the logistic model. This tests whether income level (above and beyond other predictors) is associated with frequent physical distress.

mod_reduced <- glm(
  frequent_distress ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + education + smoking + gen_health + marital,
  data   = brfss_analytic,
  family = binomial
)

# Likelihood ratio test
lrt_result <- anova(mod_reduced, mod_logistic, test = "Chisq")
print(lrt_result)

## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + education + smoking + gen_health + marital
## Model 2: frequent_distress ~ menthlth_days + bmi + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health + marital
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7982     4439.7                          
## 2      7979     4420.7  3   19.082 0.0002629 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hypotheses: - H₀: Income categories do not improve model fit; all income coefficients equal zero (β_income25-49K = β_income50-99K = β_income100K+ = 0) - H₁: At least one income coefficient is non-zero; income significantly improves model fit

Results: The likelihood ratio test statistic is 19.082 with 3 degrees of freedom, p-value = 2.629e-04.

Conclusion: At α = 0.05, we reject H₀. The income predictors as a group significantly improve the logistic regression model, indicating that income is independently associated with frequent physical distress even after adjusting for all other predictors.

3.5 Step 5: ROC Curve and AUC

roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))

plot(
  roc_obj,
  main      = "ROC Curve: Frequent Physical Distress Model",
  col       = "orange",
  lwd       = 2,
  print.auc = TRUE,
  auc.polygon = TRUE,
  auc.polygon.col = "purple3"
)

auc_val <- auc(roc_obj)
auc_ci  <- ci.auc(roc_obj)

cat("AUC:", round(auc_val, 4), "\n")

## AUC: 0.8555

cat("95% CI:", round(auc_ci[1], 4), "-", round(auc_ci[3], 4), "\n")

## 95% CI: 0.8426 - 0.8683

AUC Interpretation: The AUC of 0.855 (95% CI: 0.843 – 0.868) indicates acceptable to excellent discrimination (0.7–0.9 range). The model correctly ranks a randomly selected person with frequent physical distress above a randomly selected person without frequent physical distress approximately 85.5% of the time which is substantially better than chance (AUC = 0.5).

3.6 Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

hl_test <- hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
print(hl_test)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 5.5773, df = 8, p-value = 0.6945

Hypotheses: - H₀: The model fits the data well (no significant difference between observed and expected frequencies across deciles of predicted probability) - H₁: The model does not fit the data well (significant difference between observed and expected frequencies)

Results: The Hosmer-Lemeshow test statistic is 5.577 with 8 degrees of freedom, p-value = 0.6945.

Interpretation: At α = 0.05, we fail to reject H₀, suggesting the model fits the data adequately. The model calibration is satisfactory across deciles of predicted probability.

Complementarity with AUC: The ROC/AUC assesses discrimination — whether the model correctly rank-orders individuals by their probability of frequent distress — while the Hosmer-Lemeshow test assesses calibration — whether predicted probabilities match observed proportions. A model can discriminate well (high AUC) but still be poorly calibrated, or vice versa. Together, these metrics provide a comprehensive picture of model performance.

4 Part 3: Reflection

4.1 A. Statistical Insights

Numerous individual, behavioral, and health-related factors have a strong association with physical health burden among U.S. adults, according to an examination of BRFSS 2023 data. The best predictors in both the linear and logistic regression models were general health status (gen_health) and mental health (menthlth_days). Compared to individuals with Excellent/Very Good health, those who reported Fair/Poor general health had significantly more physically unhealthy days and significantly higher odds of frequent physical distress, and each extra day of mental illness was linked to more physically unhealthy days and higher odds of frequent distress. The bidirectional relationship between mental and physical health is shown in the fact that Depression was a constant and powerful predictor in both models.

Although effect sizes varied because of the differing outcome scales, the majority of predictors were significant in both models. Socioeconomic factors including income and education were important in the logistic model, supporting the paradigm of social determinants of health. There was a protective correlation between exercise and fewer unwell days and decreased likelihood of regular discomfort.

The inability to prove causality or temporal precedence is a major drawback of this cross-sectional approach. Only relationshipscan be found. In addition, the analytic sample may not accurately reflect the entire national distribution because it is a random subset of 8,000 respondents. This model lacks at least two significant potential confounders: access to healthcare (e.g., insurance coverage, having a primary care provider), which affects both health outcomes and the likelihood of having conditions diagnosed (e.g., depression), and chronic disease burden (e.g., diabetes, arthritis, cardiovascular disease), which is a major driver of physical health days and likely correlated with both BMI and general health.

4.2 B. Linear versus Logistic Regression

The linear regression model quantifies how predictor variables relate to the count of physically unhealthy days providing a continuous estimate of physical health burden. It allows us to say, for example, “each additional BMI unit is associated with X more unhealthy days per month,” which is clinically interpretable in terms of disease burden magnitude. The logistic regression model, by contrast, shifts the research question to the probability of exceeding a clinically meaningful threshold (14 days, the CDC’s definition of frequent physical distress), yielding odds ratios that communicate relative risk in a binary, actionable framework.

Each approach provides distinct information: linear regression captures the full distribution of the outcome and is sensitive to differences across the entire range of unhealthy days, while logistic regression focuses on identifying who crosses into the high-burden category, which may be more relevant for targeting public health interventions. Linear regression is preferred when the outcome is approximately continuous, symmetric, and the research question concerns average differences; logistic regression is preferred when the outcome is binary, the sample has adequate events, and classification or risk stratification is the goal.

The model comparison criteria differ fundamentally: linear regression relies on R²/Adjusted R² (proportion of variance explained) and AIC/BIC, while logistic regression uses AUC (discrimination) and Hosmer-Lemeshow (calibration). R² has no direct equivalent in logistic regression, and AUC is not meaningful for linear models. These different criteria reflect the different inferential goals of each modeling framework.

4.3 C. AI Transparency

For this assignment, I primarily relied on course lecture slides and class examples to guide my analysis and interpretation. I used these materials to structure my model selection process and to understand diagnostic plots. In addition, I consulted an external online resource to clarify the interpretation of the adjusted R-squared criterion. Specifically, I referred to DataCamp (https://www.datacamp.com/tutorial/adjusted-r-squared), which helped reinforce how adjusted R-squared accounts for the number of predictors in a model and is useful for comparing models with different numbers of variables. grammatical correctiong tools were used minimally to help refine wording and improve clarity of written interpretations, but all statistical reasoning, model decisions, and conclusions were based on course materials and my own understanding. I verified outputs by cross-checking them against lecture content and ensuring consistency with statistical concepts discussed in class.

End of Assignment

EPI553_HW04_Ekor_Karo.Rmd

Model Selection and Introduction to Logistic Regression

Ekor, Karo

2026-04-24