#Research Question What individual, behavioral, and health factors are associated with physical health burden among U.S. adults? How do findings compare when the outcome is modeled as the number of physically unhealthy days (continuous) versus as an indicator of frequent physical distress (binary: 14 or more days in the past 30)? #Dataset 2023 Behavioral Risk Factor Surveillance System (BRFSS)

Setup and Data

library(tidyverse)
library(haven)
library(broom)
library(forecast)
library(leaps)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(ResourceSelection)  # for Hosmer-Lemeshow
library(pROC)               # for ROC/AUC
library(performance)        # for model performance metrics
library(sjPlot)
library(modelsummary)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

Part 0: Data Preparation

# Step 1: Import data
brfss_raw <- read_xpt("C:/Users/userp/OneDrive/Рабочий стол/HSTA553/R files/LLCP2023.XPT")

# Report dimensions
dim(brfss_raw)

## [1] 433323    350

nrow(brfss_raw)

## [1] 433323

ncol(brfss_raw)

## [1] 350

# Select only needed variables
brfss_working <- brfss_raw %>%
  select(
    PHYSHLTH,
    MENTHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    ADDEPEV3,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`,
    GENHLTH,
    MARITAL
  )

# Step 2: Recode variables
brfss_clean <- brfss_working %>%
  mutate(
    # outcome
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      PHYSHLTH >= 1 & PHYSHLTH <= 30 ~ as.numeric(PHYSHLTH),
      TRUE ~ NA_real_
    ),

    # continuous predictors
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      MENTHLTH >= 1 & MENTHLTH <= 30 ~ as.numeric(MENTHLTH),
      TRUE ~ NA_real_
    ),

    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ as.numeric(`_BMI5`) / 100
    ),

    # binary predictors
    sex = case_when(
      SEXVAR == 1 ~ "Male",
      SEXVAR == 2 ~ "Female",
      TRUE ~ NA_character_
    ),

    exercise = case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      EXERANY2 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),

    depression = case_when(
      ADDEPEV3 == 1 ~ "Yes",
      ADDEPEV3 == 2 ~ "No",
      ADDEPEV3 %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),

    # categorical predictors
    age_group = case_when(
      `_AGEG5YR` %in% 1:3 ~ "18-34",
      `_AGEG5YR` %in% 4:6 ~ "35-49",
      `_AGEG5YR` %in% 7:9 ~ "50-64",
      `_AGEG5YR` %in% 10:13 ~ "65+",
      `_AGEG5YR` == 14 ~ NA_character_,
      TRUE ~ NA_character_
    ),

    income = case_when(
      `_INCOMG1` %in% 1:2 ~ "Less than $25K",
      `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
      `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
      `_INCOMG1` == 7 ~ "$100K+",
      `_INCOMG1` == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ),

    education = case_when(
      EDUCA %in% 1:3 ~ "Less than HS",
      EDUCA == 4 ~ "HS/GED",
      EDUCA == 5 ~ "Some college",
      EDUCA == 6 ~ "College graduate",
      EDUCA == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ),

    smoking = case_when(
      `_SMOKER3` %in% 1:2 ~ "Current",
      `_SMOKER3` == 3 ~ "Former",
      `_SMOKER3` == 4 ~ "Never",
      `_SMOKER3` == 9 ~ NA_character_,
      TRUE ~ NA_character_
    ),

    gen_health = case_when(
      GENHLTH %in% 1:2 ~ "Excellent/Very Good",
      GENHLTH == 3 ~ "Good",
      GENHLTH %in% 4:5 ~ "Fair/Poor",
      GENHLTH %in% c(7, 9) ~ NA_character_,
      TRUE ~ NA_character_
    ),

    marital = case_when(
      MARITAL %in% c(1, 6) ~ "Married/Partnered",
      MARITAL %in% 2:4 ~ "Previously married",
      MARITAL == 5 ~ "Never married",
      MARITAL == 9 ~ NA_character_,
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    sex = factor(sex, levels = c("Male", "Female")),
    exercise = factor(exercise, levels = c("No", "Yes")),
    depression = factor(depression, levels = c("No", "Yes")),
    age_group = factor(age_group, levels = c("18-34", "35-49", "50-64", "65+")),
    income = factor(income, levels = c("Less than $25K", "$25K-$49K", "$50K-$99K", "$100K+")),
    education = factor(education, levels = c("Less than HS", "HS/GED", "Some college", "College graduate")),
    smoking = factor(smoking, levels = c("Never", "Former", "Current")),
    gen_health = factor(gen_health, levels = c("Excellent/Very Good", "Good", "Fair/Poor")),
    marital = factor(marital, levels = c("Married/Partnered", "Previously married", "Never married"))
  ) %>%
  select(
    physhlth_days,
    menthlth_days,
    bmi,
    sex,
    exercise,
    depression,
    age_group,
    income,
    education,
    smoking,
    gen_health,
    marital
  )

# Step 3: Missingness before removing missing data
missing_summary <- tibble(
  variable = c("physhlth_days", "menthlth_days", "bmi"),
  missing_n = c(
    sum(is.na(brfss_clean$physhlth_days)),
    sum(is.na(brfss_clean$menthlth_days)),
    sum(is.na(brfss_clean$bmi))
  ),
  missing_pct = c(
    mean(is.na(brfss_clean$physhlth_days)) * 100,
    mean(is.na(brfss_clean$menthlth_days)) * 100,
    mean(is.na(brfss_clean$bmi)) * 100
  )
)

missing_summary

## # A tibble: 3 × 3
##   variable      missing_n missing_pct
##   <chr>             <int>       <dbl>
## 1 physhlth_days     10785        2.49
## 2 menthlth_days      8108        1.87
## 3 bmi               40535        9.35

# Create analytic dataset
set.seed(1220)

brfss_analytic <- brfss_clean |>
  drop_na() |>
  slice_sample(n = 8000)

# Final analytic sample size
nrow(brfss_analytic)

## [1] 8000

# Descriptive summary table
tbl_summary(
  brfss_analytic,
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{n} ({p}%)"
  )
)

Characteristic	N = 8,000¹
physhlth_days	4 (9)
menthlth_days	4 (8)
bmi	29 (7)
sex
Male	3,971 (50%)
Female	4,029 (50%)
exercise	6,157 (77%)
depression	1,776 (22%)
age_group
18-34	1,294 (16%)
35-49	1,657 (21%)
50-64	2,109 (26%)
65+	2,940 (37%)
income
Less than $25K	1,090 (14%)
$25K-$49K	1,931 (24%)
$50K-$99K	4,297 (54%)
$100K+	682 (8.5%)
education
Less than HS	391 (4.9%)
HS/GED	1,887 (24%)
Some college	2,115 (26%)
College graduate	3,607 (45%)
smoking
Never	4,808 (60%)
Former	2,230 (28%)
Current	962 (12%)
gen_health
Excellent/Very Good	3,926 (49%)
Good	2,648 (33%)
Fair/Poor	1,426 (18%)
marital
Married/Partnered	4,582 (57%)
Previously married	2,050 (26%)
Never married	1,368 (17%)
¹ Mean (SD); n (%)

Part 1: Model Selection for Linear Regression

#Step 1: Fit the Maximum Model

# Fit maximum model
max_model <- lm(
  physhlth_days ~ menthlth_days + bmi +
    sex + exercise + depression +
    age_group + income + education +
    smoking + gen_health + marital,
  data = brfss_analytic
)

summary(max_model)

## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health + 
##     marital, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.20132    0.62746   3.508 0.000453 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## income$25K-$49K           -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50K-$99K           -1.52385    0.27607  -5.520 3.50e-08 ***
## income$100K+              -1.31580    0.39821  -3.304 0.000956 ***
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## smokingFormer              0.43777    0.18776   2.332 0.019749 *  
## smokingCurrent             0.46414    0.26617   1.744 0.081241 .  
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16

# AIC and BIC
AIC(max_model)

## [1] 54115.05

BIC(max_model)

## [1] 54268.77

#A maximum linear regression model was fit including all selected predictors. The model had an R-squared of 0.326, indicating that approximately 32.6% of the variability in physically unhealthy days is explained by the predictors in the model. The adjusted R-squared was 0.324, which is very close to the R-squared, suggesting that the model does not suffer substantially from overfitting despite including multiple predictors. #The AIC and BIC values for the model were 54,115.05 and 54,268.77, respectively. These values will be used to compare alternative models, with lower values indicating better model fit while accounting for model complexity.

#Step 2: Best Subsets Regression

best_sub <- regsubsets(
  physhlth_days ~ menthlth_days + bmi +
    sex + exercise + depression +
    age_group + income + education +
    smoking + gen_health + marital,
  data = brfss_analytic,
  nvmax = 20   
)

best_summary <- summary(best_sub)

adjr2 <- best_summary$adjr2
bic <- best_summary$bic

best_table <- data.frame(
  model_size = 1:length(adjr2),
  adjr2 = adjr2,
  bic = bic
)

best_table

##    model_size     adjr2       bic
## 1           1 0.2626622 -2420.699
## 2           2 0.2955836 -2778.124
## 3           3 0.3074432 -2905.972
## 4           4 0.3133145 -2966.096
## 5           5 0.3160228 -2989.725
## 6           6 0.3182630 -3007.984
## 7           7 0.3196962 -3016.833
## 8           8 0.3207508 -3021.258
## 9           9 0.3214842 -3021.914
## 10         10 0.3220983 -3021.173
## 11         11 0.3225305 -3018.289
## 12         12 0.3230967 -3016.992
## 13         13 0.3235160 -3013.964
## 14         14 0.3237326 -3008.540
## 15         15 0.3239005 -3002.541
## 16         16 0.3239915 -2995.632
## 17         17 0.3240695 -2988.571
## 18         18 0.3241394 -2981.413
## 19         19 0.3241928 -2974.060
## 20         20 0.3241536 -2965.611

#Best subsets regression was performed using regsubsets(), with the maximum number of predictors set equal to the number of parameters in the full model. #The model with the highest adjusted R-squared included 10 predictors, indicating that this model explains the greatest proportion of variability in physically unhealthy days after accounting for model complexity. #The model with the lowest BIC included 9 predictors, suggesting that this model provides the best balance between goodness-of-fit and model parsimony. #The two criteria did not agree on the optimal model size. Adjusted R-squared favored a slightly larger model, while BIC favored a simpler model with fewer predictors.

#Step 3: Stepwise Selection Methods

# Maximum model (already created earlier)
max_model <- lm(
  physhlth_days ~ menthlth_days + bmi +
    sex + exercise + depression +
    age_group + income + education +
    smoking + gen_health + marital,
  data = brfss_analytic
)

# Null (intercept-only) model
null_model <- lm(physhlth_days ~ 1, data = brfss_analytic)

# 1. Backward elimination
backward_model <- step(max_model, direction = "backward", trace = 0)

# 2. Forward selection
forward_model <- step(null_model,
                      scope = formula(max_model),
                      direction = "forward",
                      trace = 0)

# 3. Stepwise selection
stepwise_model <- step(null_model,
                       scope = formula(max_model),
                       direction = "both",
                       trace = 0)

formula(backward_model)

## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health

formula(forward_model)

## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex

formula(stepwise_model)

## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex

#The final backward elimination model included the following predictors: menthlth_days, sex, exercise, depression, age_group, income, education, smoking, and gen_health. #The final forward selection model included: gen_health, menthlth_days, exercise, age_group, income, depression, education, smoking, and sex. #The final stepwise selection model included: gen_health, menthlth_days, exercise, age_group, income, depression, education, smoking, and sex.

#Step 4: Final Model Selection and Interpretation

library(broom)

tidy(stepwise_model, conf.int = TRUE)

## # A tibble: 18 × 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)            1.39     0.487      2.84  4.49e-  3   0.430      2.34 
##  2 gen_healthGood         1.36     0.184      7.42  1.33e- 13   1.00       1.72 
##  3 gen_healthFair/Poor   10.0      0.249     40.2   2.57e-322   9.51      10.5  
##  4 menthlth_days          0.184    0.0114    16.2   6.29e- 58   0.161      0.206
##  5 exerciseYes           -1.90     0.203     -9.38  8.49e- 21  -2.30      -1.51 
##  6 age_group35-49         0.837    0.270      3.10  1.96e-  3   0.308      1.37 
##  7 age_group50-64         1.48     0.258      5.72  1.10e-  8   0.971      1.98 
##  8 age_group65+           1.84     0.251      7.31  2.93e- 13   1.34       2.33 
##  9 income$25K-$49K       -1.03     0.275     -3.76  1.74e-  4  -1.57      -0.494
## 10 income$50K-$99K       -1.39     0.266     -5.22  1.80e-  7  -1.91      -0.867
## 11 income$100K+          -1.11     0.385     -2.89  3.87e-  3  -1.87      -0.358
## 12 depressionYes          0.747    0.215      3.48  5.10e-  4   0.326      1.17 
## 13 educationHS/GED        0.258    0.401      0.643 5.20e-  1  -0.528      1.04 
## 14 educationSome coll…    0.964    0.402      2.39  1.67e-  2   0.175      1.75 
## 15 educationCollege g…    1.03     0.404      2.55  1.08e-  2   0.239      1.82 
## 16 smokingFormer          0.440    0.188      2.34  1.91e-  2   0.0720     0.808
## 17 smokingCurrent         0.478    0.265      1.80  7.13e-  2  -0.0414     0.997
## 18 sexFemale              0.233    0.163      1.43  1.54e-  1  -0.0872     0.553

#The final model was selected based on agreement across the stepwise selection methods (backward, forward, and stepwise), all of which identified the same set of predictors. This model is also consistent with the best subsets results, which suggested an optimal model size of 9–10 predictors. Therefore, the stepwise model was chosen because it provides a good balance between model fit and parsimony while appropriately handling categorical variables as complete factors.

#Step 5: LINE Assumption Check Diagnostic plots were produced using the final linear model to assess the LINE assumptions. #Residuals vs. Fitted: The residuals appear to be randomly scattered around zero with no strong systematic pattern, suggesting that the linearity assumption is reasonably satisfied. There is slight evidence of increasing spread at higher fitted values, indicating mild heteroscedasticity. #Normal Q-Q: The residuals generally follow the reference line, indicating approximate normality. However, there are deviations in the tails, suggesting that extreme values may not be perfectly normally distributed. #Scale-Location: The spread of residuals is relatively consistent across fitted values, although there is a slight upward trend, indicating mild non-constant variance. Overall, the assumption of homoscedasticity appears to be reasonably met. #Residuals vs. Leverage: There are a few observations with higher leverage, but none appear to have excessively large Cook’s distance. This suggests that there are no highly influential observations that unduly affect the model.

#Overall, the LINE assumptions appear to be reasonably satisfied for the final model. While there is some mild deviation from normality in the tails and slight evidence of non-constant variance, these violations are not severe. Therefore, the linear regression model is considered appropriate for this analysis.

Part 2: Logistic Regression

#Step 1: Create the Binary Outcome

brfss_analytic <- brfss_analytic %>%
  mutate(
    frequent_distress = factor(
      if_else(physhlth_days >= 14, "Yes", "No"),
      levels = c("No", "Yes")
    )
  )

table(brfss_analytic$frequent_distress)

## 
##   No  Yes 
## 6948 1052

prop.table(table(brfss_analytic$frequent_distress)) * 100

## 
##    No   Yes 
## 86.85 13.15

#A binary outcome variable, frequent_distress, was created, where individuals reporting 14 or more physically unhealthy days were coded as “Yes” and those reporting fewer than 14 days were coded as “No”. #In the analytic sample, 6,948 (86.85%) of participants did not report frequent physical distress, while 1,052 (13.15%) reported frequent physical distress. #The outcome is somewhat imbalanced, with a smaller proportion of individuals experiencing frequent distress; however, there are sufficient observations in both groups to support logistic regression analysis.

#Step 2: Simple (Unadjusted) Logistic Regression

# Simple logistic regression with general health
mod_simple <- glm(
  frequent_distress ~ gen_health,
  data = brfss_analytic,
  family = binomial
)

summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.39831    0.09021  -37.67   <2e-16 ***
## gen_healthGood       1.14179    0.11198   10.20   <2e-16 ***
## gen_healthFair/Poor  3.28880    0.10465   31.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4754.1  on 7997  degrees of freedom
## AIC: 4760.1
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios
exp(coef(mod_simple))

##         (Intercept)      gen_healthGood gen_healthFair/Poor 
##          0.03342985          3.13235705         26.81066762

# 95% CI for OR
exp(confint(mod_simple))

##                           2.5 %      97.5 %
## (Intercept)          0.02787183  0.03970661
## gen_healthGood       2.52057996  3.91109619
## gen_healthFair/Poor 21.91471664 33.03776143

#General health status was selected as the predictor because it showed a strong association with physically unhealthy days in the linear regression model and is conceptually related to overall physical well-being. A simple logistic regression model was fit with frequent physical distress as the outcome and general health as the predictor. #Compared to individuals reporting excellent or very good health, those reporting fair or poor health had 26.81 times higher odds of frequent physical distress, 95% CI [21.91, 33.04], holding all other variables constant (in this unadjusted model, no other predictors are included). Additionally, individuals reporting good health had 3.13 times higher odds of frequent physical distress compared to those reporting excellent or very good health, 95% CI [2.52, 3.91]. #The associations were statistically significant at α = 0.05, as the p-values were less than 0.001 and the 95% confidence intervals do not include 1.0.

###Step 3: Multiple Logistic Regression

mod_logistic <- glm(
  frequent_distress ~ menthlth_days + sex + exercise + 
    depression + age_group + income + education + 
    smoking + gen_health,
  data = brfss_analytic,
  family = binomial
)

summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health, 
##     family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.869370   0.237416 -16.298  < 2e-16 ***
## menthlth_days              0.044803   0.004351  10.298  < 2e-16 ***
## sexFemale                  0.167499   0.080816   2.073 0.038208 *  
## exerciseYes               -0.577926   0.085063  -6.794 1.09e-11 ***
## depressionYes              0.258162   0.095637   2.699 0.006946 ** 
## age_group35-49             0.522042   0.157941   3.305 0.000949 ***
## age_group50-64             0.775775   0.146727   5.287 1.24e-07 ***
## age_group65+               0.980003   0.144115   6.800 1.05e-11 ***
## income$25K-$49K           -0.190466   0.110542  -1.723 0.084887 .  
## income$50K-$99K           -0.439184   0.111295  -3.946 7.94e-05 ***
## income$100K+              -0.451308   0.220972  -2.042 0.041115 *  
## educationHS/GED            0.034581   0.164741   0.210 0.833738    
## educationSome college      0.315308   0.164722   1.914 0.055597 .  
## educationCollege graduate  0.266869   0.171000   1.561 0.118610    
## smokingFormer              0.199548   0.089583   2.228 0.025913 *  
## smokingCurrent             0.205214   0.115062   1.784 0.074504 .  
## gen_healthGood             0.883594   0.115593   7.644 2.11e-14 ***
## gen_healthFair/Poor        2.676571   0.113511  23.580  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4423.9  on 7982  degrees of freedom
## AIC: 4459.9
## 
## Number of Fisher Scoring iterations: 6

tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE) |>
  kable(digits = 3)

term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.021	0.237	-16.298	0.000	0.013	0.033
menthlth_days	1.046	0.004	10.298	0.000	1.037	1.055
sexFemale	1.182	0.081	2.073	0.038	1.009	1.386
exerciseYes	0.561	0.085	-6.794	0.000	0.475	0.663
depressionYes	1.295	0.096	2.699	0.007	1.072	1.560
age_group35-49	1.685	0.158	3.305	0.001	1.240	2.304
age_group50-64	2.172	0.147	5.287	0.000	1.635	2.908
age_group65+	2.664	0.144	6.800	0.000	2.017	3.551
income$25K-$49K	0.827	0.111	-1.723	0.085	0.666	1.027
income$50K-$99K	0.645	0.111	-3.946	0.000	0.519	0.802
income$100K+	0.637	0.221	-2.042	0.041	0.408	0.971
educationHS/GED	1.035	0.165	0.210	0.834	0.751	1.434
educationSome college	1.371	0.165	1.914	0.056	0.995	1.899
educationCollege graduate	1.306	0.171	1.561	0.119	0.937	1.832
smokingFormer	1.221	0.090	2.228	0.026	1.024	1.455
smokingCurrent	1.228	0.115	1.784	0.075	0.979	1.536
gen_healthGood	2.420	0.116	7.644	0.000	1.933	3.042
gen_healthFair/Poor	14.535	0.114	23.580	0.000	11.670	18.215

#A multiple logistic regression model was fit to examine the association between selected predictors and frequent physical distress, adjusting for all variables included in the final linear model. Adjusted odds ratios (ORs) with 95% confidence intervals were estimated for all predictors. #For menthlth_days, the adjusted odds ratio was 1.046, indicating that for each additional day of poor mental health, the odds of frequent physical distress were multiplied by 1.046, 95% CI [1.037, 1.055], holding all other variables constant.For exercise (Yes vs No), the adjusted odds ratio was 0.561, meaning that individuals who reported exercising had 0.56 times the odds (approximately 44% lower odds) of frequent physical distress compared to those who did not exercise, 95% CI [0.475, 0.663], holding all other variables constant. For general health, individuals reporting fair or poor health had 14.54 times higher odds of frequent physical distress compared to those reporting excellent or very good health, 95% CI [11.67, 18.22], holding all other variables constant. This indicates a very strong association between perceived general health and frequent physical distress.

###Step 4: Likelihood Ratio Test

# Reduced model
mod_reduced <- glm(
  frequent_distress ~ menthlth_days + sex + exercise + 
    depression + age_group + education + 
    smoking + gen_health,
  data = brfss_analytic,
  family = binomial
)

# Likelihood ratio test
anova(mod_reduced, mod_logistic, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + sex + exercise + depression + 
##     age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7985     4440.4                          
## 2      7982     4423.9  3   16.488 0.0009004 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#A likelihood ratio test was conducted to evaluate whether income levels collectively improve the logistic regression model. The null hypothesis was that all income coefficients are equal to zero (income does not improve the model), while the alternative hypothesis was that at least one income coefficient differs from zero. #The likelihood ratio test yielded a chi-square statistic (deviance) of 16.488 with 3 degrees of freedom and a p-value of 0.0009 . At α = 0.05, the null hypothesis was rejected, indicating that income levels significantly improve the model fit.

###Step 5: ROC Curve and AUC

roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))

plot(roc_obj,
     main = "ROC Curve: Frequent Physical Distress Model",
     print.auc = TRUE)

# AUC
auc(roc_obj)

## Area under the curve: 0.8555

# 95% CI
ci.auc(roc_obj)

## 95% CI: 0.8427-0.8683 (DeLong)

#The ROC curve was generated to evaluate the performance of the multiple logistic regression model. The area under the curve (AUC) was 0.856, indicating the model’s overall ability to discriminate between individuals with and without frequent physical distress. #Based on this value, the model demonstrates excellent discrimination, meaning it is highly effective at correctly distinguishing between individuals who experience frequent physical distress and those who do not.

###Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

hl <- hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
hl

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509

#A Hosmer–Lemeshow goodness-of-fit test was conducted to assess model calibration. The null hypothesis states that the model fits the data well, while the alternative hypothesis states that the model does not fit the data well. The test yielded a chi-square statistic of 6.87 with 8 degrees of freedom and a p-value of 0.551. At α = 0.05, the null hypothesis was not rejected, indicating that there is no evidence of poor model fit. #The non-significant Hosmer–Lemeshow test suggests that the model fits the data adequately (good calibration). This complements the AUC of 0.856, which indicates excellent discrimination, suggesting that the model both distinguishes well between individuals and fits the observed data reasonably well.

Part 3: Reflection

#The results show that general health, mental health, and exercise are strongly related to physical health burden among U.S. adults. In both models, people with fair or poor general health had much worse outcomes, and more mentally unhealthy days were linked to more physically unhealthy days. Exercise was protective, with people who exercised having better health outcomes. Some variables like education and smoking were less consistent, being weaker or not significant in one of the models. A limitation of this study is that it uses cross-sectional data, so we cannot say anything about cause and effect. Also, some important confounders were not included, such as access to healthcare and presence of chronic diseases.

#The linear regression model shows how predictors affect the number of physically unhealthy days, while the logistic model shows the likelihood of having frequent distress. Linear regression gives more detailed information about changes in the outcome, while logistic regression is easier to interpret for risk. I would use linear regression when the outcome is continuous and logistic regression when focusing on classification or risk. The models are also evaluated differently: linear regression uses R-squared, while logistic regression uses AUC to measure performance.

#END

Homework 4: Model Selection and Introduction to Logistic Regression

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Nursultan

April 24, 2026

Setup and Data

Part 0: Data Preparation

Part 1: Model Selection for Linear Regression

Part 2: Logistic Regression

Part 3: Reflection