Part 0: Data Preparation

Step 1: Load Required Packages and Select Variables

library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)

brfss_raw <- read_xpt("C:/Users/MY789914/OneDrive - University at Albany - SUNY/Desktop/Stat 553 (R)/Assignment 4/LLCP2023.XPT")

cat("Raw dataset dimensions:\n")

## Raw dataset dimensions:

cat("  Rows:   ", nrow(brfss_raw), "\n")

##   Rows:    433323

cat("  Columns:", ncol(brfss_raw), "\n")

##   Columns: 350

The 2023 BRFSS raw dataset includes 433,323 observations and 350 variables, with each row corresponding to a single respondent from U.S. states and territories surveyed in 2023.

brfss <- brfss_raw %>%
  select(
    PHYSHLTH,
    MENTHLTH,
    `_BMI5`,
    SEXVAR,
    EXERANY2,
    ADDEPEV3,
    `_AGEG5YR`,
    `_INCOMG1`,
    EDUCA,
    `_SMOKER3`,
    GENHLTH,
    MARITAL
  )

Step 2: Recode All Variables

#Outcome 
brfss <- brfss %>%
  mutate(
    physhlth_days = case_when(
      PHYSHLTH == 88 ~ 0,
      PHYSHLTH %in% c(77, 99) ~ NA_real_,
      TRUE ~ as.numeric(PHYSHLTH)
    )
  )

#Continuous predictors
brfss <- brfss %>%
  mutate(
    menthlth_days = case_when(
      MENTHLTH == 88 ~ 0,
      MENTHLTH %in% c(77, 99) ~ NA_real_,
      TRUE ~ as.numeric(MENTHLTH)
    ),
    bmi = case_when(
      `_BMI5` == 9999 ~ NA_real_,
      TRUE ~ `_BMI5` / 100
    )
  )

#Binary predictors 
brfss <- brfss %>%
  mutate(
    sex = factor(SEXVAR, levels = c(1, 2), labels = c("Male", "Female")),
    exercise = factor(case_when(
      EXERANY2 == 1 ~ "Yes",
      EXERANY2 == 2 ~ "No",
      TRUE ~ NA_character_
    )),
    depression = factor(case_when(
      ADDEPEV3 == 1 ~ "Yes",
      ADDEPEV3 == 2 ~ "No",
      TRUE ~ NA_character_
    ))
  )

#Categorical predictors 
brfss <- brfss %>%
  mutate(
    age_group = case_when(
      `_AGEG5YR` %in% 1:3 ~ "18-34",
      `_AGEG5YR` %in% 4:6 ~ "35-49",
      `_AGEG5YR` %in% 7:9 ~ "50-64",
      `_AGEG5YR` %in% 10:13 ~ "65+",
      TRUE ~ NA_character_
    ),
    income = case_when(
      `_INCOMG1` %in% 1:2 ~ "Less than $25K",
      `_INCOMG1` %in% 3:4 ~ "$25K-$49K",
      `_INCOMG1` %in% 5:6 ~ "$50K-$99K",
      `_INCOMG1` == 7 ~ "$100K+",
      TRUE ~ NA_character_
    ),
    education = case_when(
      EDUCA %in% 1:3 ~ "Less than HS",
      EDUCA == 4 ~ "HS/GED",
      EDUCA == 5 ~ "Some college",
      EDUCA == 6 ~ "College graduate",
      TRUE ~ NA_character_
    ),
    smoking = case_when(
      `_SMOKER3` %in% 1:2 ~ "Current",
      `_SMOKER3` == 3 ~ "Former",
      `_SMOKER3` == 4 ~ "Never",
      TRUE ~ NA_character_
    ),
    gen_health = case_when(
      GENHLTH %in% 1:2 ~ "Excellent/Very Good",
      GENHLTH == 3 ~ "Good",
      GENHLTH %in% 4:5 ~ "Fair/Poor",
      TRUE ~ NA_character_
    ),
    marital = case_when(
      MARITAL %in% c(1, 6) ~ "Married/Partnered",
      MARITAL %in% 2:4 ~ "Previously married",
      MARITAL == 5 ~ "Never married",
      TRUE ~ NA_character_
    )
  )

#Set reference levels 
brfss <- brfss %>%
  mutate(
    sex = factor(sex) %>% relevel("Male"),
    exercise = factor(exercise) %>% relevel("No"),
    depression = factor(depression) %>% relevel("No"),
    age_group = factor(age_group) %>% relevel("18-34"),
    income = factor(income) %>% relevel("Less than $25K"),
    education = factor(education) %>% relevel("Less than HS"),
    smoking = factor(smoking) %>% relevel("Never"),
    gen_health = factor(gen_health) %>% relevel("Excellent/Very Good"),
    marital = factor(marital) %>% relevel("Married/Partnered")
  )

saveRDS(brfss, "C:/Users/MY789914/OneDrive - University at Albany - SUNY/Desktop/Stat 553 (R)/Assignment 4/brfss_clean2.rds")

cat("Saved: brfss_clean2.rds —", nrow(brfss), "rows,",
    ncol(brfss), "columns\n")

## Saved: brfss_clean2.rds — 433323 rows, 24 columns

Step 3: Create Analytic Dataset and Report

# Check missingness BEFORE removing
missing_summary <- brfss %>%
  summarise(
    n_total = n(),
    miss_physhlth = sum(is.na(physhlth_days)),
    pct_physhlth = round(mean(is.na(physhlth_days)) * 100, 2),
    miss_menthlth = sum(is.na(menthlth_days)),
    pct_menthlth = round(mean(is.na(menthlth_days)) * 100, 2),
    miss_bmi = sum(is.na(bmi)),
    pct_bmi = round(mean(is.na(bmi)) * 100, 2)
  )

missing_summary

## # A tibble: 1 × 7
##   n_total miss_physhlth pct_physhlth miss_menthlth pct_menthlth miss_bmi pct_bmi
##     <int>         <int>        <dbl>         <int>        <dbl>    <int>   <dbl>
## 1  433323         10785         2.49          8108         1.87    40535    9.35

# Remove missing values
brfss_clean <- brfss %>%
  drop_na()

# Draw random sample
set.seed(1220)

brfss_analytic <- brfss_clean %>%
  slice_sample(n = 8000)

# Final analytic sample size
nrow(brfss_analytic)

## [1] 8000

# Descriptive summary table
table1 <- brfss_analytic %>%
  select(
    physhlth_days,
    menthlth_days,
    bmi,
    sex,
    exercise,
    depression,
    age_group,
    income,
    education,
    smoking,
    gen_health,
    marital
  ) %>%
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{median} ({p25}, {p75})",
      all_categorical() ~ "{n} ({p}%)"
    )
  ) %>%
  modify_caption("Table 1. Descriptive Characteristics of Analytic Sample (N = 8,000)") %>%
  modify_footnote(all_stat_cols() ~ NA)

table1

Table 1. Descriptive Characteristics of Analytic Sample (N = 8,000)
Characteristic	N = 8,000
physhlth_days	0 (0, 4)
menthlth_days	0 (0, 5)
bmi	27 (24, 32)
sex
Male	3,971 (50%)
Female	4,029 (50%)
exercise	6,157 (77%)
depression	1,776 (22%)
age_group
18-34	1,294 (16%)
35-49	1,657 (21%)
50-64	2,109 (26%)
65+	2,940 (37%)
income
Less than $25K	1,090 (14%)
$100K+	682 (8.5%)
$25K-$49K	1,931 (24%)
$50K-$99K	4,297 (54%)
education
Less than HS	391 (4.9%)
College graduate	3,607 (45%)
HS/GED	1,887 (24%)
Some college	2,115 (26%)
smoking
Never	4,808 (60%)
Current	962 (12%)
Former	2,230 (28%)
gen_health
Excellent/Very Good	3,926 (49%)
Fair/Poor	1,426 (18%)
Good	2,648 (33%)
marital
Married/Partnered	4,582 (57%)
Never married	1,368 (17%)
Previously married	2,050 (26%)

Part 1: Model Selection for Linear Regression

Step 1: Fit the Maximum Model

max_model <- lm(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic
)

summary(max_model)

## 
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health + 
##     marital, data = brfss_analytic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.795  -2.839  -1.145   0.690  31.047 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.20132    0.62746   3.508 0.000453 ***
## menthlth_days              0.18362    0.01135  16.172  < 2e-16 ***
## bmi                       -0.01877    0.01287  -1.459 0.144638    
## sexFemale                  0.23449    0.16389   1.431 0.152538    
## exerciseYes               -1.93289    0.20408  -9.471  < 2e-16 ***
## depressionYes              0.77536    0.21528   3.602 0.000318 ***
## age_group35-49             0.76935    0.28365   2.712 0.006696 ** 
## age_group50-64             1.40108    0.27874   5.026 5.11e-07 ***
## age_group65+               1.72725    0.27852   6.202 5.87e-10 ***
## income$100K+              -1.31580    0.39821  -3.304 0.000956 ***
## income$25K-$49K           -1.08440    0.27627  -3.925 8.74e-05 ***
## income$50K-$99K           -1.52385    0.27607  -5.520 3.50e-08 ***
## educationCollege graduate  1.05262    0.40445   2.603 0.009269 ** 
## educationHS/GED            0.29371    0.40089   0.733 0.463806    
## educationSome college      1.00198    0.40273   2.488 0.012867 *  
## smokingCurrent             0.46414    0.26617   1.744 0.081241 .  
## smokingFormer              0.43777    0.18776   2.332 0.019749 *  
## gen_healthFair/Poor       10.06509    0.25239  39.879  < 2e-16 ***
## gen_healthGood             1.40982    0.18635   7.565 4.30e-14 ***
## maritalNever married      -0.41244    0.24899  -1.656 0.097666 .  
## maritalPreviously married -0.26701    0.20679  -1.291 0.196680    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared:  0.3258, Adjusted R-squared:  0.3242 
## F-statistic: 192.8 on 20 and 7979 DF,  p-value: < 2.2e-16

AIC(max_model)

## [1] 54115.05

BIC(max_model)

## [1] 54268.77

Interpretation: The model explained about 32.6% of the variability in physically unhealthy days (R² = 0.3258). The adjusted R² (0.3242) was very similar, indicating that adding predictors did not substantially overfit the model. The AIC (54,115.05) and BIC (54,268.77) will be used for model comparison, with lower values indicating better model fit after accounting for model complexity.

Step 2: Best Subsets Regression

best_subsets <- regsubsets(
  physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
    age_group + income + education + smoking + gen_health + marital,
  data = brfss_analytic,
  nvmax = length(coef(max_model)) - 1
)

best_summary <- summary(best_subsets)

# Create summary table
best_table <- data.frame(
  model_size = 1:length(best_summary$adjr2),
  adj_r2 = best_summary$adjr2,
  bic = best_summary$bic
)

best_table

##    model_size    adj_r2       bic
## 1           1 0.2626622 -2420.699
## 2           2 0.2955836 -2778.124
## 3           3 0.3074432 -2905.972
## 4           4 0.3133145 -2966.096
## 5           5 0.3160228 -2989.725
## 6           6 0.3182630 -3007.984
## 7           7 0.3196962 -3016.833
## 8           8 0.3207508 -3021.258
## 9           9 0.3214842 -3021.914
## 10         10 0.3220983 -3021.173
## 11         11 0.3225305 -3018.289
## 12         12 0.3230967 -3016.992
## 13         13 0.3235160 -3013.964
## 14         14 0.3237326 -3008.540
## 15         15 0.3239005 -3002.541
## 16         16 0.3239915 -2995.632
## 17         17 0.3240695 -2988.571
## 18         18 0.3241394 -2981.413
## 19         19 0.3241928 -2974.060
## 20         20 0.3241536 -2965.611

# Adjusted R-squared plot
plot(
  best_table$model_size, best_table$adj_r2,
  type = "b", pch = 19,
  xlab = "Model Size",
  ylab = "Adjusted R-squared",
  main = "Adjusted R-squared Across Model Sizes"
)

# BIC plot
plot(
  best_table$model_size, best_table$bic,
  type = "b", pch = 19,
  xlab = "Model Size",
  ylab = "BIC",
  main = "BIC Across Model Sizes"
)

# Best model by adjusted R-squared
best_size_adjr2 <- which.max(best_summary$adjr2)
best_size_adjr2

## [1] 19

# Best model by BIC
best_size_bic <- which.min(best_summary$bic)
best_size_bic

## [1] 9

Interpretation: The model with the highest adjusted R-squared included 19 predictors, while the model with the lowest BIC included 9 predictors, so the two criteria do not agree on the best model size. Adjusted R-squared continues to increase as more predictors are added, whereas BIC reaches a minimum and then increases. Therefore, BIC favors the simpler model because it penalizes model complexity more strongly.

Step 3: Stepwise Selection Methods

# Backward elimination
backward_model <- step(max_model, direction = "backward", trace = 0)

# Forward selection
null_model <- lm(physhlth_days ~ 1, data = brfss_analytic)

forward_model <- step(
  null_model,
  scope = formula(max_model),
  direction = "forward",
  trace = 0
)

# Stepwise (both directions)
both_model <- step(
  null_model,
  scope = formula(max_model),
  direction = "both",
  trace = 0
)

# Show final models
formula(backward_model)

## physhlth_days ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health

formula(forward_model)

## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex

formula(both_model)

## physhlth_days ~ gen_health + menthlth_days + exercise + age_group + 
##     income + depression + education + smoking + sex

Interpretation: The final model from backward elimination included: menthlth_days, sex, exercise, depression, age_group, income, education, smoking, and gen_health.

The final model from forward selection included: gen_health, menthlth_days, exercise, age_group, income, depression, education, smoking, and sex.

The final model from stepwise selection included: gen_health, menthlth_days, exercise, age_group, income, depression, education, smoking, and sex.

All three stepwise selection methods resulted in the same final model, indicating strong agreement. The predictors retained in all models were menthlth_days, sex, exercise, depression, age_group, income, education, smoking, and gen_health. Marital status was excluded by all methods, suggesting it did not meaningfully contribute to the model.

Step 4: Final Model Selection and Interpretation

final_model <- backward_model

library(broom)

tidy(final_model, conf.int = TRUE)

## # A tibble: 18 × 7
##    term                estimate std.error statistic   p.value conf.low conf.high
##    <chr>                  <dbl>     <dbl>     <dbl>     <dbl>    <dbl>     <dbl>
##  1 (Intercept)            1.39     0.487      2.84  4.49e-  3   0.430      2.34 
##  2 menthlth_days          0.184    0.0114    16.2   6.29e- 58   0.161      0.206
##  3 sexFemale              0.233    0.163      1.43  1.54e-  1  -0.0872     0.553
##  4 exerciseYes           -1.90     0.203     -9.38  8.49e- 21  -2.30      -1.51 
##  5 depressionYes          0.747    0.215      3.48  5.10e-  4   0.326      1.17 
##  6 age_group35-49         0.837    0.270      3.10  1.96e-  3   0.308      1.37 
##  7 age_group50-64         1.48     0.258      5.72  1.10e-  8   0.971      1.98 
##  8 age_group65+           1.84     0.251      7.31  2.93e- 13   1.34       2.33 
##  9 income$100K+          -1.11     0.385     -2.89  3.87e-  3  -1.87      -0.358
## 10 income$25K-$49K       -1.03     0.275     -3.76  1.74e-  4  -1.57      -0.494
## 11 income$50K-$99K       -1.39     0.266     -5.22  1.80e-  7  -1.91      -0.867
## 12 educationCollege g…    1.03     0.404      2.55  1.08e-  2   0.239      1.82 
## 13 educationHS/GED        0.258    0.401      0.643 5.20e-  1  -0.528      1.04 
## 14 educationSome coll…    0.964    0.402      2.39  1.67e-  2   0.175      1.75 
## 15 smokingCurrent         0.478    0.265      1.80  7.13e-  2  -0.0414     0.997
## 16 smokingFormer          0.440    0.188      2.34  1.91e-  2   0.0720     0.808
## 17 gen_healthFair/Poor   10.0      0.249     40.2   2.57e-322   9.51      10.5  
## 18 gen_healthGood         1.36     0.184      7.42  1.33e- 13   1.00       1.72

Interpretation: Holding all other variables constant, each additional mentally unhealthy day was associated with an average increase of 0.18 physically unhealthy days. Individuals who reported any exercise had about 1.90 fewer physically unhealthy days compared to those who did not exercise. Additionally, individuals reporting fair or poor general health had approximately 10 more physically unhealthy days compared to those with excellent or very good health.

Step 5: LINE Assumption Check

par(mfrow = c(2, 2))
plot(final_model)

par(mfrow = c(1, 1))

#Residuals vs. Fitted: The residuals show a clear pattern with a downward trend and a funnel shape, indicating non-linearity and non-constant variance (heteroscedasticity). This suggests that the linearity and constant variance assumptions are somewhat violated. #Normal Q-Q: The residuals deviate noticeably from the reference line, especially in the upper tail, indicating that the residuals are not normally distributed, with heavier tails and potential skewness. #Scale-Location: The spread of residuals increases with fitted values, as shown by the upward trend, indicating heteroscedasticity (non-constant variance). #Residuals vs. Leverage: Most observations have low leverage, and there are no points with extremely large Cook’s distance, suggesting no highly influential observations.

Overall, some LINE assumptions are not fully satisfied, particularly linearity and constant variance, as well as normality of residuals. However, no highly influential observations were identified. These violations are common in large observational datasets and should be acknowledged, although the model may still provide useful insights.

Part 2: Logistic Regression

Step 1: Create the Binary Outcome

brfss_analytic <- brfss_analytic %>%
  mutate(
    frequent_distress = ifelse(physhlth_days >= 14, "Yes", "No"),
    frequent_distress = factor(frequent_distress, levels = c("No", "Yes"))
  )

# Check prevalence
distress_table <- brfss_analytic %>%
  count(frequent_distress) %>%
  mutate(percent = round(n / sum(n) * 100, 2))

distress_table

## # A tibble: 2 × 3
##   frequent_distress     n percent
##   <fct>             <int>   <dbl>
## 1 No                 6948    86.8
## 2 Yes                1052    13.2

#Prevalence: Among the analytic sample (N = 8,000), 1,052 (13.15%) participants reported frequent physical distress (≥14 days), while 6,948 (86.85%) did not.

Interpretation: The outcome is imbalanced, with a relatively small proportion of participants reporting frequent physical distress compared to those who did not. This imbalance should be considered when interpreting logistic regression results.

Step 2: Simple (Unadjusted) Logistic Regression

mod_simple <- glm(
  frequent_distress ~ gen_health,
  data = brfss_analytic,
  family = binomial
)

summary(mod_simple)

## 
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial, 
##     data = brfss_analytic)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -3.39831    0.09021  -37.67   <2e-16 ***
## gen_healthFair/Poor  3.28880    0.10465   31.43   <2e-16 ***
## gen_healthGood       1.14179    0.11198   10.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4754.1  on 7997  degrees of freedom
## AIC: 4760.1
## 
## Number of Fisher Scoring iterations: 6

# Odds ratios
exp(coef(mod_simple))

##         (Intercept) gen_healthFair/Poor      gen_healthGood 
##          0.03342985         26.81066762          3.13235705

# 95% confidence intervals
exp(confint(mod_simple))

##                           2.5 %      97.5 %
## (Intercept)          0.02787183  0.03970661
## gen_healthFair/Poor 21.91471664 33.03776143
## gen_healthGood       2.52057996  3.91109619

#General health status (gen_health) was selected because it showed a strong association with physically unhealthy days in the linear model and is conceptually related to physical distress.

Interpretation: Compared to individuals reporting excellent or very good health, those reporting fair or poor health had 26.81 times the odds of frequent physical distress (95% CI: 21.91, 33.04). Similarly, those reporting good health had 3.13 times the odds of frequent physical distress (95% CI: 2.52, 3.91).

These associations are statistically significant, as the p-values are < 0.05 and the 95% confidence intervals do not include 1.0.

Step 3: Multiple Logistic Regression

library(broom)
library(kableExtra)

mod_logistic <- glm(
  frequent_distress ~ menthlth_days + sex + exercise + depression +
    age_group + income + education + smoking + gen_health,
  data = brfss_analytic,
  family = binomial
)

summary(mod_logistic)

## 
## Call:
## glm(formula = frequent_distress ~ menthlth_days + sex + exercise + 
##     depression + age_group + income + education + smoking + gen_health, 
##     family = binomial, data = brfss_analytic)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -3.869370   0.237416 -16.298  < 2e-16 ***
## menthlth_days              0.044803   0.004351  10.298  < 2e-16 ***
## sexFemale                  0.167499   0.080816   2.073 0.038208 *  
## exerciseYes               -0.577926   0.085063  -6.794 1.09e-11 ***
## depressionYes              0.258162   0.095637   2.699 0.006946 ** 
## age_group35-49             0.522042   0.157941   3.305 0.000949 ***
## age_group50-64             0.775775   0.146727   5.287 1.24e-07 ***
## age_group65+               0.980003   0.144115   6.800 1.05e-11 ***
## income$100K+              -0.451308   0.220972  -2.042 0.041115 *  
## income$25K-$49K           -0.190466   0.110542  -1.723 0.084887 .  
## income$50K-$99K           -0.439184   0.111295  -3.946 7.94e-05 ***
## educationCollege graduate  0.266869   0.171000   1.561 0.118610    
## educationHS/GED            0.034581   0.164741   0.210 0.833738    
## educationSome college      0.315308   0.164722   1.914 0.055597 .  
## smokingCurrent             0.205214   0.115062   1.784 0.074504 .  
## smokingFormer              0.199548   0.089583   2.228 0.025913 *  
## gen_healthFair/Poor        2.676571   0.113511  23.580  < 2e-16 ***
## gen_healthGood             0.883594   0.115593   7.644 2.11e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6227.7  on 7999  degrees of freedom
## Residual deviance: 4423.9  on 7982  degrees of freedom
## AIC: 4459.9
## 
## Number of Fisher Scoring iterations: 6

# Adjusted odds ratios with 95% CI
logistic_table <- tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE)

logistic_table |> 
  kable(digits = 3)

term	estimate	std.error	statistic	p.value	conf.low	conf.high
(Intercept)	0.021	0.237	-16.298	0.000	0.013	0.033
menthlth_days	1.046	0.004	10.298	0.000	1.037	1.055
sexFemale	1.182	0.081	2.073	0.038	1.009	1.386
exerciseYes	0.561	0.085	-6.794	0.000	0.475	0.663
depressionYes	1.295	0.096	2.699	0.007	1.072	1.560
age_group35-49	1.685	0.158	3.305	0.001	1.240	2.304
age_group50-64	2.172	0.147	5.287	0.000	1.635	2.908
age_group65+	2.664	0.144	6.800	0.000	2.017	3.551
income$100K+	0.637	0.221	-2.042	0.041	0.408	0.971
income$25K-$49K	0.827	0.111	-1.723	0.085	0.666	1.027
income$50K-$99K	0.645	0.111	-3.946	0.000	0.519	0.802
educationCollege graduate	1.306	0.171	1.561	0.119	0.937	1.832
educationHS/GED	1.035	0.165	0.210	0.834	0.751	1.434
educationSome college	1.371	0.165	1.914	0.056	0.995	1.899
smokingCurrent	1.228	0.115	1.784	0.075	0.979	1.536
smokingFormer	1.221	0.090	2.228	0.026	1.024	1.455
gen_healthFair/Poor	14.535	0.114	23.580	0.000	11.670	18.215
gen_healthGood	2.420	0.116	7.644	0.000	1.933	3.042

Interpretation: Holding all other variables constant, each additional mentally unhealthy day was associated with a 4.6% increase in the odds of frequent physical distress (OR = 1.046, 95% CI: 1.037–1.055).

Individuals who reported any exercise had 44% lower odds of frequent physical distress compared to those who did not exercise (OR = 0.561, 95% CI: 0.475–0.663).

Additionally, individuals reporting fair or poor general health had 14.54 times the odds of frequent physical distress compared to those with excellent or very good health (95% CI: 11.67–18.22), indicating a very strong positive association.

Step 4: Likelihood Ratio Test

# Reduced model (remove income)
mod_reduced <- glm(
  frequent_distress ~ menthlth_days + sex + exercise + depression +
    age_group + education + smoking + gen_health,
  data = brfss_analytic,
  family = binomial
)

# Likelihood ratio test
anova(mod_reduced, mod_logistic, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: frequent_distress ~ menthlth_days + sex + exercise + depression + 
##     age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + sex + exercise + depression + 
##     age_group + income + education + smoking + gen_health
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7985     4440.4                          
## 2      7982     4423.9  3   16.488 0.0009004 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Hypotheses: Null hypothesis (H₀): The income variables do not improve the model fit (all income coefficients = 0). Alternative hypothesis (H₁): At least one income category is associated with frequent physical distress.

The likelihood ratio test comparing the reduced and full models yielded a deviance difference of 16.49 with 3 degrees of freedom (χ² = 16.49, df = 3), and a p-value = 0.0009.

Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the income variables significantly improve the model fit. Therefore, income should be retained in the model.

Step 5: ROC Curve and AUC

roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))

plot(
  roc_obj,
  main = "ROC Curve: Frequent Physical Distress Model",
  print.auc = TRUE
)

# AUC and 95% CI
auc(roc_obj)

## Area under the curve: 0.8555

ci.auc(roc_obj)

## 95% CI: 0.8427-0.8683 (DeLong)

Interpretation:

The area under the ROC curve (AUC) was 0.856 (95% CI: 0.843–0.868).

The AUC indicates excellent discrimination, meaning the model has a strong ability to distinguish between individuals with and without frequent physical distress. Specifically, the model can correctly rank a randomly selected individual with frequent physical distress higher than one without it approximately 85.6% of the time.

Step 6: Hosmer-Lemeshow Goodness-of-Fit Test

library(ResourceSelection)

hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509

Hypotheses: Null hypothesis (H₀): The model fits the data well. Alternative hypothesis (H₁): The model does not fit the data well.

Results: The Hosmer-Lemeshow test yielded a chi-square statistic of 6.87 with 8 degrees of freedom (χ² = 6.87, df = 8) and a p-value = 0.551.

Interpretation: Since the p-value is greater than 0.05, we fail to reject the null hypothesis and conclude that there is no evidence of poor model fit, indicating that the model fits the data adequately.

The Hosmer-Lemeshow test suggests good model calibration, while the ROC/AUC result indicated excellent discrimination. Together, these findings suggest that the model both fits the data well and effectively distinguishes between individuals with and without frequent physical distress.

Part 3: Reflection

The results from both the linear and logistic regression models suggest that several factors are strongly associated with physical health burden among U.S. adults. In particular, general health status and mentally unhealthy days emerged as the strongest predictors in both models. Individuals reporting fair or poor health had substantially more physically unhealthy days and much higher odds of frequent physical distress. Similarly, an increase in mentally unhealthy days was consistently associated with worse physical health outcomes, highlighting the close relationship between mental and physical health. Exercise also appeared to be a protective factor, as individuals who exercised reported fewer unhealthy days and lower odds of frequent distress. Some predictors, such as education and smoking, showed weaker or less consistent associations across the models.

A key limitation of this analysis is the use of cross-sectional survey data, which prevents establishing causal relationships and limits interpretation to associations only. Additionally, the reliance on self-reported data may introduce recall or reporting bias. Important potential confounders not included in the model include access to healthcare and chronic medical conditions, both of which could influence both predictors and physical health outcomes.

The linear regression model provides information on the average change in physically unhealthy days, while the logistic regression model estimates the odds of experiencing frequent physical distress, which is useful for identifying high-risk groups. Linear regression is preferred for continuous outcomes, whereas logistic regression is more appropriate for binary outcomes. Model evaluation also differs, with linear models using R-squared and logistic models relying on measures such as AUC and goodness-of-fit tests.

AI Transparency:

AI tool (Chat GPT) were used only for debugging. During this assignment, I encountered issues when running regression models and formatting outputs (errors when using regsubsets() and tidy() for model summaries). I used prompts such as: “Why is regsubsets() not running properly in R?” and “How to format logistic regression output with tidy() and confidence intervals?” The AI suggested checking package usage and function arguments. I verified the suggestions by re-running the code, confirming that the models executed correctly, and ensuring that outputs (coefficients, AUC, and tables) matched expectations from course materials.

EPI 553 — Homework 4: Model Selection, Logistic Regression

EPI 553: Principles of Statistical Inference II | Spring 2026

Maral Yerdenova

2026-04-22