library(tidyverse)
library(haven)
library(broom)
library(kableExtra)
library(car)
library(gtsummary)
library(leaps)
library(pROC)
library(ResourceSelection)
brfss_raw <- read_xpt("C:/Users/MY789914/OneDrive - University at Albany - SUNY/Desktop/Stat 553 (R)/Assignment 4/LLCP2023.XPT")
cat("Raw dataset dimensions:\n")
## Raw dataset dimensions:
cat(" Rows: ", nrow(brfss_raw), "\n")
## Rows: 433323
cat(" Columns:", ncol(brfss_raw), "\n")
## Columns: 350
The 2023 BRFSS raw dataset includes 433,323 observations and 350 variables, with each row corresponding to a single respondent from U.S. states and territories surveyed in 2023.
brfss <- brfss_raw %>%
select(
PHYSHLTH,
MENTHLTH,
`_BMI5`,
SEXVAR,
EXERANY2,
ADDEPEV3,
`_AGEG5YR`,
`_INCOMG1`,
EDUCA,
`_SMOKER3`,
GENHLTH,
MARITAL
)
#Outcome
brfss <- brfss %>%
mutate(
physhlth_days = case_when(
PHYSHLTH == 88 ~ 0,
PHYSHLTH %in% c(77, 99) ~ NA_real_,
TRUE ~ as.numeric(PHYSHLTH)
)
)
#Continuous predictors
brfss <- brfss %>%
mutate(
menthlth_days = case_when(
MENTHLTH == 88 ~ 0,
MENTHLTH %in% c(77, 99) ~ NA_real_,
TRUE ~ as.numeric(MENTHLTH)
),
bmi = case_when(
`_BMI5` == 9999 ~ NA_real_,
TRUE ~ `_BMI5` / 100
)
)
#Binary predictors
brfss <- brfss %>%
mutate(
sex = factor(SEXVAR, levels = c(1, 2), labels = c("Male", "Female")),
exercise = factor(case_when(
EXERANY2 == 1 ~ "Yes",
EXERANY2 == 2 ~ "No",
TRUE ~ NA_character_
)),
depression = factor(case_when(
ADDEPEV3 == 1 ~ "Yes",
ADDEPEV3 == 2 ~ "No",
TRUE ~ NA_character_
))
)
#Categorical predictors
brfss <- brfss %>%
mutate(
age_group = case_when(
`_AGEG5YR` %in% 1:3 ~ "18-34",
`_AGEG5YR` %in% 4:6 ~ "35-49",
`_AGEG5YR` %in% 7:9 ~ "50-64",
`_AGEG5YR` %in% 10:13 ~ "65+",
TRUE ~ NA_character_
),
income = case_when(
`_INCOMG1` %in% 1:2 ~ "Less than $25K",
`_INCOMG1` %in% 3:4 ~ "$25K-$49K",
`_INCOMG1` %in% 5:6 ~ "$50K-$99K",
`_INCOMG1` == 7 ~ "$100K+",
TRUE ~ NA_character_
),
education = case_when(
EDUCA %in% 1:3 ~ "Less than HS",
EDUCA == 4 ~ "HS/GED",
EDUCA == 5 ~ "Some college",
EDUCA == 6 ~ "College graduate",
TRUE ~ NA_character_
),
smoking = case_when(
`_SMOKER3` %in% 1:2 ~ "Current",
`_SMOKER3` == 3 ~ "Former",
`_SMOKER3` == 4 ~ "Never",
TRUE ~ NA_character_
),
gen_health = case_when(
GENHLTH %in% 1:2 ~ "Excellent/Very Good",
GENHLTH == 3 ~ "Good",
GENHLTH %in% 4:5 ~ "Fair/Poor",
TRUE ~ NA_character_
),
marital = case_when(
MARITAL %in% c(1, 6) ~ "Married/Partnered",
MARITAL %in% 2:4 ~ "Previously married",
MARITAL == 5 ~ "Never married",
TRUE ~ NA_character_
)
)
#Set reference levels
brfss <- brfss %>%
mutate(
sex = factor(sex) %>% relevel("Male"),
exercise = factor(exercise) %>% relevel("No"),
depression = factor(depression) %>% relevel("No"),
age_group = factor(age_group) %>% relevel("18-34"),
income = factor(income) %>% relevel("Less than $25K"),
education = factor(education) %>% relevel("Less than HS"),
smoking = factor(smoking) %>% relevel("Never"),
gen_health = factor(gen_health) %>% relevel("Excellent/Very Good"),
marital = factor(marital) %>% relevel("Married/Partnered")
)
saveRDS(brfss, "C:/Users/MY789914/OneDrive - University at Albany - SUNY/Desktop/Stat 553 (R)/Assignment 4/brfss_clean2.rds")
cat("Saved: brfss_clean2.rds —", nrow(brfss), "rows,",
ncol(brfss), "columns\n")
## Saved: brfss_clean2.rds — 433323 rows, 24 columns
# Check missingness BEFORE removing
missing_summary <- brfss %>%
summarise(
n_total = n(),
miss_physhlth = sum(is.na(physhlth_days)),
pct_physhlth = round(mean(is.na(physhlth_days)) * 100, 2),
miss_menthlth = sum(is.na(menthlth_days)),
pct_menthlth = round(mean(is.na(menthlth_days)) * 100, 2),
miss_bmi = sum(is.na(bmi)),
pct_bmi = round(mean(is.na(bmi)) * 100, 2)
)
missing_summary
## # A tibble: 1 × 7
## n_total miss_physhlth pct_physhlth miss_menthlth pct_menthlth miss_bmi pct_bmi
## <int> <int> <dbl> <int> <dbl> <int> <dbl>
## 1 433323 10785 2.49 8108 1.87 40535 9.35
# Remove missing values
brfss_clean <- brfss %>%
drop_na()
# Draw random sample
set.seed(1220)
brfss_analytic <- brfss_clean %>%
slice_sample(n = 8000)
# Final analytic sample size
nrow(brfss_analytic)
## [1] 8000
# Descriptive summary table
table1 <- brfss_analytic %>%
select(
physhlth_days,
menthlth_days,
bmi,
sex,
exercise,
depression,
age_group,
income,
education,
smoking,
gen_health,
marital
) %>%
tbl_summary(
statistic = list(
all_continuous() ~ "{median} ({p25}, {p75})",
all_categorical() ~ "{n} ({p}%)"
)
) %>%
modify_caption("Table 1. Descriptive Characteristics of Analytic Sample (N = 8,000)") %>%
modify_footnote(all_stat_cols() ~ NA)
table1
| Characteristic | N = 8,000 |
|---|---|
| physhlth_days | 0 (0, 4) |
| menthlth_days | 0 (0, 5) |
| bmi | 27 (24, 32) |
| sex | |
| Male | 3,971 (50%) |
| Female | 4,029 (50%) |
| exercise | 6,157 (77%) |
| depression | 1,776 (22%) |
| age_group | |
| 18-34 | 1,294 (16%) |
| 35-49 | 1,657 (21%) |
| 50-64 | 2,109 (26%) |
| 65+ | 2,940 (37%) |
| income | |
| Less than $25K | 1,090 (14%) |
| $100K+ | 682 (8.5%) |
| $25K-$49K | 1,931 (24%) |
| $50K-$99K | 4,297 (54%) |
| education | |
| Less than HS | 391 (4.9%) |
| College graduate | 3,607 (45%) |
| HS/GED | 1,887 (24%) |
| Some college | 2,115 (26%) |
| smoking | |
| Never | 4,808 (60%) |
| Current | 962 (12%) |
| Former | 2,230 (28%) |
| gen_health | |
| Excellent/Very Good | 3,926 (49%) |
| Fair/Poor | 1,426 (18%) |
| Good | 2,648 (33%) |
| marital | |
| Married/Partnered | 4,582 (57%) |
| Never married | 1,368 (17%) |
| Previously married | 2,050 (26%) |
max_model <- lm(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic
)
summary(max_model)
##
## Call:
## lm(formula = physhlth_days ~ menthlth_days + bmi + sex + exercise +
## depression + age_group + income + education + smoking + gen_health +
## marital, data = brfss_analytic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.795 -2.839 -1.145 0.690 31.047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.20132 0.62746 3.508 0.000453 ***
## menthlth_days 0.18362 0.01135 16.172 < 2e-16 ***
## bmi -0.01877 0.01287 -1.459 0.144638
## sexFemale 0.23449 0.16389 1.431 0.152538
## exerciseYes -1.93289 0.20408 -9.471 < 2e-16 ***
## depressionYes 0.77536 0.21528 3.602 0.000318 ***
## age_group35-49 0.76935 0.28365 2.712 0.006696 **
## age_group50-64 1.40108 0.27874 5.026 5.11e-07 ***
## age_group65+ 1.72725 0.27852 6.202 5.87e-10 ***
## income$100K+ -1.31580 0.39821 -3.304 0.000956 ***
## income$25K-$49K -1.08440 0.27627 -3.925 8.74e-05 ***
## income$50K-$99K -1.52385 0.27607 -5.520 3.50e-08 ***
## educationCollege graduate 1.05262 0.40445 2.603 0.009269 **
## educationHS/GED 0.29371 0.40089 0.733 0.463806
## educationSome college 1.00198 0.40273 2.488 0.012867 *
## smokingCurrent 0.46414 0.26617 1.744 0.081241 .
## smokingFormer 0.43777 0.18776 2.332 0.019749 *
## gen_healthFair/Poor 10.06509 0.25239 39.879 < 2e-16 ***
## gen_healthGood 1.40982 0.18635 7.565 4.30e-14 ***
## maritalNever married -0.41244 0.24899 -1.656 0.097666 .
## maritalPreviously married -0.26701 0.20679 -1.291 0.196680
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.112 on 7979 degrees of freedom
## Multiple R-squared: 0.3258, Adjusted R-squared: 0.3242
## F-statistic: 192.8 on 20 and 7979 DF, p-value: < 2.2e-16
AIC(max_model)
## [1] 54115.05
BIC(max_model)
## [1] 54268.77
Interpretation: The model explained about 32.6% of the variability in physically unhealthy days (R² = 0.3258). The adjusted R² (0.3242) was very similar, indicating that adding predictors did not substantially overfit the model. The AIC (54,115.05) and BIC (54,268.77) will be used for model comparison, with lower values indicating better model fit after accounting for model complexity.
best_subsets <- regsubsets(
physhlth_days ~ menthlth_days + bmi + sex + exercise + depression +
age_group + income + education + smoking + gen_health + marital,
data = brfss_analytic,
nvmax = length(coef(max_model)) - 1
)
best_summary <- summary(best_subsets)
# Create summary table
best_table <- data.frame(
model_size = 1:length(best_summary$adjr2),
adj_r2 = best_summary$adjr2,
bic = best_summary$bic
)
best_table
## model_size adj_r2 bic
## 1 1 0.2626622 -2420.699
## 2 2 0.2955836 -2778.124
## 3 3 0.3074432 -2905.972
## 4 4 0.3133145 -2966.096
## 5 5 0.3160228 -2989.725
## 6 6 0.3182630 -3007.984
## 7 7 0.3196962 -3016.833
## 8 8 0.3207508 -3021.258
## 9 9 0.3214842 -3021.914
## 10 10 0.3220983 -3021.173
## 11 11 0.3225305 -3018.289
## 12 12 0.3230967 -3016.992
## 13 13 0.3235160 -3013.964
## 14 14 0.3237326 -3008.540
## 15 15 0.3239005 -3002.541
## 16 16 0.3239915 -2995.632
## 17 17 0.3240695 -2988.571
## 18 18 0.3241394 -2981.413
## 19 19 0.3241928 -2974.060
## 20 20 0.3241536 -2965.611
# Adjusted R-squared plot
plot(
best_table$model_size, best_table$adj_r2,
type = "b", pch = 19,
xlab = "Model Size",
ylab = "Adjusted R-squared",
main = "Adjusted R-squared Across Model Sizes"
)
# BIC plot
plot(
best_table$model_size, best_table$bic,
type = "b", pch = 19,
xlab = "Model Size",
ylab = "BIC",
main = "BIC Across Model Sizes"
)
# Best model by adjusted R-squared
best_size_adjr2 <- which.max(best_summary$adjr2)
best_size_adjr2
## [1] 19
# Best model by BIC
best_size_bic <- which.min(best_summary$bic)
best_size_bic
## [1] 9
Interpretation: The model with the highest adjusted R-squared included 19 predictors, while the model with the lowest BIC included 9 predictors, so the two criteria do not agree on the best model size. Adjusted R-squared continues to increase as more predictors are added, whereas BIC reaches a minimum and then increases. Therefore, BIC favors the simpler model because it penalizes model complexity more strongly.
# Backward elimination
backward_model <- step(max_model, direction = "backward", trace = 0)
# Forward selection
null_model <- lm(physhlth_days ~ 1, data = brfss_analytic)
forward_model <- step(
null_model,
scope = formula(max_model),
direction = "forward",
trace = 0
)
# Stepwise (both directions)
both_model <- step(
null_model,
scope = formula(max_model),
direction = "both",
trace = 0
)
# Show final models
formula(backward_model)
## physhlth_days ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
formula(forward_model)
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
formula(both_model)
## physhlth_days ~ gen_health + menthlth_days + exercise + age_group +
## income + depression + education + smoking + sex
Interpretation: The final model from backward elimination included: menthlth_days, sex, exercise, depression, age_group, income, education, smoking, and gen_health.
The final model from forward selection included: gen_health, menthlth_days, exercise, age_group, income, depression, education, smoking, and sex.
The final model from stepwise selection included: gen_health, menthlth_days, exercise, age_group, income, depression, education, smoking, and sex.
All three stepwise selection methods resulted in the same final model, indicating strong agreement. The predictors retained in all models were menthlth_days, sex, exercise, depression, age_group, income, education, smoking, and gen_health. Marital status was excluded by all methods, suggesting it did not meaningfully contribute to the model.
final_model <- backward_model
library(broom)
tidy(final_model, conf.int = TRUE)
## # A tibble: 18 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.39 0.487 2.84 4.49e- 3 0.430 2.34
## 2 menthlth_days 0.184 0.0114 16.2 6.29e- 58 0.161 0.206
## 3 sexFemale 0.233 0.163 1.43 1.54e- 1 -0.0872 0.553
## 4 exerciseYes -1.90 0.203 -9.38 8.49e- 21 -2.30 -1.51
## 5 depressionYes 0.747 0.215 3.48 5.10e- 4 0.326 1.17
## 6 age_group35-49 0.837 0.270 3.10 1.96e- 3 0.308 1.37
## 7 age_group50-64 1.48 0.258 5.72 1.10e- 8 0.971 1.98
## 8 age_group65+ 1.84 0.251 7.31 2.93e- 13 1.34 2.33
## 9 income$100K+ -1.11 0.385 -2.89 3.87e- 3 -1.87 -0.358
## 10 income$25K-$49K -1.03 0.275 -3.76 1.74e- 4 -1.57 -0.494
## 11 income$50K-$99K -1.39 0.266 -5.22 1.80e- 7 -1.91 -0.867
## 12 educationCollege g… 1.03 0.404 2.55 1.08e- 2 0.239 1.82
## 13 educationHS/GED 0.258 0.401 0.643 5.20e- 1 -0.528 1.04
## 14 educationSome coll… 0.964 0.402 2.39 1.67e- 2 0.175 1.75
## 15 smokingCurrent 0.478 0.265 1.80 7.13e- 2 -0.0414 0.997
## 16 smokingFormer 0.440 0.188 2.34 1.91e- 2 0.0720 0.808
## 17 gen_healthFair/Poor 10.0 0.249 40.2 2.57e-322 9.51 10.5
## 18 gen_healthGood 1.36 0.184 7.42 1.33e- 13 1.00 1.72
Interpretation: Holding all other variables constant, each additional mentally unhealthy day was associated with an average increase of 0.18 physically unhealthy days. Individuals who reported any exercise had about 1.90 fewer physically unhealthy days compared to those who did not exercise. Additionally, individuals reporting fair or poor general health had approximately 10 more physically unhealthy days compared to those with excellent or very good health.
par(mfrow = c(2, 2))
plot(final_model)
par(mfrow = c(1, 1))
#Residuals vs. Fitted: The residuals show a clear pattern with a downward trend and a funnel shape, indicating non-linearity and non-constant variance (heteroscedasticity). This suggests that the linearity and constant variance assumptions are somewhat violated. #Normal Q-Q: The residuals deviate noticeably from the reference line, especially in the upper tail, indicating that the residuals are not normally distributed, with heavier tails and potential skewness. #Scale-Location: The spread of residuals increases with fitted values, as shown by the upward trend, indicating heteroscedasticity (non-constant variance). #Residuals vs. Leverage: Most observations have low leverage, and there are no points with extremely large Cook’s distance, suggesting no highly influential observations.
Overall, some LINE assumptions are not fully satisfied, particularly linearity and constant variance, as well as normality of residuals. However, no highly influential observations were identified. These violations are common in large observational datasets and should be acknowledged, although the model may still provide useful insights.
brfss_analytic <- brfss_analytic %>%
mutate(
frequent_distress = ifelse(physhlth_days >= 14, "Yes", "No"),
frequent_distress = factor(frequent_distress, levels = c("No", "Yes"))
)
# Check prevalence
distress_table <- brfss_analytic %>%
count(frequent_distress) %>%
mutate(percent = round(n / sum(n) * 100, 2))
distress_table
## # A tibble: 2 × 3
## frequent_distress n percent
## <fct> <int> <dbl>
## 1 No 6948 86.8
## 2 Yes 1052 13.2
#Prevalence: Among the analytic sample (N = 8,000), 1,052 (13.15%) participants reported frequent physical distress (≥14 days), while 6,948 (86.85%) did not.
Interpretation: The outcome is imbalanced, with a relatively small proportion of participants reporting frequent physical distress compared to those who did not. This imbalance should be considered when interpreting logistic regression results.
mod_simple <- glm(
frequent_distress ~ gen_health,
data = brfss_analytic,
family = binomial
)
summary(mod_simple)
##
## Call:
## glm(formula = frequent_distress ~ gen_health, family = binomial,
## data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.39831 0.09021 -37.67 <2e-16 ***
## gen_healthFair/Poor 3.28880 0.10465 31.43 <2e-16 ***
## gen_healthGood 1.14179 0.11198 10.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4754.1 on 7997 degrees of freedom
## AIC: 4760.1
##
## Number of Fisher Scoring iterations: 6
# Odds ratios
exp(coef(mod_simple))
## (Intercept) gen_healthFair/Poor gen_healthGood
## 0.03342985 26.81066762 3.13235705
# 95% confidence intervals
exp(confint(mod_simple))
## 2.5 % 97.5 %
## (Intercept) 0.02787183 0.03970661
## gen_healthFair/Poor 21.91471664 33.03776143
## gen_healthGood 2.52057996 3.91109619
#General health status (gen_health) was selected because it showed a strong association with physically unhealthy days in the linear model and is conceptually related to physical distress.
Interpretation: Compared to individuals reporting excellent or very good health, those reporting fair or poor health had 26.81 times the odds of frequent physical distress (95% CI: 21.91, 33.04). Similarly, those reporting good health had 3.13 times the odds of frequent physical distress (95% CI: 2.52, 3.91).
These associations are statistically significant, as the p-values are < 0.05 and the 95% confidence intervals do not include 1.0.
library(broom)
library(kableExtra)
mod_logistic <- glm(
frequent_distress ~ menthlth_days + sex + exercise + depression +
age_group + income + education + smoking + gen_health,
data = brfss_analytic,
family = binomial
)
summary(mod_logistic)
##
## Call:
## glm(formula = frequent_distress ~ menthlth_days + sex + exercise +
## depression + age_group + income + education + smoking + gen_health,
## family = binomial, data = brfss_analytic)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.869370 0.237416 -16.298 < 2e-16 ***
## menthlth_days 0.044803 0.004351 10.298 < 2e-16 ***
## sexFemale 0.167499 0.080816 2.073 0.038208 *
## exerciseYes -0.577926 0.085063 -6.794 1.09e-11 ***
## depressionYes 0.258162 0.095637 2.699 0.006946 **
## age_group35-49 0.522042 0.157941 3.305 0.000949 ***
## age_group50-64 0.775775 0.146727 5.287 1.24e-07 ***
## age_group65+ 0.980003 0.144115 6.800 1.05e-11 ***
## income$100K+ -0.451308 0.220972 -2.042 0.041115 *
## income$25K-$49K -0.190466 0.110542 -1.723 0.084887 .
## income$50K-$99K -0.439184 0.111295 -3.946 7.94e-05 ***
## educationCollege graduate 0.266869 0.171000 1.561 0.118610
## educationHS/GED 0.034581 0.164741 0.210 0.833738
## educationSome college 0.315308 0.164722 1.914 0.055597 .
## smokingCurrent 0.205214 0.115062 1.784 0.074504 .
## smokingFormer 0.199548 0.089583 2.228 0.025913 *
## gen_healthFair/Poor 2.676571 0.113511 23.580 < 2e-16 ***
## gen_healthGood 0.883594 0.115593 7.644 2.11e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6227.7 on 7999 degrees of freedom
## Residual deviance: 4423.9 on 7982 degrees of freedom
## AIC: 4459.9
##
## Number of Fisher Scoring iterations: 6
# Adjusted odds ratios with 95% CI
logistic_table <- tidy(mod_logistic, conf.int = TRUE, exponentiate = TRUE)
logistic_table |>
kable(digits = 3)
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.021 | 0.237 | -16.298 | 0.000 | 0.013 | 0.033 |
| menthlth_days | 1.046 | 0.004 | 10.298 | 0.000 | 1.037 | 1.055 |
| sexFemale | 1.182 | 0.081 | 2.073 | 0.038 | 1.009 | 1.386 |
| exerciseYes | 0.561 | 0.085 | -6.794 | 0.000 | 0.475 | 0.663 |
| depressionYes | 1.295 | 0.096 | 2.699 | 0.007 | 1.072 | 1.560 |
| age_group35-49 | 1.685 | 0.158 | 3.305 | 0.001 | 1.240 | 2.304 |
| age_group50-64 | 2.172 | 0.147 | 5.287 | 0.000 | 1.635 | 2.908 |
| age_group65+ | 2.664 | 0.144 | 6.800 | 0.000 | 2.017 | 3.551 |
| income$100K+ | 0.637 | 0.221 | -2.042 | 0.041 | 0.408 | 0.971 |
| income$25K-$49K | 0.827 | 0.111 | -1.723 | 0.085 | 0.666 | 1.027 |
| income$50K-$99K | 0.645 | 0.111 | -3.946 | 0.000 | 0.519 | 0.802 |
| educationCollege graduate | 1.306 | 0.171 | 1.561 | 0.119 | 0.937 | 1.832 |
| educationHS/GED | 1.035 | 0.165 | 0.210 | 0.834 | 0.751 | 1.434 |
| educationSome college | 1.371 | 0.165 | 1.914 | 0.056 | 0.995 | 1.899 |
| smokingCurrent | 1.228 | 0.115 | 1.784 | 0.075 | 0.979 | 1.536 |
| smokingFormer | 1.221 | 0.090 | 2.228 | 0.026 | 1.024 | 1.455 |
| gen_healthFair/Poor | 14.535 | 0.114 | 23.580 | 0.000 | 11.670 | 18.215 |
| gen_healthGood | 2.420 | 0.116 | 7.644 | 0.000 | 1.933 | 3.042 |
Interpretation: Holding all other variables constant, each additional mentally unhealthy day was associated with a 4.6% increase in the odds of frequent physical distress (OR = 1.046, 95% CI: 1.037–1.055).
Individuals who reported any exercise had 44% lower odds of frequent physical distress compared to those who did not exercise (OR = 0.561, 95% CI: 0.475–0.663).
Additionally, individuals reporting fair or poor general health had 14.54 times the odds of frequent physical distress compared to those with excellent or very good health (95% CI: 11.67–18.22), indicating a very strong positive association.
# Reduced model (remove income)
mod_reduced <- glm(
frequent_distress ~ menthlth_days + sex + exercise + depression +
age_group + education + smoking + gen_health,
data = brfss_analytic,
family = binomial
)
# Likelihood ratio test
anova(mod_reduced, mod_logistic, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: frequent_distress ~ menthlth_days + sex + exercise + depression +
## age_group + education + smoking + gen_health
## Model 2: frequent_distress ~ menthlth_days + sex + exercise + depression +
## age_group + income + education + smoking + gen_health
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7985 4440.4
## 2 7982 4423.9 3 16.488 0.0009004 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Hypotheses: Null hypothesis (H₀): The income variables do not improve the model fit (all income coefficients = 0). Alternative hypothesis (H₁): At least one income category is associated with frequent physical distress.
The likelihood ratio test comparing the reduced and full models yielded a deviance difference of 16.49 with 3 degrees of freedom (χ² = 16.49, df = 3), and a p-value = 0.0009.
Conclusion: Since the p-value is less than 0.05, we reject the null hypothesis and conclude that the income variables significantly improve the model fit. Therefore, income should be retained in the model.
roc_obj <- roc(brfss_analytic$frequent_distress, fitted(mod_logistic))
plot(
roc_obj,
main = "ROC Curve: Frequent Physical Distress Model",
print.auc = TRUE
)
# AUC and 95% CI
auc(roc_obj)
## Area under the curve: 0.8555
ci.auc(roc_obj)
## 95% CI: 0.8427-0.8683 (DeLong)
Interpretation:
The area under the ROC curve (AUC) was 0.856 (95% CI: 0.843–0.868).
The AUC indicates excellent discrimination, meaning the model has a strong ability to distinguish between individuals with and without frequent physical distress. Specifically, the model can correctly rank a randomly selected individual with frequent physical distress higher than one without it approximately 85.6% of the time.
library(ResourceSelection)
hoslem.test(mod_logistic$y, fitted(mod_logistic), g = 10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mod_logistic$y, fitted(mod_logistic)
## X-squared = 6.8688, df = 8, p-value = 0.5509
Hypotheses: Null hypothesis (H₀): The model fits the data well. Alternative hypothesis (H₁): The model does not fit the data well.
Results: The Hosmer-Lemeshow test yielded a chi-square statistic of 6.87 with 8 degrees of freedom (χ² = 6.87, df = 8) and a p-value = 0.551.
Interpretation: Since the p-value is greater than 0.05, we fail to reject the null hypothesis and conclude that there is no evidence of poor model fit, indicating that the model fits the data adequately.
The Hosmer-Lemeshow test suggests good model calibration, while the ROC/AUC result indicated excellent discrimination. Together, these findings suggest that the model both fits the data well and effectively distinguishes between individuals with and without frequent physical distress.
The results from both the linear and logistic regression models suggest that several factors are strongly associated with physical health burden among U.S. adults. In particular, general health status and mentally unhealthy days emerged as the strongest predictors in both models. Individuals reporting fair or poor health had substantially more physically unhealthy days and much higher odds of frequent physical distress. Similarly, an increase in mentally unhealthy days was consistently associated with worse physical health outcomes, highlighting the close relationship between mental and physical health. Exercise also appeared to be a protective factor, as individuals who exercised reported fewer unhealthy days and lower odds of frequent distress. Some predictors, such as education and smoking, showed weaker or less consistent associations across the models.
A key limitation of this analysis is the use of cross-sectional survey data, which prevents establishing causal relationships and limits interpretation to associations only. Additionally, the reliance on self-reported data may introduce recall or reporting bias. Important potential confounders not included in the model include access to healthcare and chronic medical conditions, both of which could influence both predictors and physical health outcomes.
The linear regression model provides information on the average change in physically unhealthy days, while the logistic regression model estimates the odds of experiencing frequent physical distress, which is useful for identifying high-risk groups. Linear regression is preferred for continuous outcomes, whereas logistic regression is more appropriate for binary outcomes. Model evaluation also differs, with linear models using R-squared and logistic models relying on measures such as AUC and goodness-of-fit tests.
AI tool (Chat GPT) were used only for debugging. During this assignment, I encountered issues when running regression models and formatting outputs (errors when using regsubsets() and tidy() for model summaries). I used prompts such as: “Why is regsubsets() not running properly in R?” and “How to format logistic regression output with tidy() and confidence intervals?” The AI suggested checking package usage and function arguments. I verified the suggestions by re-running the code, confirming that the models executed correctly, and ensuring that outputs (coefficients, AUC, and tables) matched expectations from course materials.