Setup and Data
library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(car)
library(ggeffects)
library(plotly)
options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))We continue using the Behavioral Risk Factor Surveillance System (BRFSS) 2020. In previous lectures, we modeled mentally unhealthy days as a continuous outcome. Today, we transform this into a binary outcome: frequent mental distress (FMD), defined as reporting 14 or more mentally unhealthy days in the past 30 days. This threshold is used by the CDC as a standard measure of frequent mental distress.
Research question:
What individual, behavioral, and socioeconomic factors are associated with frequent mental distress among U.S. adults?
brfss_full <- read_xpt(
"C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.XPT"
) |>
clean_names()brfss_logistic <- brfss_full |>
mutate(
# Binary outcome: frequent mental distress (>= 14 days)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
fmd = factor(
ifelse(menthlth_days >= 14, 1, 0),
levels = c(0, 1),
labels = c("No", "Yes")
),
# Predictors
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
sleep_hrs = case_when(
sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
TRUE ~ NA_real_
),
age = age80,
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
income_cat = case_when(
income2 %in% 1:8 ~ as.numeric(income2),
TRUE ~ NA_real_
),
smoker = factor(case_when(
smokday2 %in% c(1, 2) ~ "Current",
smokday2 == 3 ~ "Former/Never",
TRUE ~ NA_character_
), levels = c("Former/Never", "Current"))
) |>
filter(
!is.na(fmd), !is.na(physhlth_days), !is.na(sleep_hrs),
!is.na(age), age >= 18, !is.na(sex), !is.na(bmi),
!is.na(exercise), !is.na(income_cat), !is.na(smoker)
)
set.seed(1220)
brfss_logistic <- brfss_logistic |>
dplyr::select(fmd, menthlth_days, physhlth_days, sleep_hrs, age, sex,
bmi, exercise, income_cat, smoker) |>
slice_sample(n = 5000)
# Save for lab use
saveRDS(brfss_logistic,
"C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.rds")
tibble(Metric = c("Observations", "Variables", "FMD Cases", "FMD Prevalence"),
Value = c(nrow(brfss_logistic), ncol(brfss_logistic),
sum(brfss_logistic$fmd == "Yes"),
paste0(round(100 * mean(brfss_logistic$fmd == "Yes"), 1), "%"))) |>
kable(caption = "Analytic Dataset Overview") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Observations | 5000 |
| Variables | 10 |
| FMD Cases | 757 |
| FMD Prevalence | 15.1% |
brfss_logistic |>
tbl_summary(
by = fmd,
include = c(physhlth_days, sleep_hrs, age, sex, bmi, exercise,
income_cat, smoker),
type = list(
c(physhlth_days, sleep_hrs, age, bmi, income_cat) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
physhlth_days ~ "Physical unhealthy days",
sleep_hrs ~ "Sleep hours",
age ~ "Age (years)",
sex ~ "Sex",
bmi ~ "BMI",
exercise ~ "Exercise in past 30 days",
income_cat ~ "Income category (1-8)",
smoker ~ "Smoking status"
)
) |>
add_overall() |>
add_p() |>
bold_labels()| Characteristic | Overall N = 5,0001 |
No N = 4,2431 |
Yes N = 7571 |
p-value |
|---|---|---|---|---|
| Physical unhealthy days | 4 (9) | 3 (8) | 10 (13) | |
| Sleep hours | 7.00 (1.48) | 7.09 (1.40) | 6.51 (1.83) | |
| Age (years) | 56 (16) | 57 (16) | 50 (16) | |
| Sex | ||||
| Male | 2,701 (54%) | 2,378 (56%) | 323 (43%) | |
| Female | 2,299 (46%) | 1,865 (44%) | 434 (57%) | |
| BMI | 28.5 (6.3) | 28.4 (6.2) | 29.3 (7.0) | |
| Exercise in past 30 days | 3,673 (73%) | 3,192 (75%) | 481 (64%) | |
| Income category (1-8) | 5.85 (2.11) | 6.00 (2.04) | 5.05 (2.29) | |
| Smoking status | ||||
| Former/Never | 3,280 (66%) | 2,886 (68%) | 394 (52%) | |
| Current | 1,720 (34%) | 1,357 (32%) | 363 (48%) | |
| 1 Mean (SD); n (%) | ||||
| Variable | Description | Type |
|---|---|---|
fmd |
Frequent mental distress (No/Yes) | Factor (outcome) |
menthlth_days |
Mentally unhealthy days (0-30) | Numeric |
physhlth_days |
Physically unhealthy days (0-30) | Numeric |
sleep_hrs |
Hours of sleep per night | Numeric |
age |
Age in years | Numeric |
sex |
Male / Female | Factor |
bmi |
Body mass index | Numeric |
exercise |
Exercised in past 30 days (No/Yes) | Factor |
income_cat |
Household income category (1-8) | Numeric |
smoker |
Former/Never vs. Current | Factor |
1a. (5 pts) Create a frequency table showing the number and percentage of individuals with and without frequent mental distress.
# Create frequency table (counts)
freq_table <- table(brfss_logistic$fmd)
# Convert to percentages
percent_table <- prop.table(freq_table) * 100
# Combine into one table
result <- data.frame(
Category = names(freq_table),
Count = as.vector(freq_table),
Percentage = round(as.vector(percent_table), 2)
)
# View result
print(result)## Category Count Percentage
## 1 No 4243 84.86
## 2 Yes 757 15.14
1b. (5 pts) Create a descriptive summary table of at
least 4 predictors, stratified by FMD status. Use
tbl_summary().
brfss_logistic |>
tbl_summary(
by = fmd,
include = c(exercise, age, sex, bmi),
type = list(
c(age, bmi) ~ "continuous"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})"
),
label = list(
exercise ~ "Exercise",
age ~ "Age (years)",
sex ~ "Sex",
bmi ~ "BMI"
)
) |>
add_overall() |>
add_p() |>
bold_labels()| Characteristic | Overall N = 5,0001 |
No N = 4,2431 |
Yes N = 7571 |
p-value |
|---|---|---|---|---|
| Exercise | 3,673 (73%) | 3,192 (75%) | 481 (64%) | |
| Age (years) | 56 (16) | 57 (16) | 50 (16) | |
| Sex | ||||
| Male | 2,701 (54%) | 2,378 (56%) | 323 (43%) | |
| Female | 2,299 (46%) | 1,865 (44%) | 434 (57%) | |
| BMI | 28.5 (6.3) | 28.4 (6.2) | 29.3 (7.0) | |
| 1 n (%); Mean (SD) | ||||
1c. (5 pts) Create a bar chart showing the proportion of FMD by exercise status OR smoking status.
ggplot(brfss_logistic, aes(x = fmd, fill = exercise)) +
geom_bar(color = "black", position = "dodge") +
geom_text(stat = "count", aes(label = ..count..),
position = position_dodge(width = 0.9), vjust = -0.5) +
labs(title = "Frequent mental distress by exercise",
x = "Frequent mental distress",
y = "Count") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5)
)2a. (5 pts) Fit a simple logistic regression model predicting FMD from exercise. Report the coefficients on the log-odds scale.
mod_ex <- glm(fmd ~ exercise, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_ex, conf.int = TRUE, exponentiate = FALSE) |>
kable(digits = 3, caption = "Simple Logistic Regression: FMD ~ Exercise (Log-Odds Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | -1.337 | 0.068 | -19.769 | 0 | -1.471 | -1.206 |
| exerciseYes | -0.555 | 0.083 | -6.655 | 0 | -0.718 | -0.391 |
2b. (5 pts) Exponentiate the coefficients to obtain odds ratios with 95% confidence intervals.
mod_ex <- glm(
fmd ~ exercise,
data = brfss_logistic,
family = binomial(link = "logit")
)
tidy(mod_ex, conf.int = TRUE, exponentiate = TRUE) |>
kable(
digits = 3,
caption = "Simple Logistic Regression: FMD ~ Exercise (Odds Ratio Scale)"
) |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.263 | 0.068 | -19.769 | 0 | 0.230 | 0.299 |
| exerciseYes | 0.574 | 0.083 | -6.655 | 0 | 0.488 | 0.676 |
2c. (5 pts) Interpret the odds ratio for exercise in the context of the research question.
The odds ratio for exerciseYes is 0.574(95%CI: 0.488-0.676). This shows us that individuals who exercised in the past 30 days have about 43% lower odds of exerpeincing frequent mental distress compared to those who do not exercise.
2d. (5 pts) Create a plot showing the predicted probability of FMD across levels of a continuous predictor (e.g., age or sleep hours).
mod_age <- glm(fmd ~ age, data = brfss_logistic,
family = binomial(link = "logit"))
ggpredict(mod_age, terms = "age [18:80]") |>
plot() +
labs(title = "Predicted Probability of Frequent Mental Distress by Age",
x = "Age (years)", y = "Predicted Probability of FMD") +
theme_minimal()3a. (5 pts) Fit three separate simple logistic regression models, each with a different predictor of your choice.
mod_smoker <- glm(fmd ~ smoker, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_smoker, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: FMD ~ Smoking (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.137 | 0.054 | -37.076 | 0 | 0.123 | 0.151 |
| smokerCurrent | 1.959 | 0.080 | 8.424 | 0 | 1.675 | 2.291 |
mod_bmi <- glm(fmd ~ bmi, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_bmi, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: FMD ~ BMI (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.094 | 0.178 | -13.299 | 0 | 0.066 | 0.133 |
| bmi | 1.023 | 0.006 | 3.745 | 0 | 1.011 | 1.034 |
mod_age <- glm(fmd ~ age, data = brfss_logistic,
family = binomial(link = "logit"))
tidy(mod_age, conf.int = TRUE, exponentiate = TRUE) |>
kable(digits = 3,
caption = "Simple Logistic Regression: FMD ~ Age (Odds Ratio Scale)") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 0.725 | 0.131 | -2.451 | 0.014 | 0.559 | 0.936 |
| age | 0.974 | 0.002 | -10.703 | 0.000 | 0.970 | 0.979 |
3b. (10 pts) Create a table comparing the odds ratios from all three models.
m1 <- glm(fmd ~ age, family = binomial, data = brfss_logistic)
m2 <- glm(fmd ~ bmi, family = binomial, data = brfss_logistic)
m3 <- glm(fmd ~ smoker, family = binomial, data = brfss_logistic)
tribble(
~Model, ~`Odds Ratio`, ~`95% CI`,
"M1 (age)",
round(exp(coef(m1)[2]), 3),
paste0("(", round(exp(confint(m1)[2,1]),3), ", ", round(exp(confint(m1)[2,2]),3), ")"),
"M2 (bmi)",
round(exp(coef(m2)[2]), 3),
paste0("(", round(exp(confint(m2)[2,1]),3), ", ", round(exp(confint(m2)[2,2]),3), ")"),
"M3 (smoker)",
round(exp(coef(m3)[2]), 3),
paste0("(", round(exp(confint(m3)[2,1]),3), ", ", round(exp(confint(m3)[2,2]),3), ")"),
) |>
kable(caption = "Comparing 3 Logistic Regression Models") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
row_spec(0, bold = TRUE)| Model | Odds Ratio | 95% CI |
|---|---|---|
| M1 (age) | 0.974 | (0.97, 0.979) |
| M2 (bmi) | 1.023 | (1.011, 1.034) |
| M3 (smoker) | 1.959 | (1.675, 2.291) |
3c. (5 pts) Which predictor has the strongest crude association with FMD? Justify your answer.
Among the three predictors, smoking has the strongest association with FMD. The OR is 1.959, meaning that current smokers have nearly twice the odds of experiencing frequent mental distress compared to non-smokers. Age and BMI have a weaker association but are still statistically significant.
4a. (5 pts) Fit a multiple logistic regression model predicting FMD from at least 3 predictors.
4b. (5 pts) Report the adjusted odds ratios using
tbl_regression().
mod_multi <- glm(fmd ~ age + bmi + smoker,
data = brfss_logistic,
family = binomial(link = "logit"))
mod_multi |>
tbl_regression(
exponentiate = TRUE,
label = list(
age ~ "Age",
bmi ~ "BMI",
smoker ~ "Smoking status"
)
) |>
bold_labels() |>
bold_p()| Characteristic | OR | 95% CI | p-value |
|---|---|---|---|
| Age | 0.98 | 0.97, 0.98 | <0.001 |
| BMI | 1.02 | 1.01, 1.03 | <0.001 |
| Smoking status | |||
| Former/Never | — | — | |
| Current | 1.67 | 1.42, 1.97 | <0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio | |||
4c. (5 pts) For one predictor, compare the crude OR (from Task 3) with the adjusted OR (from Task 4). Show both values.
# Crude ORs
crude_smoker <- tidy(mod_smoker, exponentiate = TRUE, conf.int = TRUE) |>
filter(term == "smokerCurrent") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(type = "Crude")
# Adjusted ORs
adj_smoker <- tidy(mod_multi, exponentiate = TRUE, conf.int = TRUE) |>
filter(term == "smokerCurrent") |>
dplyr::select(term, estimate, conf.low, conf.high) |>
mutate(type = "Adjusted")
bind_rows(crude_smoker, adj_smoker) |>
mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 3))) |>
kable(col.names = c("Predictor", "OR", "95% CI Lower", "95% CI Upper", "Type"),
caption = "Crude vs. Adjusted Odds Ratios") |>
kable_styling(bootstrap_options = "striped", full_width = FALSE)| Predictor | OR | 95% CI Lower | 95% CI Upper | Type |
|---|---|---|---|---|
| smokerCurrent | 1.959 | 1.675 | 2.291 | Crude |
| smokerCurrent | 1.672 | 1.421 | 1.967 | Adjusted |
4d. (5 pts) In 2-3 sentences, assess whether confounding is present for the predictor you chose. Which direction did the OR change, and what does this mean?
Confounding is present for smoking status. The crude ratio for current smokers is 1.959, but after adjusting for age and BMI, the adjusted odds ratio decreased to 1.672. The shows that part of the association between smoking and FMD is explained by other predictors. The direction of the OR decreased showing us that age and BMI are positive confounders.
Completion credit (25 points): Awarded for a complete, good-faith attempt at all tasks. Total: 75 + 25 = 100 points.
End of Lab Activity