knitr::opts_chunk$set(
  echo    = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.width  = 10,
  fig.height = 6,
  cache  = FALSE
)

Setup and Data

library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(broom)
library(gtsummary)
library(GGally)
library(car)
library(ggeffects)
library(plotly)
library(lmtest)
library(dplyr)

options(gtsummary.use_ftExtra = TRUE)
set_gtsummary_theme(theme_gtsummary_compact(set_theme = TRUE))

Preparing the Dataset

We continue with the BRFSS 2020 dataset. For this lecture we shift our outcome to physically unhealthy days and examine sleep as the primary exposure, with sex, education, age, exercise, and general health as potential modifiers and confounders.

brfss_full <- read_xpt(
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.XPT"
) |>
  clean_names()

brfss_ci <- brfss_full |>
  mutate(
    # Outcome: physically unhealthy days in past 30
    physhlth_days = case_when(
      physhlth == 88                    ~ 0,
      physhlth >= 1 & physhlth <= 30   ~ as.numeric(physhlth),
      TRUE                             ~ NA_real_
    ),
    # Primary exposure: sleep hours
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14   ~ as.numeric(sleptim1),
      TRUE                             ~ NA_real_
    ),
    # Mentally unhealthy days (covariate)
    menthlth_days = case_when(
      menthlth == 88                    ~ 0,
      menthlth >= 1 & menthlth <= 30   ~ as.numeric(menthlth),
      TRUE                             ~ NA_real_
    ),
    # Age
    age = age80,
    # Sex
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    # Education (4-level)
    education = factor(case_when(
      educa %in% c(1, 2, 3) ~ "Less than HS",
      educa == 4             ~ "HS graduate",
      educa == 5             ~ "Some college",
      educa == 6             ~ "College graduate",
      TRUE                   ~ NA_character_
    ), levels = c("Less than HS", "HS graduate", "Some college", "College graduate")),
    # Exercise in past 30 days
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    # General health status
    gen_health = factor(case_when(
      genhlth == 1 ~ "Excellent",
      genhlth == 2 ~ "Very good",
      genhlth == 3 ~ "Good",
      genhlth == 4 ~ "Fair",
      genhlth == 5 ~ "Poor",
      TRUE         ~ NA_character_
    ), levels = c("Excellent", "Very good", "Good", "Fair", "Poor")),
    # Income category (ordinal 1-8)
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    )
  ) |>
  filter(
    !is.na(physhlth_days),
    !is.na(sleep_hrs),
    !is.na(menthlth_days),
    !is.na(age), age >= 18,
    !is.na(sex),
    !is.na(education),
    !is.na(exercise),
    !is.na(gen_health),
    !is.na(income_cat)
  )

# Reproducible random sample
set.seed(1220)
brfss_ci <- brfss_ci |>
  select(physhlth_days, sleep_hrs, menthlth_days, age, sex,
         education, exercise, gen_health, income_cat) |>
  slice_sample(n = 5000)

# Save for lab activity
saveRDS(brfss_ci,
  "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/BRFSS_2020.rds")

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_ci), ncol(brfss_ci))) |>
  kable(caption = "Analytic Dataset Dimensions") |>
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Dimensions
Metric	Value
Observations	5000
Variables	9

In-Class Lab Activity

Data for the Lab

Use the saved analytic dataset from today’s lecture. It contains 5,000 randomly sampled BRFSS 2020 respondents.

Variable	Description	Type
`physhlth_days`	Physically unhealthy days in past 30	Continuous (0–30)
`sleep_hrs`	Sleep hours per night	Continuous (1–14)
`menthlth_days`	Mentally unhealthy days in past 30	Continuous (0–30)
`age`	Age in years (capped at 80)	Continuous
`sex`	Sex (Male/Female)	Factor
`education`	Education level (4 categories)	Factor
`exercise`	Any physical activity (Yes/No)	Factor
`gen_health`	General health status (5 categories)	Factor
`income_cat`	Household income (1–8 ordinal)	Numeric

Task 1: Exploratory Data Analysis (15 points)

1a. (5 pts) Create a scatterplot of physhlth_days (y-axis) vs. age (x-axis), colored by exercise status. Add separate regression lines for each group. Describe the pattern you observe.

The plot shows a positive relationship between age and physically unhealthy days. As age increase the number of reported unhealthy days increases.

p1 <- ggplot(brfss_ci, aes(x = age, y = physhlth_days, fill = exercise)) +
  geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
  labs(
    title = "Relationship Between Age and physhlth_days",
    subtitle = "Simple Linear Regression",
    x = "Age (years)",
    y = "Probability of physhlth_days"
  ) +
  theme_minimal(base_size = 12)

ggplotly(p1) |>
  layout(hovermode = "closest")

1b. (5 pts) Compute the mean physhlth_days for each combination of sex and exercise. Present the results in a table. Does it appear that the association between exercise and physical health days might differ by sex?

The association between exercise and physical health days might differ slightly with men having a slightly lower mean in both yes and no categories for exercise than women.

summary_stats <- brfss_ci |>
  group_by(sex, exercise) |>
  summarise(
    n = n(),
    Mean = mean(physhlth_days, na.rm = TRUE),
    SD = sd(physhlth_days, na.rm = TRUE),
    Median = median(physhlth_days, na.rm = TRUE),
    Min = min(physhlth_days, na.rm = TRUE),
    Max = max(physhlth_days, na.rm = TRUE)
  )

summary_stats |>
  kable(digits = 3, 
        caption = "Descriptive statistics for physical health by sex and exercise")

Descriptive statistics for physical health by sex and exercise
sex	exercise	n	Mean	SD	Max
Male	No	446	6.643	10.983	30
Male	Yes	1845	2.324	6.488	30
Female	No	625	7.430	11.460	30
Female	Yes	2084	2.451	6.322	30

1c. (5 pts) Create a scatterplot of physhlth_days vs. sleep_hrs, faceted by education level. Comment on whether the slopes appear similar or different across education groups.

The slopes appear different across educational groups with less than HS having a steeper slope compared to the other groups.

p2 <- ggplot(brfss_ci, aes(x = sleep_hrs, y = physhlth_days, fill = education)) +
  geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
  labs(
    title = "Relationship Between sleep_hrs and physhlth_day, faceted by education level",
    subtitle = "Simple Linear Regression",
    x = "sleep_hrs (hours)",
    y = "Probability of physhlth_days"
  ) +
  theme_minimal(base_size = 12)

ggplotly(p2) |>
  layout(hovermode = "closest")

Task 2: Stratified Analysis for Interaction (15 points)

2a. (5 pts) Fit separate simple linear regression models of physhlth_days ~ age for exercisers and non-exercisers. Report the slope, SE, and 95% CI for age in each stratum.

# Fit separate models for yes and no of exercise
mod_no <- lm(physhlth_days ~ age, data = brfss_ci |> filter(exercise == "No"))
mod_yes <- lm(physhlth_days ~ age, data = brfss_ci |> filter(exercise == "Yes"))

# Compare coefficients
bind_rows(
  tidy(mod_no, conf.int = TRUE) |> mutate(Stratum = "No"),
  tidy(mod_yes, conf.int = TRUE) |> mutate(Stratum = "Yes")
) |>
  filter(term == "age") |>
  select(Stratum, estimate, std.error, conf.low, conf.high, p.value) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Stratified Analysis: Age → Physical Health Days, by exercise",
    col.names = c("Stratum", "Estimate", "SE", "CI Lower", "CI Upper", "p-value")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Stratified Analysis: Age → Physical Health Days, by exercise
Stratum	Estimate	SE	CI Lower	CI Upper	p-value
No	0.0745	0.0212	0.0328	0.1161	0.0005
Yes	0.0192	0.0060	0.0075	0.0309	0.0013

2b. (5 pts) Create a single plot showing the two fitted regression lines (one per exercise group) overlaid on the data. Are the lines approximately parallel?

Yes they are approximately parallel.

ggplot(brfss_ci, aes(x = age, y = physhlth_days, color = exercise)) +
  geom_jitter(alpha = 0.08, width = 0.3, height = 0.3, size = 0.8) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1.2) +
  labs(
    title    = "Association Between Age and Physical Health Days, Stratified by Excercise",
    subtitle = "Are the slopes parallel or different?",
    x        = "Age",
    y        = "Physically Unhealthy Days (Past 30)",
    color    = "Excercise"
  ) +
  theme_minimal(base_size = 13) +
  scale_color_brewer(palette = "Set1")

2c. (5 pts) Can you formally test whether the two slopes are different using only the stratified results? Explain why or why not.

No, you cannot formally test whether the two slopes differ using only the stratified regression results because it doesn’t show the p-value and you cant determine if the results are significant.

Task 3: Interaction via Regression (25 points)

3a. (5 pts) Fit the interaction model: physhlth_days ~ age * exercise. Write out the fitted equation.

Physhlth_days= 2.7610 + 0.0745(age)-1.3858(exerciseYes)- 0.0553(age:exerciseYes)

crude_model <- lm(physhlth_days ~ exercise, data = brfss_ci)

mod_add <- lm(physhlth_days ~ age + exercise, data = brfss_ci)

mod_int <- lm(physhlth_days ~ age * exercise, data = brfss_ci)

tidy(mod_int, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Interaction Model: Age * Exercise → Physical Health Days",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Interaction Model: Age * Exercise → Physical Health Days
Term	Estimate	SE	t	p-value	CI Lower	CI Upper
(Intercept)	2.7610	0.8798	3.1381	0.0017	1.0361	4.4858
age	0.0745	0.0145	5.1203	0.0000	0.0460	0.1030
exerciseYes	-1.3858	0.9664	-1.4340	0.1516	-3.2804	0.5087
age:exerciseYes	-0.0553	0.0162	-3.4072	0.0007	-0.0871	-0.0235

3b. (5 pts) Using the fitted equation, derive the stratum-specific equations for exercisers and non-exercisers. Verify that these match the stratified analysis from Task 2. For exercisers: Physhlth_days= 2.7610 + 0.0745(age)-1.3858(exerciseYes)- 0.0553(age:exerciseYes)

For non-exercisers: Physhlth_days= 2.7610 + 0.0745(age)

These match the stratified models with an interaction term of -0.0553 in magnitude.

3c. (5 pts) Conduct the t-test for the interaction term (age:exerciseYes). State the null and alternative hypotheses, report the test statistic and p-value, and state your conclusion about whether interaction is present.

Null hypothesis: There is no interaction between age and exercise.

Alternative Hypothesis: There is an interaction between age and exercise.

T-test: -3.407 p-value: 7e-04

There is strong evidence that the realtionship between age and exercise is statistically significant because the p-value is less than 0.05. We can reject the null hypothesis.

int_term <- tidy(mod_int) |> filter(term == "age:exerciseYes")
cat("Interaction term (age:exerciseYes):\n")

## Interaction term (age:exerciseYes):

cat("  Estimate:", round(int_term$estimate, 4), "\n")

##   Estimate: -0.0553

cat("  t-statistic:", round(int_term$statistic, 3), "\n")

##   t-statistic: -3.407

cat("  p-value:", round(int_term$p.value, 4), "\n")

##   p-value: 7e-04

3d. (5 pts) Now fit a model with an interaction between age and education: physhlth_days ~ age * education. How many interaction terms are produced? Use a partial F-test to test whether the age \(\times\) education interaction as a whole is significant.

The interaction between age and education produces 3 interaction terms. The interaction is not significant as a whole because the p-value is 0.6818 which is greater than 0.05.

mod_int_edu <- lm(physhlth_days ~ age * education, data = brfss_ci)

mod_no_int_edu <- lm(physhlth_days ~ age + education, data = brfss_ci)

anova(mod_no_int_edu, mod_int_edu) |>
  tidy() |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(caption = "Partial F-test: Is the Age x Education Interaction Significant?") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Partial F-test: Is the Age x Education Interaction Significant?
term	df.residual	rss	df	sumsq	statistic	p.value
physhlth_days ~ age + education	4995	306764.2	NA	NA	NA	NA
physhlth_days ~ age * education	4992	306671.9	3	92.2713	0.5007	0.6818

3e. (5 pts) Create a visualization using ggpredict() showing the predicted physhlth_days by age for each education level. Do the lines appear parallel?

The lines are somewhat parallell but not perfectly parallell.

mod_int_edu <- lm(physhlth_days ~ age * education, data = brfss_ci)

pred_int <- ggpredict(mod_int_edu, terms = c("age", "education"))

ggplot(pred_int, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(linewidth = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.12, color = NA) +
  labs(
    title    = "Predicted Physical Health Days by Age and Education Level",
    subtitle = "From interaction model: age * education",
    x        = "Age (years)",
    y        = "Predicted Physically Unhealthy Days",
    color    = "Education",
    fill     = "Education"
  ) +
  theme_minimal(base_size = 13) +
  scale_color_brewer(palette = "Set1")

Task 4: Confounding Assessment (25 points)

For this task, the exposure is exercise and the outcome is physhlth_days.

4a. (5 pts) Fit the crude model: physhlth_days ~ exercise. Report the exercise coefficient. This is the unadjusted estimate.

The coefficient is -4.71.

# Crude model
mod_crude <- lm(physhlth_days ~ exercise, data = brfss_ci)

tidy(crude_model) |>
  filter(str_detect(term, "exercise"))

## # A tibble: 1 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 exerciseYes    -4.71     0.266     -17.7 2.46e-68

4b. (10 pts) Systematically assess whether each of the following is a confounder of the exercise-physical health association: age, sex, sleep_hrs, education, and income_cat. For each:

Fit the model physhlth_days ~ exercise + [covariate]
Report the adjusted exercise coefficient
Compute the percent change from the crude estimate
Apply the 10% rule to determine if the variable is a confounder

Present your results in a single summary table.

# Crude model
b_crude_val <- coef(mod_crude)["exerciseYes"]

# One-at-a-time adjusted models
confounders <- list(
  "Age"             = lm(physhlth_days ~ exercise + age, data = brfss_ci),
  "sleep"        = lm(physhlth_days ~ exercise + sleep_hrs, data = brfss_ci),
  "Education"       = lm(physhlth_days ~ exercise + education, data = brfss_ci),
  "Sex"             = lm(physhlth_days ~ exercise + sex, data = brfss_ci),
  "Income"          = lm(physhlth_days ~ exercise + income_cat, data = brfss_ci)
)

conf_table <- map_dfr(names(confounders), \(name) {
  mod <- confounders[[name]]
  b_adj_val <- coef(mod)["exerciseYes"]
  tibble(
    Covariate = name,
    `Crude β (exercise)` = round(b_crude_val, 4),
    `Adjusted β (exercise)` = round(b_adj_val, 4),
    `% Change` = round(abs(b_crude_val - b_adj_val) / abs(b_crude_val) * 100, 1),
    Confounder = ifelse(abs(b_crude_val - b_adj_val) / abs(b_crude_val) * 100 > 10,
                        "Yes (>10%)", "No")
  )
})

conf_table |>
  kable(caption = "Systematic Confounding Assessment: One-at-a-Time Addition") |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
  column_spec(5, bold = TRUE)

Systematic Confounding Assessment: One-at-a-Time Addition
Covariate	Crude β (exercise)	Adjusted β (exercise)	% Change	Confounder
Age	-4.7115	-4.5504	3.4	No
sleep	-4.7115	-4.6831	0.6	No
Education	-4.7115	-4.3912	6.8	No
Sex	-4.7115	-4.6974	0.3	No
Income	-4.7115	-3.9406	16.4	Yes (>10%)

4c. (5 pts) Fit a fully adjusted model including exercise and all identified confounders. Report the exercise coefficient and compare it to the crude estimate. How much did the estimate change overall?

The crude exercise coefficient is -4.71. The adjusted estimate is -3.94. The estimated effect of exercise decreased by about 0.77 days, suggesting the crude assocation was explained by income.

mod_income <- lm(physhlth_days ~ exercise + income_cat, data = brfss_ci)

tidy(mod_income, conf.int = TRUE) |>
  mutate(across(where(is.numeric), \(x) round(x, 4))) |>
  kable(
    caption = "Interaction Model: Income * Exercise → Physical Health Days",
    col.names = c("Term", "Estimate", "SE", "t", "p-value", "CI Lower", "CI Upper")
  ) |>
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Interaction Model: Income * Exercise → Physical Health Days
Term	Estimate	SE	t	CI Lower	CI Upper
(Intercept)	10.6363	0.3619	29.3924	9.9269	11.3457
exerciseYes	-3.9406	0.2684	-14.6842	-4.4667	-3.4145
income_cat	-0.6750	0.0531	-12.7135	-0.7790	-0.5709

4d. (5 pts) Is gen_health a confounder or a mediator of the exercise-physical health relationship? Could it be both? Explain your reasoning with reference to the three conditions for confounding and the concept of the causal pathway.

General health can be both a confounder and a mediator, depending on whether it represents pre-existing health status or health affected by exercise.

Task 5: Public Health Interpretation (20 points)

5a. (10 pts) Based on your analyses, write a 4–5 sentence paragraph for a public health audience summarizing:

Whether the association between exercise and physical health differs by any subgroup (interaction assessment)
Which variables confounded the exercise-physical health association
The direction and approximate magnitude of the adjusted exercise effect
Appropriate caveats about cross-sectional data and potential unmeasured confounding

Do not use statistical jargon.

The relationship between age and physical health differed by exercise status but not by education level. The age-exercise t-test gave us a result of -3.41 and a p-value of 0.001 showing that the way age relates to health changes depending on whether someone exercises or not.In addition, in 1b, the descriptive results show the benefit of exercise between both male and females who exercised and they report fewer days of poor physical health with about 2-2.5 days on average, in comparison to those who did not exercise with about 6-7 more physically unhealthy days. On the other hand, age-education f-test showed a value of 0.50 and a p-value of 0.68, suggesting there’s no difference. In addition, the scatterplot in 1c shows fairly parallel lines indicating that education does not change the relationship.Income was the only factor that meaningfully influenced the relationship between exercise and physical health, indicating some confounding. After accounting for this, people who exercised still reported about 3-4 fewer days of poor physical health. However, due to the results being based on a cross sectional data we cannot determine whether exercise leads to better health or whether another those who are healthy are able to exercise more. Additionally, other unmeasured factors may be influencing the results.

5b. (10 pts) A colleague suggests including gen_health as a covariate in the final model because it changes the exercise coefficient by more than 10%. You disagree. Write a 3–4 sentence argument explaining why adjusting for general health may not be appropriate if the goal is to estimate the total effect of exercise on physical health days. Use the concept of mediation in your argument.

I would not include general health as a covariate if the goal is to estimate the total effect of exercise because it may lie on the causal pathway. If exercise improves general health, and better general health reduces physically unhealthy days, then adjusting for it would remove part of the effect we are trying to measure. The would lead to an underestimation of the overall benefit of exercise. For this exact reason, we should not control for general health because it can block part of the causal pathway rather than accounting for underlying differences.

Confounding and Interactions in Regression

EPI 553 — Principles of Statistical Inference II (Spring 2026)

Grace Beal

March 28, 2026