Part 2: In-Class Lab Activity

EPI 553 — Multiple Linear Regression Lab Due: End of class, March 10, 2026


Instructions

In this lab, you will build and interpret multiple linear regression models using the BRFSS 2020 analytic dataset. Work through each task systematically. You may discuss concepts with classmates, but your written answers and R code must be your own.

Submission: Knit your .Rmd to HTML and upload to Brightspace by end of class.


Data for the Lab

Use the saved analytic dataset from today’s lecture. It contains 5,000 randomly sampled BRFSS 2020 respondents with the following variables:

Variable Description Type
menthlth_days Mentally unhealthy days in past 30 Continuous (0–30)
physhlth_days Physically unhealthy days in past 30 Continuous (0–30)
sleep_hrs Sleep hours per night Continuous (1–14)
age Age in years (capped at 80) Continuous
income_cat Household income (1 = <$10k to 8 = >$75k) Ordinal
sex Sex (Male/Female) Factor
exercise Any physical activity past 30 days (Yes/No) Factor
# Load the dataset
library(dplyr)
library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(ggeffects)
library(ggstats)
library(gtsummary)
library(GGally)
library(car)
library(lmtest)
library(corrplot)
brfss_full <- read_xpt("C:/Users/kekor/Downloads/LLCP2020.XPT") %>%
  clean_names()
# ── Recode and clean ──────────────────────────────────────────────────────────
brfss_mlr <- brfss_full %>%
  mutate(
    # Outcome: mentally unhealthy days in past 30 (88 = none = 0; 77/99 = DK/refused = NA)
    menthlth_days = case_when(
      menthlth == 88              ~ 0,
      menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
      TRUE                        ~ NA_real_
    ),
    # Physical health days (key predictor)
    physhlth_days = case_when(
      physhlth == 88              ~ 0,
      physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
      TRUE                        ~ NA_real_
    ),
    # Sleep hours (practical cap at 14)
    sleep_hrs = case_when(
      sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
      TRUE                        ~ NA_real_
    ),
    # Age (capped at 80 per BRFSS coding)
    age = age80,
    # Income category (ordinal 1–8)
    income_cat = case_when(
      income2 %in% 1:8 ~ as.numeric(income2),
      TRUE             ~ NA_real_
    ),
    # Sex
    sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
    # Exercise in past 30 days (any physical activity)
    exercise = factor(case_when(
      exerany2 == 1 ~ "Yes",
      exerany2 == 2 ~ "No",
      TRUE          ~ NA_character_
    ), levels = c("No", "Yes")),
    # BMI (stored as integer × 100 in BRFSS)
    bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
    # Income as labeled factor (for display)
    income_f = factor(income2, levels = 1:8,
      labels = c("<$10k", "$10-15k", "$15-20k", "$20-25k",
                 "$25-35k", "$35-50k", "$50-75k", ">$75k"))
  ) %>%
  filter(
    !is.na(menthlth_days),
    !is.na(physhlth_days),
    !is.na(sleep_hrs),
    !is.na(age),   age >= 18,
    !is.na(income_cat),
    !is.na(sex),
    !is.na(exercise)
  )

# ── Analytic sample (reproducible random sample) ────────────────────────────
set.seed(553)
brfss_mlr <- brfss_mlr %>%
  select(menthlth_days, physhlth_days, sleep_hrs, age,
         income_cat, income_f, sex, exercise, bmi) %>%
  slice_sample(n = 5000)

# Save for lab activity
saveRDS(brfss_mlr,
  "C:/Users/kekor/Downloads/LLCP2020.XPT")

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_mlr), ncol(brfss_mlr))) %>%
  kable(caption = "Analytic Dataset Dimensions") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Analytic Dataset Dimensions
Metric Value
Observations 5000
Variables 9
brfss_mlr %>%
  select(menthlth_days, physhlth_days, sleep_hrs, age,
         income_f, sex, exercise) %>%
  tbl_summary(
    label = list(
      menthlth_days ~ "Mentally unhealthy days (past 30)",
      physhlth_days ~ "Physically unhealthy days (past 30)",
      sleep_hrs     ~ "Sleep (hours/night)",
      age           ~ "Age (years)",
      income_f      ~ "Household income",
      sex           ~ "Sex",
      exercise      ~ "Any physical activity (past 30 days)"
    ),
    statistic = list(
      all_continuous()  ~ "{mean} ({sd})",
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits  = all_continuous() ~ 1,
    missing = "no"
  ) %>%
  add_n() %>%
  bold_labels() %>%
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2020 Analytic Sample (n = 5,000)**")
Table 1. Descriptive Statistics — BRFSS 2020 Analytic Sample (n = 5,000)
Characteristic N N = 5,0001
Mentally unhealthy days (past 30) 5,000 3.8 (7.7)
Physically unhealthy days (past 30) 5,000 3.3 (7.8)
Sleep (hours/night) 5,000 7.1 (1.3)
Age (years) 5,000 54.3 (17.2)
Household income 5,000
    <$10k
190 (3.8%)
    $10-15k
169 (3.4%)
    $15-20k
312 (6.2%)
    $20-25k
434 (8.7%)
    $25-35k
489 (9.8%)
    $35-50k
683 (14%)
    $50-75k
841 (17%)
    >$75k
1,882 (38%)
Sex 5,000
    Male
2,331 (47%)
    Female
2,669 (53%)
Any physical activity (past 30 days) 5,000 3,874 (77%)
1 Mean (SD); n (%)

1b. Histogram of menthlth_days

ggplot(brfss_mlr, aes(x = menthlth_days)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white", alpha = 0.85) +
  labs(
    title    = "Distribution of Mentally Unhealthy Days in the Past 30 Days",
    subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
    x        = "Number of Mentally Unhealthy Days",
    y        = "Count"
  ) +
  theme_minimal(base_size = 13)

[Your interpretation here: comment on the shape — symmetric, right-skewed, or left-skewed — and what this means for regression modeling.]*

1c. Scatterplot Matrix of Continuous Variables

brfss_mlr %>%
  select(menthlth_days, physhlth_days, sleep_hrs, age) %>%
  rename(
    `Mental Health\nDays`   = menthlth_days,
    `Physical Health\nDays` = physhlth_days,
    `Sleep\n(hrs)`          = sleep_hrs,
    Age                     = age
  ) %>%
  ggpairs(
    lower = list(continuous = wrap("points", alpha = 0.05, size = 0.5)),
    diag  = list(continuous = wrap("densityDiag", fill = "steelblue", alpha = 0.5)),
    upper = list(continuous = wrap("cor", size = 4)),
    title = "Pairwise Relationships Among Continuous Variables (BRFSS 2020)"
  ) +
  theme_minimal(base_size = 11)

[Your interpretation here: comment on the direction and strength of each pairwise correlation with the outcome.]


Task 2: Unadjusted (Simple) Linear Regression

2a. Simple Linear Regression: menthlth_days ~ sleep_hrs

model_slr <- lm(menthlth_days ~ sleep_hrs, data = brfss_mlr)

tidy(model_slr, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~ round(., 4))) %>%
  kable(
    caption   = "Table 2a. Simple Linear Regression: Mental Health Days ~ Sleep Hours",
    col.names = c("Term", "β̂", "SE", "t", "p-value", "CI Low", "CI High")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 2a. Simple Linear Regression: Mental Health Days ~ Sleep Hours
Term β
    S
(Intercept) 9.4743 0.5771 16.4165 0 8.3429 10.6057
sleep_hrs -0.8042 0.0802 -10.0218 0 -0.9616 -0.6469
# Extract coefficients for the fitted equation
b0_slr <- round(coef(model_slr)[1], 3)
b1_slr <- round(coef(model_slr)[2], 3)
cat("Fitted equation:\n")
## Fitted equation:
cat("Predicted Mental Health Days =", b0_slr, "+", b1_slr, "× Sleep Hours\n")
## Predicted Mental Health Days = 9.474 + -0.804 × Sleep Hours

Fitted equation: \(\widehat{\text{Mental Health Days}} = 9.474 + -0.804(\text{Sleep Hours})\)

# Extract t-statistic, p-value, and degrees of freedom
slr_tidy <- tidy(model_slr)
slr_glance <- glance(model_slr)

cat("H0: β_sleep = 0  (sleep hours have no linear association with mental health days)\n")
## H0: β_sleep = 0  (sleep hours have no linear association with mental health days)
cat("HA: β_sleep ≠ 0\n\n")
## HA: β_sleep ≠ 0
cat("t-statistic:", round(slr_tidy$statistic[2], 4), "\n")
## t-statistic: -10.0218
cat("p-value:    ", round(slr_tidy$p.value[2], 4), "\n")
## p-value:     0
cat("df (residual):", slr_glance$df.residual, "\n")
## df (residual): 4998
# Model A: sleep only
model_A <- lm(menthlth_days ~ sleep_hrs, data = brfss_mlr)

# Model B: sleep + age + sex
model_B <- lm(menthlth_days ~ sleep_hrs + age + sex, data = brfss_mlr)

# Model C: full model
model_C <- lm(menthlth_days ~ sleep_hrs + age + sex + physhlth_days + income_cat + exercise,
              data = brfss_mlr)

3b. Comparing the Sleep Coefficient Across Models A, B, C

# Extract sleep coefficient, SE, 95% CI, and p-value for each model
extract_sleep <- function(model, label) {
  t <- tidy(model, conf.int = TRUE) %>% filter(term == "sleep_hrs")
  tibble(
    Model   = label,
    `β̂ (Sleep)` = round(t$estimate, 4),
    SE           = round(t$std.error, 4),
    `95% CI`     = paste0("(", round(t$conf.low, 4), ", ", round(t$conf.high, 4), ")"),
    `p-value`    = round(t$p.value, 4)
  )
}

bind_rows(
  extract_sleep(model_A, "Model A: sleep only"),
  extract_sleep(model_B, "Model B: + age + sex"),
  extract_sleep(model_C, "Model C: full model")
) %>%
  kable(caption = "Table 3b. Sleep Coefficient Across Models A, B, and C") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 3b. Sleep Coefficient Across Models A, B, and C
Model β̂ (Sleep
    S
Model A: sleep only -0.8042 0.0802 (-0.9616, -0.6469) 0
Model B: + age + sex -0.7339 0.0793 (-0.8894, -0.5784) 0
Model C: full model -0.5092 0.0753 (-0.6569, -0.3614) 0

3c. Full Fitted Equation and Coefficient Interpretation for Model C

tidy(model_C, conf.int = TRUE) %>%
  mutate(
    term = dplyr::recode(term,
      "(Intercept)"   = "Intercept",
      "sleep_hrs"     = "Sleep (hours/night)",
      "age"           = "Age (years)",
      "sexFemale"     = "Sex: Female (ref = Male)",
      "physhlth_days" = "Physical health days",
      "income_cat"    = "Income (ordinal 1–8)",
      "exerciseYes"   = "Exercise: Yes (ref = No)"
    ),
    across(where(is.numeric), ~ round(., 4))
  ) %>%
  kable(
    caption   = "Table 3c. Model C — Full Multivariable Regression Results",
    col.names = c("Term", "β̂", "SE", "t", "p-value", "CI Low", "CI High")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 3c. Model C — Full Multivariable Regression Results
Term β
    S
Intercept 12.4755 0.7170 17.4006 0.0000 11.0699 13.8810
Sleep (hours/night) -0.5092 0.0753 -6.7574 0.0000 -0.6569 -0.3614
Age (years) -0.0823 0.0059 -13.8724 0.0000 -0.0939 -0.0707
Sex: Female (ref = Male) 1.2451 0.2023 6.1535 0.0000 0.8484 1.6417
Physical health days 0.2917 0.0136 21.4779 0.0000 0.2650 0.3183
Income (ordinal 1–8) -0.3213 0.0520 -6.1778 0.0000 -0.4233 -0.2194
Exercise: Yes (ref = No) -0.3427 0.2531 -1.3537 0.1759 -0.8389 0.1536
b <- round(coef(model_C), 3)
cat("Fitted equation (Model C):\n")
## Fitted equation (Model C):
cat("Predicted Mental Health Days =",
    b["(Intercept)"], "+",
    b["sleep_hrs"],     "× Sleep +",
    b["age"],           "× Age +",
    b["sexFemale"],     "× Female +",
    b["physhlth_days"], "× Physical Health Days +",
    b["income_cat"],    "× Income +",
    b["exerciseYes"],   "× Exercise (Yes)\n")
## Predicted Mental Health Days = 12.475 + -0.509 × Sleep + -0.082 × Age + 1.245 × Female + 0.292 × Physical Health Days + -0.321 × Income + -0.343 × Exercise (Yes)

[Your plain-language interpretation of every coefficient here.]


Task 4: Model Fit and ANOVA

4a. R² and Adjusted R² Across Models A, B, C

tribble(
  ~Model,                     ~Predictors, ~`R²`,           ~`Adj. R²`,
  "Model A: sleep only",      1,
    round(summary(model_A)$r.squared, 4), round(summary(model_A)$adj.r.squared, 4),
  "Model B: + age + sex",     3,
    round(summary(model_B)$r.squared, 4), round(summary(model_B)$adj.r.squared, 4),
  "Model C: full model",      6,
    round(summary(model_C)$r.squared, 4), round(summary(model_C)$adj.r.squared, 4)
) %>%
  kable(caption = "Table 4a. R² and Adjusted R² Across Models") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(3, bold = TRUE, background = "#EBF5FB")
Table 4a. R² and Adjusted R² Across Models
Model Predictors Adj. R²
Model A: sleep only 1 0.0197 0.0195
Model B: + age + sex 3 0.0504 0.0498
Model C: full model 6 0.1569 0.1559

[Your interpretation here: which model explains the most variance?]

4b. Root MSE for Model C

rmse_C <- round(summary(model_C)$sigma, 2)
cat("Root MSE (Model C):", rmse_C, "mentally unhealthy days\n")
## Root MSE (Model C): 7.09 mentally unhealthy days

[Your practical interpretation here: what does the Root MSE tell you about prediction accuracy?]

4c. ANOVA Table for Model C

# Get ANOVA decomposition
anova_C <- anova(model_C)
print(anova_C)
## Analysis of Variance Table
## 
## Response: menthlth_days
##                 Df Sum Sq Mean Sq  F value    Pr(>F)    
## sleep_hrs        1   5865  5864.8 116.6678 < 2.2e-16 ***
## age              1   6182  6182.2 122.9832 < 2.2e-16 ***
## sex              1   2947  2947.1  58.6266 2.274e-14 ***
## physhlth_days    1  29456 29455.5 585.9585 < 2.2e-16 ***
## income_cat       1   2177  2176.8  43.3031 5.169e-11 ***
## exercise         1     92    92.1   1.8326    0.1759    
## Residuals     4993 250993    50.3                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compute summary ANOVA table values for manual fill-in
glance_C <- glance(model_C)

SSR <- sum(anova_C$`Sum Sq`[1:(nrow(anova_C)-1)])  # model SS
SSE <- anova_C$`Sum Sq`[nrow(anova_C)]              # residual SS
SSY <- SSR + SSE                                      # total SS
df_model    <- glance_C$df
df_residual <- glance_C$df.residual
df_total    <- df_model + df_residual
MSR <- SSR / df_model
MSE <- SSE / df_residual
F_stat <- MSR / MSE

tribble(
  ~Source,    ~df,         ~SS,               ~MS,              ~F,
  "Model",    df_model,    round(SSR, 2),     round(MSR, 2),    round(F_stat, 2),
  "Residual", df_residual, round(SSE, 2),     round(MSE, 2),    NA,
  "Total",    df_total,    round(SSY, 2),     NA,               NA
) %>%
  kable(caption = "Table 4c. ANOVA Summary Table for Model C",
        na = "") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 4c. ANOVA Summary Table for Model C
Source df SS MS F
Model 6 46718.54 7786.42 154.9
Residual 4993 250992.80 50.27 NA
Total 4999 297711.34 NA NA
cat("\nOverall F-statistic:", round(glance_C$statistic, 2),
    "\np-value:", format(glance_C$p.value, scientific = TRUE), "\n")
## 
## Overall F-statistic: 154.9 
## p-value: 6.48049e-181

[State the null hypothesis for the overall F-test and your conclusion here.]


Task 5: Residual Diagnostics

5a. Four Standard Diagnostic Plots for Model C

par(mfrow = c(2, 2))
plot(model_C, which = 1:4,
     col = adjustcolor("steelblue", alpha.f = 0.3), pch = 16)

par(mfrow = c(1, 1))

[Your comments here on what each plot reveals about the LINE assumptions.]

5b. Which Assumptions Are Most Likely Violated?

[Your interpretation here: given that menthlth_days is bounded at 0–30 and heavily right-skewed, discuss which LINE assumptions are most likely violated and whether this invalidates the analysis.]

5c. Observations with Cook’s Distance > 1

model_C_aug <- augment(model_C)

# Count and display observations with Cook's D > 1
influential <- model_C_aug %>%
  filter(.cooksd > 1) %>%
  select(menthlth_days, physhlth_days, sleep_hrs, age, .fitted, .resid, .cooksd)

cat("Number of observations with Cook's Distance > 1:", nrow(influential), "\n\n")
## Number of observations with Cook's Distance > 1: 0
if (nrow(influential) > 0) {
  influential %>%
    mutate(across(where(is.numeric), ~ round(., 3))) %>%
    kable(caption = "Observations with Cook's Distance > 1") %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
} else {
  cat("No observations exceed Cook's Distance of 1.\n")
}
## No observations exceed Cook's Distance of 1.
# Plot Cook's distance for visual reference
model_C_aug %>%
  mutate(obs = row_number()) %>%
  ggplot(aes(x = obs, y = .cooksd)) +
  geom_col(fill = "black", alpha = 0.6) +
  geom_hline(yintercept = 1, color = "red", linetype = "dashed", linewidth = 1) +
  labs(
    title    = "Cook's Distance by Observation (Model C)",
    subtitle = "Red dashed line = threshold of 1",
    x        = "Observation Index",
    y        = "Cook's Distance"
  ) +
  theme_minimal(base_size = 13)

[Your interpretation here: how many influential observations are there and what would you do with them in a real analysis?]


Task 6: Interpretation for a Public Health Audience

[Write your 3–4 sentence paragraph here summarizing the findings from Model C for a non-statistical audience. Remember: identify significant predictors, state direction and magnitude of key associations, caveat the cross-sectional design, and avoid all statistical jargon.]


End of Lab Activity