EPI 553 — Multiple Linear Regression Lab Due: End of class, March 10, 2026
In this lab, you will build and interpret multiple linear regression models using the BRFSS 2020 analytic dataset. Work through each task systematically. You may discuss concepts with classmates, but your written answers and R code must be your own.
Submission: Knit your .Rmd to HTML and upload to Brightspace by end of class.
Use the saved analytic dataset from today’s lecture. It contains 5,000 randomly sampled BRFSS 2020 respondents with the following variables:
| Variable | Description | Type |
|---|---|---|
menthlth_days |
Mentally unhealthy days in past 30 | Continuous (0–30) |
physhlth_days |
Physically unhealthy days in past 30 | Continuous (0–30) |
sleep_hrs |
Sleep hours per night | Continuous (1–14) |
age |
Age in years (capped at 80) | Continuous |
income_cat |
Household income (1 = <$10k to 8 = >$75k) | Ordinal |
sex |
Sex (Male/Female) | Factor |
exercise |
Any physical activity past 30 days (Yes/No) | Factor |
# Load the dataset
library(dplyr)
library(tidyverse)
library(haven)
library(janitor)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(ggeffects)
library(ggstats)
library(gtsummary)
library(GGally)
library(car)
library(lmtest)
library(corrplot)
brfss_full <- read_xpt("C:/Users/kekor/Downloads/LLCP2020.XPT") %>%
clean_names()
# ── Recode and clean ──────────────────────────────────────────────────────────
brfss_mlr <- brfss_full %>%
mutate(
# Outcome: mentally unhealthy days in past 30 (88 = none = 0; 77/99 = DK/refused = NA)
menthlth_days = case_when(
menthlth == 88 ~ 0,
menthlth >= 1 & menthlth <= 30 ~ as.numeric(menthlth),
TRUE ~ NA_real_
),
# Physical health days (key predictor)
physhlth_days = case_when(
physhlth == 88 ~ 0,
physhlth >= 1 & physhlth <= 30 ~ as.numeric(physhlth),
TRUE ~ NA_real_
),
# Sleep hours (practical cap at 14)
sleep_hrs = case_when(
sleptim1 >= 1 & sleptim1 <= 14 ~ as.numeric(sleptim1),
TRUE ~ NA_real_
),
# Age (capped at 80 per BRFSS coding)
age = age80,
# Income category (ordinal 1–8)
income_cat = case_when(
income2 %in% 1:8 ~ as.numeric(income2),
TRUE ~ NA_real_
),
# Sex
sex = factor(sexvar, levels = c(1, 2), labels = c("Male", "Female")),
# Exercise in past 30 days (any physical activity)
exercise = factor(case_when(
exerany2 == 1 ~ "Yes",
exerany2 == 2 ~ "No",
TRUE ~ NA_character_
), levels = c("No", "Yes")),
# BMI (stored as integer × 100 in BRFSS)
bmi = ifelse(bmi5 > 0, bmi5 / 100, NA_real_),
# Income as labeled factor (for display)
income_f = factor(income2, levels = 1:8,
labels = c("<$10k", "$10-15k", "$15-20k", "$20-25k",
"$25-35k", "$35-50k", "$50-75k", ">$75k"))
) %>%
filter(
!is.na(menthlth_days),
!is.na(physhlth_days),
!is.na(sleep_hrs),
!is.na(age), age >= 18,
!is.na(income_cat),
!is.na(sex),
!is.na(exercise)
)
# ── Analytic sample (reproducible random sample) ────────────────────────────
set.seed(553)
brfss_mlr <- brfss_mlr %>%
select(menthlth_days, physhlth_days, sleep_hrs, age,
income_cat, income_f, sex, exercise, bmi) %>%
slice_sample(n = 5000)
# Save for lab activity
saveRDS(brfss_mlr,
"C:/Users/kekor/Downloads/LLCP2020.XPT")
tibble(Metric = c("Observations", "Variables"),
Value = c(nrow(brfss_mlr), ncol(brfss_mlr))) %>%
kable(caption = "Analytic Dataset Dimensions") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Metric | Value |
|---|---|
| Observations | 5000 |
| Variables | 9 |
brfss_mlr %>%
select(menthlth_days, physhlth_days, sleep_hrs, age,
income_f, sex, exercise) %>%
tbl_summary(
label = list(
menthlth_days ~ "Mentally unhealthy days (past 30)",
physhlth_days ~ "Physically unhealthy days (past 30)",
sleep_hrs ~ "Sleep (hours/night)",
age ~ "Age (years)",
income_f ~ "Household income",
sex ~ "Sex",
exercise ~ "Any physical activity (past 30 days)"
),
statistic = list(
all_continuous() ~ "{mean} ({sd})",
all_categorical() ~ "{n} ({p}%)"
),
digits = all_continuous() ~ 1,
missing = "no"
) %>%
add_n() %>%
bold_labels() %>%
modify_caption("**Table 1. Descriptive Statistics — BRFSS 2020 Analytic Sample (n = 5,000)**")
| Characteristic | N | N = 5,0001 |
|---|---|---|
| Mentally unhealthy days (past 30) | 5,000 | 3.8 (7.7) |
| Physically unhealthy days (past 30) | 5,000 | 3.3 (7.8) |
| Sleep (hours/night) | 5,000 | 7.1 (1.3) |
| Age (years) | 5,000 | 54.3 (17.2) |
| Household income | 5,000 | |
| <$10k | 190 (3.8%) | |
| $10-15k | 169 (3.4%) | |
| $15-20k | 312 (6.2%) | |
| $20-25k | 434 (8.7%) | |
| $25-35k | 489 (9.8%) | |
| $35-50k | 683 (14%) | |
| $50-75k | 841 (17%) | |
| >$75k | 1,882 (38%) | |
| Sex | 5,000 | |
| Male | 2,331 (47%) | |
| Female | 2,669 (53%) | |
| Any physical activity (past 30 days) | 5,000 | 3,874 (77%) |
| 1 Mean (SD); n (%) | ||
menthlth_daysggplot(brfss_mlr, aes(x = menthlth_days)) +
geom_histogram(binwidth = 1, fill = "steelblue", color = "white", alpha = 0.85) +
labs(
title = "Distribution of Mentally Unhealthy Days in the Past 30 Days",
subtitle = "BRFSS 2020 Analytic Sample (n = 5,000)",
x = "Number of Mentally Unhealthy Days",
y = "Count"
) +
theme_minimal(base_size = 13)
[Your interpretation here: comment on the shape — symmetric, right-skewed, or left-skewed — and what this means for regression modeling.]*
brfss_mlr %>%
select(menthlth_days, physhlth_days, sleep_hrs, age) %>%
rename(
`Mental Health\nDays` = menthlth_days,
`Physical Health\nDays` = physhlth_days,
`Sleep\n(hrs)` = sleep_hrs,
Age = age
) %>%
ggpairs(
lower = list(continuous = wrap("points", alpha = 0.05, size = 0.5)),
diag = list(continuous = wrap("densityDiag", fill = "steelblue", alpha = 0.5)),
upper = list(continuous = wrap("cor", size = 4)),
title = "Pairwise Relationships Among Continuous Variables (BRFSS 2020)"
) +
theme_minimal(base_size = 11)
[Your interpretation here: comment on the direction and strength of each pairwise correlation with the outcome.]
menthlth_days ~ sleep_hrsmodel_slr <- lm(menthlth_days ~ sleep_hrs, data = brfss_mlr)
tidy(model_slr, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~ round(., 4))) %>%
kable(
caption = "Table 2a. Simple Linear Regression: Mental Health Days ~ Sleep Hours",
col.names = c("Term", "β̂", "SE", "t", "p-value", "CI Low", "CI High")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | β |
S
| ||||
|---|---|---|---|---|---|---|
| (Intercept) | 9.4743 | 0.5771 | 16.4165 | 0 | 8.3429 | 10.6057 |
| sleep_hrs | -0.8042 | 0.0802 | -10.0218 | 0 | -0.9616 | -0.6469 |
# Extract coefficients for the fitted equation
b0_slr <- round(coef(model_slr)[1], 3)
b1_slr <- round(coef(model_slr)[2], 3)
cat("Fitted equation:\n")
## Fitted equation:
cat("Predicted Mental Health Days =", b0_slr, "+", b1_slr, "× Sleep Hours\n")
## Predicted Mental Health Days = 9.474 + -0.804 × Sleep Hours
Fitted equation: \(\widehat{\text{Mental Health Days}} = 9.474 + -0.804(\text{Sleep Hours})\)
# Extract t-statistic, p-value, and degrees of freedom
slr_tidy <- tidy(model_slr)
slr_glance <- glance(model_slr)
cat("H0: β_sleep = 0 (sleep hours have no linear association with mental health days)\n")
## H0: β_sleep = 0 (sleep hours have no linear association with mental health days)
cat("HA: β_sleep ≠ 0\n\n")
## HA: β_sleep ≠ 0
cat("t-statistic:", round(slr_tidy$statistic[2], 4), "\n")
## t-statistic: -10.0218
cat("p-value: ", round(slr_tidy$p.value[2], 4), "\n")
## p-value: 0
cat("df (residual):", slr_glance$df.residual, "\n")
## df (residual): 4998
# Model A: sleep only
model_A <- lm(menthlth_days ~ sleep_hrs, data = brfss_mlr)
# Model B: sleep + age + sex
model_B <- lm(menthlth_days ~ sleep_hrs + age + sex, data = brfss_mlr)
# Model C: full model
model_C <- lm(menthlth_days ~ sleep_hrs + age + sex + physhlth_days + income_cat + exercise,
data = brfss_mlr)
# Extract sleep coefficient, SE, 95% CI, and p-value for each model
extract_sleep <- function(model, label) {
t <- tidy(model, conf.int = TRUE) %>% filter(term == "sleep_hrs")
tibble(
Model = label,
`β̂ (Sleep)` = round(t$estimate, 4),
SE = round(t$std.error, 4),
`95% CI` = paste0("(", round(t$conf.low, 4), ", ", round(t$conf.high, 4), ")"),
`p-value` = round(t$p.value, 4)
)
}
bind_rows(
extract_sleep(model_A, "Model A: sleep only"),
extract_sleep(model_B, "Model B: + age + sex"),
extract_sleep(model_C, "Model C: full model")
) %>%
kable(caption = "Table 3b. Sleep Coefficient Across Models A, B, and C") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Model | β̂ (Sleep |
S
| ||
|---|---|---|---|---|
| Model A: sleep only | -0.8042 | 0.0802 | (-0.9616, -0.6469) | 0 |
| Model B: + age + sex | -0.7339 | 0.0793 | (-0.8894, -0.5784) | 0 |
| Model C: full model | -0.5092 | 0.0753 | (-0.6569, -0.3614) | 0 |
tidy(model_C, conf.int = TRUE) %>%
mutate(
term = dplyr::recode(term,
"(Intercept)" = "Intercept",
"sleep_hrs" = "Sleep (hours/night)",
"age" = "Age (years)",
"sexFemale" = "Sex: Female (ref = Male)",
"physhlth_days" = "Physical health days",
"income_cat" = "Income (ordinal 1–8)",
"exerciseYes" = "Exercise: Yes (ref = No)"
),
across(where(is.numeric), ~ round(., 4))
) %>%
kable(
caption = "Table 3c. Model C — Full Multivariable Regression Results",
col.names = c("Term", "β̂", "SE", "t", "p-value", "CI Low", "CI High")
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Term | β |
S
| ||||
|---|---|---|---|---|---|---|
| Intercept | 12.4755 | 0.7170 | 17.4006 | 0.0000 | 11.0699 | 13.8810 |
| Sleep (hours/night) | -0.5092 | 0.0753 | -6.7574 | 0.0000 | -0.6569 | -0.3614 |
| Age (years) | -0.0823 | 0.0059 | -13.8724 | 0.0000 | -0.0939 | -0.0707 |
| Sex: Female (ref = Male) | 1.2451 | 0.2023 | 6.1535 | 0.0000 | 0.8484 | 1.6417 |
| Physical health days | 0.2917 | 0.0136 | 21.4779 | 0.0000 | 0.2650 | 0.3183 |
| Income (ordinal 1–8) | -0.3213 | 0.0520 | -6.1778 | 0.0000 | -0.4233 | -0.2194 |
| Exercise: Yes (ref = No) | -0.3427 | 0.2531 | -1.3537 | 0.1759 | -0.8389 | 0.1536 |
b <- round(coef(model_C), 3)
cat("Fitted equation (Model C):\n")
## Fitted equation (Model C):
cat("Predicted Mental Health Days =",
b["(Intercept)"], "+",
b["sleep_hrs"], "× Sleep +",
b["age"], "× Age +",
b["sexFemale"], "× Female +",
b["physhlth_days"], "× Physical Health Days +",
b["income_cat"], "× Income +",
b["exerciseYes"], "× Exercise (Yes)\n")
## Predicted Mental Health Days = 12.475 + -0.509 × Sleep + -0.082 × Age + 1.245 × Female + 0.292 × Physical Health Days + -0.321 × Income + -0.343 × Exercise (Yes)
[Your plain-language interpretation of every coefficient here.]
tribble(
~Model, ~Predictors, ~`R²`, ~`Adj. R²`,
"Model A: sleep only", 1,
round(summary(model_A)$r.squared, 4), round(summary(model_A)$adj.r.squared, 4),
"Model B: + age + sex", 3,
round(summary(model_B)$r.squared, 4), round(summary(model_B)$adj.r.squared, 4),
"Model C: full model", 6,
round(summary(model_C)$r.squared, 4), round(summary(model_C)$adj.r.squared, 4)
) %>%
kable(caption = "Table 4a. R² and Adjusted R² Across Models") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(3, bold = TRUE, background = "#EBF5FB")
| Model | Predictors | R² | Adj. R² |
|---|---|---|---|
| Model A: sleep only | 1 | 0.0197 | 0.0195 |
| Model B: + age + sex | 3 | 0.0504 | 0.0498 |
| Model C: full model | 6 | 0.1569 | 0.1559 |
[Your interpretation here: which model explains the most variance?]
rmse_C <- round(summary(model_C)$sigma, 2)
cat("Root MSE (Model C):", rmse_C, "mentally unhealthy days\n")
## Root MSE (Model C): 7.09 mentally unhealthy days
[Your practical interpretation here: what does the Root MSE tell you about prediction accuracy?]
# Get ANOVA decomposition
anova_C <- anova(model_C)
print(anova_C)
## Analysis of Variance Table
##
## Response: menthlth_days
## Df Sum Sq Mean Sq F value Pr(>F)
## sleep_hrs 1 5865 5864.8 116.6678 < 2.2e-16 ***
## age 1 6182 6182.2 122.9832 < 2.2e-16 ***
## sex 1 2947 2947.1 58.6266 2.274e-14 ***
## physhlth_days 1 29456 29455.5 585.9585 < 2.2e-16 ***
## income_cat 1 2177 2176.8 43.3031 5.169e-11 ***
## exercise 1 92 92.1 1.8326 0.1759
## Residuals 4993 250993 50.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Compute summary ANOVA table values for manual fill-in
glance_C <- glance(model_C)
SSR <- sum(anova_C$`Sum Sq`[1:(nrow(anova_C)-1)]) # model SS
SSE <- anova_C$`Sum Sq`[nrow(anova_C)] # residual SS
SSY <- SSR + SSE # total SS
df_model <- glance_C$df
df_residual <- glance_C$df.residual
df_total <- df_model + df_residual
MSR <- SSR / df_model
MSE <- SSE / df_residual
F_stat <- MSR / MSE
tribble(
~Source, ~df, ~SS, ~MS, ~F,
"Model", df_model, round(SSR, 2), round(MSR, 2), round(F_stat, 2),
"Residual", df_residual, round(SSE, 2), round(MSE, 2), NA,
"Total", df_total, round(SSY, 2), NA, NA
) %>%
kable(caption = "Table 4c. ANOVA Summary Table for Model C",
na = "") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
| Source | df | SS | MS | F |
|---|---|---|---|---|
| Model | 6 | 46718.54 | 7786.42 | 154.9 |
| Residual | 4993 | 250992.80 | 50.27 | NA |
| Total | 4999 | 297711.34 | NA | NA |
cat("\nOverall F-statistic:", round(glance_C$statistic, 2),
"\np-value:", format(glance_C$p.value, scientific = TRUE), "\n")
##
## Overall F-statistic: 154.9
## p-value: 6.48049e-181
[State the null hypothesis for the overall F-test and your conclusion here.]
par(mfrow = c(2, 2))
plot(model_C, which = 1:4,
col = adjustcolor("steelblue", alpha.f = 0.3), pch = 16)
par(mfrow = c(1, 1))
[Your comments here on what each plot reveals about the LINE assumptions.]
[Your interpretation here: given that menthlth_days
is bounded at 0–30 and heavily right-skewed, discuss which LINE
assumptions are most likely violated and whether this invalidates the
analysis.]
model_C_aug <- augment(model_C)
# Count and display observations with Cook's D > 1
influential <- model_C_aug %>%
filter(.cooksd > 1) %>%
select(menthlth_days, physhlth_days, sleep_hrs, age, .fitted, .resid, .cooksd)
cat("Number of observations with Cook's Distance > 1:", nrow(influential), "\n\n")
## Number of observations with Cook's Distance > 1: 0
if (nrow(influential) > 0) {
influential %>%
mutate(across(where(is.numeric), ~ round(., 3))) %>%
kable(caption = "Observations with Cook's Distance > 1") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
} else {
cat("No observations exceed Cook's Distance of 1.\n")
}
## No observations exceed Cook's Distance of 1.
# Plot Cook's distance for visual reference
model_C_aug %>%
mutate(obs = row_number()) %>%
ggplot(aes(x = obs, y = .cooksd)) +
geom_col(fill = "black", alpha = 0.6) +
geom_hline(yintercept = 1, color = "red", linetype = "dashed", linewidth = 1) +
labs(
title = "Cook's Distance by Observation (Model C)",
subtitle = "Red dashed line = threshold of 1",
x = "Observation Index",
y = "Cook's Distance"
) +
theme_minimal(base_size = 13)
[Your interpretation here: how many influential observations are there and what would you do with them in a real analysis?]
[Write your 3–4 sentence paragraph here summarizing the findings from Model C for a non-statistical audience. Remember: identify significant predictors, state direction and magnitude of key associations, caveat the cross-sectional design, and avoid all statistical jargon.]
End of Lab Activity