# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)

Load brfss_subset_2023.rds (Use your own file path) 220 -

brfss_clean <- read_rds("/Users/mattgoldman/Documents/PhD in Epi/Spring 2026/EPI 553/Week 6/brfss_subset_2023.rds")

Descriptive Statistics

# Summary table by diabetes status
desc_table <- brfss_clean %>%
  group_by(diabetes) %>%
  summarise(
    N = n(),
    `Mean Age` = round(mean(age_cont), 1),
    `% Male` = round(100 * mean(sex == "Male"), 1),
    `% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
    `% Physically Active` = round(100 * mean(phys_active), 1),
    `% Current Smoker` = round(100 * mean(current_smoker), 1),
    `% Hypertension` = round(100 * mean(hypertension), 1),
    `% High Cholesterol` = round(100 * mean(high_chol), 1)
  ) %>%
  mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))

desc_table %>%
  kable(caption = "Descriptive Statistics by Diabetes Status",
        align = "lrrrrrrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Descriptive Statistics by Diabetes Status
diabetes	N	Mean Age	% Male	% Obese	% Physically Active	% Current Smoker	% Hypertension	% High Cholesterol
No Diabetes	1053	58.2	49.0	34.8	69.4	29.3	47.5	42.5
Diabetes	228	63.1	53.9	56.1	53.5	27.6	76.8	67.1

Part 2: Student Lab Activity

Lab Overview

In this lab, you will:

Build your own logistic regression model predicting hypertension (high blood pressure)
Create dummy variables for categorical predictors
Interpret regression coefficients
Test for confounding and interaction
Perform model diagnostics

Lab Instructions

Task 1: Explore the Outcome Variable

# YOUR CODE HERE: Create a frequency table of hypertension status
table(brfss_clean$hypertension)

## 
##   0   1 
## 606 675

summarise(brfss_clean, `% hypertension` = mean(hypertension))

## # A tibble: 1 × 1
##   `% hypertension`
##              <dbl>
## 1            0.527

# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
hyper_table <- brfss_clean %>%
  group_by(age_group) %>%
  summarise(
    N = n(),
    `% Hypertension` = round(100 * mean(hypertension), 1),
  )

hyper_table %>%
  kable(caption = "Hypertension Prevalence by Age",
        align = "lrrrrrrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Hypertension Prevalence by Age
age_group	N	% Hypertension
18-24	12	8.3
25-34	77	19.5
35-44	138	30.4
45-54	161	37.9
55-64	266	51.5
65+	627	66.8

Questions:

What is the overall prevalence of hypertension in the dataset? The overall prevalence of hypertension is 52.7%
How does hypertension prevalence vary by age group? Prevalence of hypertension increases as age group increases —

Task 2: Build a Simple Logistic Regression Model

# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_simple <- glm(hypertension ~ age_cont,
                              data = brfss_clean,
                              family = binomial(link = "logit"))

# YOUR CODE HERE: Display the results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.048	0.296	-10.293	0	0.026	0.084
age_cont	1.055	0.005	10.996	0	1.045	1.065

pp <- predict_response(model_logistic_simple, terms = "age_cont")
plot(pp)

Questions:

What is the odds ratio for age? Interpret this value. The odds ratio for age is 1.055, which means that for every year the risk of hypertension is expected to increase by 5.5%.
Is the association statistically significant? Yes, the association between age and hypertension is significant (p<0.05)
What is the 95% confidence interval for the odds ratio? The 95% confidence interval is 1.045 to 1.065.

Task 3: Create a Multiple Regression Model

# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
                                phys_active + current_smoker,
                               data = brfss_clean,
                               family = binomial(link = "logit"))

# YOUR CODE HERE: Display the results
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  scroll_box(height = "400px")

Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.006	1.153	-4.477	0.000	0.000	0.039
age_cont	1.040	0.007	5.445	0.000	1.026	1.056
sexMale	1.214	0.154	1.264	0.206	0.899	1.642
bmi_catNormal	2.012	1.051	0.665	0.506	0.386	37.062
bmi_catOverweight	3.208	1.043	1.118	0.264	0.632	58.630
bmi_catObese	6.993	1.041	1.869	0.062	1.388	127.528
phys_active	0.566	0.155	-3.668	0.000	0.418	0.768
current_smoker	1.239	0.177	1.212	0.226	0.873	1.747

Questions:

How did the odds ratio for age change after adjusting for other variables? The odds ratio decreased, meaning as age increases the expected increase in hypertension risk is reduced.
What does this suggest about confounding? This suggests that there are confounding variables in the list that was included in this multiple logistic regression model.
Which variables are the strongest predictors of hypertension? BMI and physical activity were the strongest predictors of hypertension. The model shows that obese individuals have ~7-times the risk of hypertension and those who are physically active have 56.6% reduced risk. —

Task 4: Interpret Dummy Variables

# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
summary(brfss_clean$bmi_cat)

## Underweight      Normal  Overweight       Obese 
##          20         320         447         494

dummy_table <- data.frame(
  BMI_cat = c("< Underweight", "Normal", "Overweight", "Obese"),
  `Dummy 1 (normal)` = c(0, 1, 0, 0),
  `Dummy 2 (over)` = c(0, 0, 1, 0),
  `Dummy 3 (obese)` = c(0, 0, 0, 1),
  check.names = FALSE
)


# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
dummy_table %>%
  kable(caption = "Dummy Variable Coding for BMI",
        align = "lccc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#ffe6e6")  # Highlight reference category

Dummy Variable Coding for BMI
BMI_cat	Dummy 1 (normal)	Dummy 2 (over)	Dummy 3 (obese)
< Underweight	0	0	0
Normal	1	0	0
Overweight	0	1	0
Obese	0	0	1

Questions:

What is the reference category for BMI? Underweight
Interpret the odds ratio for “Obese” compared to the reference category. The model shows that obese individuals have ~7-times the risk of hypertension compared to underweight individuals.
How would you explain this to a non-statistician? Obesity increases the risk of hypertension. —

Task 5: Test for Interaction

# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category

# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * bmi_cat,
                         data = brfss_clean,
                         family = binomial(link = "logit"))

# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "age_cont")) %>%
  kable(caption = "Age × BMI Interaction Model (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Age × BMI Interaction Model (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
age_cont	3.530	40.609	0.031	0.975	853638056	84628851
age_cont:bmi_catNormal	0.291	40.609	-0.030	0.976	0	0
age_cont:bmi_catOverweight	0.296	40.609	-0.030	0.976	0	0
age_cont:bmi_catObese	0.294	40.609	-0.030	0.976	0	0

# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
              data = brfss_clean,
              family = binomial)

# Model 2: Interaction model
model2 <- model_interaction

# Model 3: Full model
model3 <- model_logistic_multiple

# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")

# Create comparison table
model_comp <- data.frame(
  Model = c("Model 1: Age only",
            "Model 2: Age and BMI Interaction",
            "Model 3: Full model"),
  AIC = c(AIC(model1), AIC(model2), AIC(model3)),
  BIC = c(BIC(model1), BIC(model2), BIC(model3)),
  `Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
  check.names = FALSE
)

model_comp %>%
  kable(caption = "Model Comparison: AIC, BIC, and Deviance",
        digits = 2,
        align = "lrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")

Model Comparison: AIC, BIC, and Deviance
Model	AIC	BIC	Deviance
Model 1: Age only	1175.08	1185.39	1171.08
Model 2: Age and BMI Interaction	1135.64	1176.88	1119.64
Model 3: Full model	1120.93	1162.17	1104.93

Questions:

Is the interaction term statistically significant? No, the p-values are >0.05.
What does this mean in epidemiologic terms (effect modification)? BMI does not impart effect modification on age and hypertension. This means that the association between age and hypertension does not differ across BMI groups.
Create a visualization showing predicted probabilities by age and BMI category

# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont", "bmi_cat"))

# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(linewidth = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
  labs(
    title = "Predicted Probability of Diabetes by Age and BMI",
    subtitle = "Testing for Age × BMI Interaction",
    x = "Age (years)",
    y = "Predicted Probability of Diabetes",
    color = "bmi_cat",
    fill = "bmi_cat"
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

ggplotly(p4)

Task 6: Model Diagnostics

# YOUR CODE HERE: Calculate VIF for your multiple regression model
# Calculate VIF
vif_values <- vif(model_logistic_multiple)

# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
  # If matrix (categorical variables present), extract GVIF^(1/(2*Df))
  vif_df <- data.frame(
    Variable = rownames(vif_values),
    VIF = vif_values[, "GVIF^(1/(2*Df))"]
  )
} else {
  # If vector (only continuous variables)
  vif_df <- data.frame(
    Variable = names(vif_values),
    VIF = as.numeric(vif_values)
  )
}

# Add interpretation
vif_df <- vif_df %>%
  arrange(desc(VIF)) %>%
  mutate(
    Interpretation = case_when(
      VIF < 5 ~ "Low (No concern)",
      VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
      VIF >= 10 ~ "High (Problem)"
    )
  )

vif_df %>%
  kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
        digits = 2,
        align = "lrc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
  row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
  row_spec(which(vif_df$VIF < 5), background = "#90EE90")

Variance Inflation Factors (VIF) for Multiple Regression Model
	Variable	VIF	Interpretation
age_cont	age_cont	1.05	Low (No concern)
current_smoker	current_smoker	1.04	Low (No concern)
phys_active	phys_active	1.01	Low (No concern)
sex	sex	1.01	Low (No concern)
bmi_cat	bmi_cat	1.01	Low (No concern)

# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model_logistic_multiple)

# Create data frame
influence_df <- data.frame(
  observation = 1:length(cooks_d),
  cooks_d = cooks_d
) %>%
  mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))

# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Cook's Distance: Identifying Influential Observations",
    subtitle = "Values > 1 indicate potentially influential observations",
    x = "Observation Number",
    y = "Cook's Distance",
    color = "Influential?"
  ) +
  scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
  theme_minimal(base_size = 12)

ggplotly(p5)

Questions:

Are there any concerns about multicollinearity? No, all VIF values show low concern.
Are there any influential observations that might affect your results? No, all VIF values show low concern.
What would you do if you found serious violations? I would remove the variable with high VIF. —

Task 7: Model Comparison

# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
# Model B: Age + sex + bmi_cat
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
# Model 1: Age only
model4 <- glm(diabetes ~ age_cont,
              data = brfss_clean,
              family = binomial)

# Model 2: Age + BMI
model5 <- glm(diabetes ~ age_cont + bmi_cat,
              data = brfss_clean,
              family = binomial)

# Model 3: Full model
model6 <- model_logistic_multiple

# Likelihood ratio test
lrt_1_2 <- anova(model4, model5, test = "LRT")
lrt_2_3 <- anova(model5, model6, test = "LRT")

# YOUR CODE HERE: Create a comparison table
model_comp_2 <- data.frame(
  Model = c("Model 1: Age only",
            "Model 2: Age + BMI",
            "Model 3: Full model"),
  AIC = c(AIC(model4), AIC(model5), AIC(model6)),
  BIC = c(BIC(model4), BIC(model5), BIC(model6)),
  `Deviance` = c(deviance(model4), deviance(model5), deviance(model6)),
  check.names = FALSE
)
model_comp_2 %>%
  kable(caption = "Model Comparison: AIC, BIC, and Deviance",
        digits = 2,
        align = "lrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")

Model Comparison: AIC, BIC, and Deviance
Model	AIC	BIC	Deviance
Model 1: Age only	1175.08	1185.39	1171.08
Model 2: Age + BMI	1131.45	1157.22	1121.45
Model 3: Full model	1120.93	1162.17	1104.93

Questions:

Which model has the best fit based on AIC? The full model had the best fit based on AIC.
Is the added complexity of the full model justified? Yes, because the AIC improved after adding confounders.
Which model would you choose for your final analysis? Why? I would choose the full model —

Write a brief report (1-2 pages) summarizing your findings:

Introduction: State your research question In this lab we analyze the association between age and hypertension. The research question concerns whether hypertension risk worsens with age. Hypertension, or high blood pressure, risk will likely increase with age as this disease is more common in older populations. In addition, this lab will determine the impact of covariates such as BMI, physical activity, sex, and smoking. These factors are all likely involved in the development of hypertension. It will be important to determine whether these counfounders have effect modifications on the association between age and hypertension.
Methods: Describe your analytic approach In this lab, we will use a multiple logistic regression model to understand the association between age and hypertension, while controlling for confounding variables. Post hoc testing will be done to ensure that these confounders are not causing effect modification. In addition, multicollinearity and likelihood ratio tests will be performed.

Results: Present key findings with tables and figures Hypertension was analyzed by age group to elucidate trends.

Hypertension Prevalence by Age
age_group	N	% Hypertension
18-24	12	8.3
25-34	77	19.5
35-44	138	30.4
45-54	161	37.9
55-64	266	51.5
65+	627	66.8

A multiple logistic regression model was developed to determine the association between age and hypertension, while controlling for confounders.

Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.006	1.153	-4.477	0.000	0.000	0.039
age_cont	1.040	0.007	5.445	0.000	1.026	1.056
sexMale	1.214	0.154	1.264	0.206	0.899	1.642
bmi_catNormal	2.012	1.051	0.665	0.506	0.386	37.062
bmi_catOverweight	3.208	1.043	1.118	0.264	0.632	58.630
bmi_catObese	6.993	1.041	1.869	0.062	1.388	127.528
phys_active	0.566	0.155	-3.668	0.000	0.418	0.768
current_smoker	1.239	0.177	1.212	0.226	0.873	1.747

VIF was calculated to check for multicollinearity.

Variance Inflation Factors (VIF) for Multiple Regression Model
	Variable	VIF	Interpretation
age_cont	age_cont	1.05	Low (No concern)
current_smoker	current_smoker	1.04	Low (No concern)
phys_active	phys_active	1.01	Low (No concern)
sex	sex	1.01	Low (No concern)
bmi_cat	bmi_cat	1.01	Low (No concern)

A model comparison using a likelihood ratio test was performed to determine which model resulted in the lowest AIC, indicating a better fitting model.

Model Comparison: AIC, BIC, and Deviance
Model	AIC	BIC	Deviance
Model 1: Age only	1175.08	1185.39	1171.08
Model 2: Age + BMI	1131.45	1157.22	1121.45
Model 3: Full model	1120.93	1162.17	1104.93

Interpretation: Explain what your results mean The multiple logistic regression model showed significant associations between hypertension and age. The odds ratio for age is 1.055, which means that for every year the risk of hypertension is expected to increase by 5.5%. The model shows that BMI and physical activity were the strongest predictors of hypertension. The model shows that obese individuals have ~7-times the risk of hypertension and those who are physically active have 56.6% reduced risk.

Model analytics showed that BMI does not impart effect modification on age and hypertension. This means that the association between age and hypertension does not differ across BMI groups. In addition, all VIF values showed low concern of multicollinearity. The full model including all confounders was found to have the lowest AIC, meaning the best fit model.

Limitations: Discuss potential issues with your analysis The sample size of 1281 could be improved to allow for more statistical power.

Statistical Modeling in Epidemiology

EPI 553 - Advanced Epidemiologic Methods

Matthew Goldman

2026-02-24

Load brfss_subset_2023.rds (Use your own file path) 220 -

Descriptive Statistics

Part 2: Student Lab Activity

Lab Overview

Lab Instructions

Task 1: Explore the Outcome Variable

Task 2: Build a Simple Logistic Regression Model

Task 3: Create a Multiple Regression Model

Task 4: Interpret Dummy Variables

Task 5: Test for Interaction

Task 6: Model Diagnostics

Task 7: Model Comparison

Write a brief report (1-2 pages) summarizing your findings: