Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the full BRFSS 2023 dataset
brfss_clean <- read_rds("brfss_subset_2023.rds")
# Display dataset dimensions
## Descriptive Statistics
# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |
For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.
# Create analytic dataset from cleaned BRFSS subset
set.seed(553) # For reproducibility
brfss_subset <- brfss_clean %>%
select(any_of(c(
"diabetes",
"age_cont",
"sex",
"race",
"educag",
"incomg1",
"bmi_cat",
"phys_active",
"current_smoker",
"hypertension",
"high_chol"
))) %>%
drop_na() %>%
slice_sample(n = 2000)
cat("Working subset dimensions:",
nrow(brfss_subset), "observations,",
ncol(brfss_subset), "variables\n")## Working subset dimensions: 1281 observations, 9 variables
library(dplyr)
library(readr)
brfss_clean2 <- brfss_subset %>%
mutate(
# Make sure outcome is 0/1 (if it’s already 0/1, this won’t change it)
diabetes = case_when(
diabetes == 1 ~ 1,
diabetes == 0 ~ 0,
TRUE ~ NA_real_
),
# Create age groups from age_cont (since you have age_cont, not age_g)
age_group = cut(
age_cont,
breaks = c(18, 25, 35, 45, 55, 65, Inf),
right = FALSE,
labels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")
),
# Ensure factors look right
sex = factor(sex),
bmi_cat = factor(bmi_cat, levels = c("Underweight", "Normal", "Overweight", "Obese"))
) %>%
select(diabetes, age_group, age_cont, sex, bmi_cat,
phys_active, current_smoker, hypertension, high_chol) %>%
drop_na()
cat("Clean dataset created with", nrow(brfss_clean2), "complete observations\n")## Clean dataset created with 1281 complete observations
# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |
A statistical model is a mathematical representation of the relationship between:
\[f(Y) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon\]
Where:
The choice of regression model depends on the type of outcome variable:
| Outcome Type | Regression Type | Link Function | Example |
|---|---|---|---|
| Continuous | Linear | Identity: Y | Blood pressure, BMI |
| Binary | Logistic | Logit: log(p/(1-p)) | Disease status, mortality |
| Count | Poisson/Negative Binomial | Log: log(Y) | Number of infections |
| Time-to-event | Cox Proportional Hazards | Log: log(h(t)) | Survival time |
Let’s model the relationship between age and diabetes prevalence.
# Simple linear regression: diabetes ~ age
model_linear_simple <- lm(diabetes ~ age_cont, data = brfss_clean)
# Display results
tidy(model_linear_simple, conf.int = TRUE) %>%
kable(caption = "Simple Linear Regression: Diabetes ~ Age",
digits = 4,
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | -0.0632 | 0.0481 | -1.3125 | 0.1896 | -0.1576 | 0.0312 |
| age_cont | 0.0041 | 0.0008 | 5.1368 | 0.0000 | 0.0025 | 0.0056 |
Interpretation:
# Create scatter plot with regression line
p1 <- ggplot(brfss_clean, aes(x = age_cont, y = diabetes)) +
geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(
title = "Relationship Between Age and Diabetes",
subtitle = "Simple Linear Regression",
x = "Age (years)",
y = "Probability of Diabetes"
) +
theme_minimal(base_size = 12)
ggplotly(p1) %>%
layout(hovermode = "closest")Diabetes Prevalence by Age
Problem with linear regression for binary outcomes:
Solution: Logistic Regression
Uses the logit link function to ensure predicted probabilities stay between 0 and 1:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
# Simple logistic regression: diabetes ~ age
model_logistic_simple <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.029 | 0.423 | -8.390 | 0 | 0.012 | 0.064 |
| age_cont | 1.034 | 0.007 | 4.978 | 0 | 1.021 | 1.048 |
Interpretation:
Predicted Diabetes Probability by Age
# Generate predicted probabilities
pred_data <- data.frame(age_cont = seq(18, 80, by = 1))
pred_data$predicted_prob <- predict(model_logistic_simple,
newdata = pred_data,
type = "response")
# Plot
p2 <- ggplot(pred_data, aes(x = age_cont, y = predicted_prob)) +
geom_line(color = "darkred", linewidth = 1.5) +
geom_ribbon(aes(ymin = predicted_prob - 0.02,
ymax = predicted_prob + 0.02),
alpha = 0.2, fill = "darkred") +
labs(
title = "Predicted Probability of Diabetes by Age",
subtitle = "Simple Logistic Regression",
x = "Age (years)",
y = "Predicted Probability of Diabetes"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.6)) +
theme_minimal(base_size = 12)
ggplotly(p2)Predicted Diabetes Probability by Age
A confounder is a variable that:
Example: The relationship between age and diabetes may be confounded by BMI, physical activity, and other factors.
# Multiple logistic regression with potential confounders
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
phys_active + current_smoker + education,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.009 | 1.177 | -4.001 | 0.000 | 0.000 | 0.065 |
| age_cont | 1.041 | 0.007 | 5.515 | 0.000 | 1.027 | 1.057 |
| sexMale | 1.191 | 0.154 | 1.133 | 0.257 | 0.880 | 1.613 |
| bmi_catNormal | 1.971 | 1.052 | 0.645 | 0.519 | 0.378 | 36.309 |
| bmi_catOverweight | 3.155 | 1.044 | 1.101 | 0.271 | 0.621 | 57.679 |
| bmi_catObese | 6.834 | 1.041 | 1.845 | 0.065 | 1.354 | 124.675 |
| phys_active | 0.589 | 0.157 | -3.373 | 0.001 | 0.433 | 0.802 |
| current_smoker | 1.213 | 0.178 | 1.085 | 0.278 | 0.852 | 1.716 |
| educationHigh school graduate | 0.634 | 0.288 | -1.579 | 0.114 | 0.364 | 1.131 |
| educationSome college | 0.542 | 0.294 | -2.081 | 0.037 | 0.307 | 0.977 |
| educationCollege graduate | 0.584 | 0.305 | -1.763 | 0.078 | 0.324 | 1.074 |
Interpretation:
Categorical variables with \(k\) levels are represented using \(k-1\) dummy variables (indicator variables).
Education has 4 levels: 1. < High school (reference category) 2. High school graduate 3. Some college 4. College graduate
R automatically creates 3 dummy variables:
# Extract dummy variable coding
dummy_table <- data.frame(
Education = c("< High school", "High school graduate", "Some college", "College graduate"),
`Dummy 1 (HS grad)` = c(0, 1, 0, 0),
`Dummy 2 (Some college)` = c(0, 0, 1, 0),
`Dummy 3 (College grad)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for Education",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| Education | Dummy 1 (HS grad) | Dummy 2 (Some college) | Dummy 3 (College grad) |
|---|---|---|---|
| < High school | 0 | 0 | 0 |
| High school graduate | 1 | 0 | 0 |
| Some college | 0 | 1 | 0 |
| College graduate | 0 | 0 | 1 |
Reference Category: The category with all zeros (< High school) is the reference group. All other categories are compared to this reference.
# Extract education coefficients
educ_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "education")) %>%
mutate(
education_level = str_remove(term, "education"),
education_level = factor(education_level,
levels = c("High school graduate",
"Some college",
"College graduate"))
)
# Add reference category
ref_row <- data.frame(
term = "education< High school",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
education_level = factor("< High school (Ref)",
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate"))
)
educ_coefs_full <- bind_rows(ref_row, educ_coefs) %>%
mutate(education_level = factor(education_level,
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate")))
# Plot
p3 <- ggplot(educ_coefs_full, aes(x = education_level, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between Education and Diabetes",
subtitle = "Adjusted Odds Ratios (reference: < High school)",
x = "Education Level",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)Odds Ratios for Education Levels
# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
include = c("education"),
variable_labels = c(
education = "Education"),
facet_labeller = ggplot2::label_wrap_gen(10)
)An interaction exists when the effect of one variable on the outcome differs across levels of another variable.
Epidemiologic term: Effect modification
Does the effect of age on diabetes differ between males and females?
# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * sex + bmi_cat + phys_active,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × Sex Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.031 | 0.009 | 3.178 | 0.001 | 1.012 | 1.051 |
| age_cont:sexMale | 1.015 | 0.014 | 1.084 | 0.278 | 0.988 | 1.044 |
Interpretation:
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "sex"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and Sex",
subtitle = "Testing for Age × Sex Interaction",
x = "Age (years)",
y = "Predicted Probability of Diabetes",
color = "Sex",
fill = "Sex"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
scale_fill_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)Age-Diabetes Relationship by Sex
Every regression model makes assumptions about the data. If assumptions are violated, results may be invalid.
Variance Inflation Factor (VIF): Measures how much the variance of a coefficient is inflated due to correlation with other predictors.
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.05 | Low (No concern) |
| current_smoker | current_smoker | 1.05 | Low (No concern) |
| phys_active | phys_active | 1.02 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| education | education | 1.01 | Low (No concern) |
| bmi_cat | bmi_cat | 1.01 | Low (No concern) |
Cook’s Distance: Measures how much the model would change if an observation were removed.
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)Cook’s Distance for Influential Observations
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Use Likelihood Ratio Test to compare nested models:
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial)
# Model 2: Age + Sex
model2 <- glm(diabetes ~ age_cont + sex,
data = brfss_clean,
family = binomial)
# Model 3: Full model
model3 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + Sex",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + Sex | 1175.85 | 1191.32 | 1169.85 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |
Interpretation:
All statistical models include an error term (\(\epsilon\)) to account for:
\[Y = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon\]
Key points:
In this lab, you will:
##
## 0 1
## 606 675
##
## 0 1
## 0.4730679 0.5269321
# Prevalence of hypertension by age group
brfss_clean2 %>%
group_by(age_group) %>%
summarise(
N = n(),
Hypertension_Cases = sum(hypertension == 1),
Prevalence = round(mean(hypertension == 1) * 100, 1)
)## # A tibble: 6 × 4
## age_group N Hypertension_Cases Prevalence
## <fct> <int> <int> <dbl>
## 1 18-24 12 1 8.3
## 2 25-34 77 15 19.5
## 3 35-44 138 42 30.4
## 4 45-54 161 61 37.9
## 5 55-64 266 137 51.5
## 6 65+ 627 419 66.8
Questions:
What is the overall prevalence of hypertension in the dataset?
675 individuals with hypertension
606 without hypertension
Prevalence = 675/ 1281 = 0.527
The analytic sample included 1,281 adults, and the overall prevalence of hypertension was 52.7%.
How does hypertension prevalence vary by age group?
Hypertension prevalence increases steadily with age. It is lowest among adults aged 18–24 (8.3%) and rises progressively across age categories, reaching its highest level among adults aged 65+ (66.8%). This pattern suggests a strong positive association between age and hypertension prevalence.
# Fit simple logistic regression model
model1 <- glm(hypertension ~ age_cont,
data = brfss_clean2,
family = binomial)
# View standard model output
summary(model1)##
## Call:
## glm(formula = hypertension ~ age_cont, family = binomial, data = brfss_clean2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.042577 0.295584 -10.29 <2e-16 ***
## age_cont 0.053119 0.004831 11.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1772.1 on 1280 degrees of freedom
## Residual deviance: 1632.6 on 1279 degrees of freedom
## AIC: 1636.6
##
## Number of Fisher Scoring iterations: 4
## OR 2.5 % 97.5 %
## (Intercept) 0.04771176 0.02644276 0.08431815
## age_cont 1.05455475 1.04476526 1.06475213
Questions:
What is the odds ratio for age? Interpret this value.
OR = 1.055. For each one-year increase in age, the odds of hypertension increase by approximately 5.5%, holding all else constant. We get 5.5% from: (1.055 − 1) × 100.
Is the association statistically significant?
Yes, the association between age and hypertension is statistically significant, meaning there is strong evidence that age is related to hypertension risk.
What is the 95% confidence interval for the odds ratio?
The 95% confidence interval for the odds ratio is 1.045–1.065, suggesting a precise and positive association between age and hypertension. Meaning, we are 95% confident that each additional year of age increases the odds of hypertension by between 4.5% and 6.5%.
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
# YOUR CODE HERE: Display the resultsQuestions:
# Set reference categories (run BEFORE model)
brfss_clean2 <- brfss_clean2 %>%
mutate(
sex = relevel(factor(sex), ref = "Female"),
bmi_cat = relevel(factor(bmi_cat), ref = "Normal")
)
# Fit multiple logistic regression model
model2 <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean2,
family = binomial)
# Show model summary
summary(model2)##
## Call:
## glm(formula = hypertension ~ age_cont + sex + bmi_cat + phys_active +
## current_smoker, family = binomial, data = brfss_clean2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.065489 0.385202 -10.554 < 2e-16 ***
## age_cont 0.059453 0.005292 11.234 < 2e-16 ***
## sexMale 0.239129 0.122612 1.950 0.05114 .
## bmi_catUnderweight -0.740579 0.546292 -1.356 0.17521
## bmi_catOverweight 0.435353 0.158073 2.754 0.00588 **
## bmi_catObese 1.144248 0.162600 7.037 1.96e-12 ***
## phys_active -0.105371 0.130457 -0.808 0.41926
## current_smoker 0.068533 0.138515 0.495 0.62076
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1772.1 on 1280 degrees of freedom
## Residual deviance: 1563.5 on 1273 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 4
## OR 2.5 % 97.5 %
## (Intercept) 0.0171546 0.007943934 0.03599689
## age_cont 1.0612558 1.050496837 1.07253490
## sexMale 1.2701421 0.998922794 1.61567286
## bmi_catUnderweight 0.4768376 0.148012712 1.31683702
## bmi_catOverweight 1.5455093 1.134572337 2.10902614
## bmi_catObese 3.1400805 2.288478035 4.33033241
## phys_active 0.8999907 0.696650987 1.16203458
## current_smoker 1.0709359 0.816955023 1.40654285
Questions:
What is the reference category for BMI?
The reference category is “Normal BMI.”
Interpret the odds ratio for “Obese” compared to the reference category.
OR = 3.14
95% CI: 2.29 – 4.33
p < 0.001
Individuals classified as obese have 3.14 times the odds of hypertension compared to individuals with normal BMI, after adjusting for age, sex, physical activity, and smoking.
How would you explain this to a non-statistician?
People who are obese are much more likely to have high blood pressure than people with a normal weight. In fact, their risk is more than three times higher, even after accounting for age, sex, physical activity, and smoking.
# Model WITHOUT interaction (main effects only)
model_no_int <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean2,
family = binomial)
# Model WITH Age × BMI interaction
model_int <- glm(hypertension ~ age_cont * bmi_cat + sex + phys_active + current_smoker,
data = brfss_clean2,
family = binomial)
# Compare models using Likelihood Ratio Test (LRT)
anova(model_no_int, model_int, test = "Chisq")## Analysis of Deviance Table
##
## Model 1: hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker
## Model 2: hypertension ~ age_cont * bmi_cat + sex + phys_active + current_smoker
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1273 1563.5
## 2 1270 1561.3 3 2.2363 0.5248
# ---- Visualization: predicted probabilities by age and BMI ----
library(ggplot2)
# Create a grid of values for prediction
newdata <- expand.grid(
age_cont = seq(min(brfss_clean2$age_cont),
max(brfss_clean2$age_cont),
length.out = 100),
bmi_cat = levels(brfss_clean2$bmi_cat),
sex = "Female",
phys_active = 0,
current_smoker = 0
)
# Predicted probabilities from the interaction model
newdata$pred <- predict(model_int, newdata = newdata, type = "response")
# Plot predicted probabilities
ggplot(newdata, aes(x = age_cont, y = pred, color = bmi_cat)) +
geom_line(linewidth = 1.2) +
labs(
x = "Age",
y = "Predicted Probability of Hypertension",
color = "BMI Category",
title = "Predicted Probability of Hypertension by Age and BMI Category"
) +
theme_minimal()Questions:
Is the interaction term statistically significant?
p = 0.5248
Decision rule:
p < 0.05 → significant interaction
p ≥ 0.05 → not significant
No, the interaction between age and BMI is not statistically significant (p = 0.525).
What does this mean in epidemiologic terms (effect modification)?
There is no evidence that BMI modifies the relationship between age and hypertension. The effect of age on hypertension appears to be similar across BMI categories. This indicates that BMI does not modify the association between age and hypertension. Therefore, the effect of age on hypertension is consistent across BMI categories.
Create a visualization showing predicted probabilities by age and BMI category
## GVIF Df GVIF^(1/(2*Df))
## age_cont 1.126628 1 1.061428
## sex 1.016509 1 1.008221
## bmi_cat 1.103045 3 1.016480
## phys_active 1.024820 1 1.012334
## current_smoker 1.073574 1 1.036134
# ---- Cook's distance (influential observations) ----
cooksD <- cooks.distance(model2)
# Basic Cook's D plot
plot(cooksD, type = "h",
main = "Cook's Distance for Multiple Logistic Regression Model",
ylab = "Cook's Distance", xlab = "Observation Index")
# Common cutoff line: 4/n
abline(h = 4/length(cooksD), lty = 2)## 31 168 228 272 280 366 426 440 496 552 629 646 783 814 910 1047
## 31 168 228 272 280 366 426 440 496 552 629 646 783 814 910 1047
## 1061 1081 1137 1140 1158 1186
## 1061 1081 1137 1140 1158 1186
Questions:
Are there any concerns about multicollinearity?
There is no evidence of problematic multicollinearity. The predictors represent distinct constructs (demographic, behavioral, and clinical factors), and variance inflation factors were within acceptable limits.
Are there any influential observations that might affect your results?
A small number of observations exceed the Cook’s distance cutoff, indicating they may be influential. However, the majority of observations fall well below the threshold, suggesting that influential points are limited and unlikely to substantially affect model results.
What would you do if you found serious violations?
If serious multicollinearity were present, I would examine correlations among predictors and consider removing or combining highly correlated variables. If influential observations were identified, I would inspect those cases for errors or unusual values and evaluate whether model results change when they are excluded.
library(dplyr)
# Model A: Age only
modelA <- glm(hypertension ~ age_cont,
data = brfss_clean2,
family = binomial)
# Model B: Age + sex + bmi_cat
modelB <- glm(hypertension ~ age_cont + sex + bmi_cat,
data = brfss_clean2,
family = binomial)
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
modelC <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean2,
family = binomial)
# Create comparison table
model_comp <- tibble(
Model = c("A: Age only",
"B: Age + sex + BMI",
"C: Age + sex + BMI + phys_active + current_smoker"),
AIC = c(AIC(modelA), AIC(modelB), AIC(modelC)),
BIC = c(BIC(modelA), BIC(modelB), BIC(modelC))
) %>%
mutate(
AIC = round(AIC, 1),
BIC = round(BIC, 1)
)
model_comp## # A tibble: 3 × 3
## Model AIC BIC
## <chr> <dbl> <dbl>
## 1 A: Age only 1637. 1647.
## 2 B: Age + sex + BMI 1576. 1607.
## 3 C: Age + sex + BMI + phys_active + current_smoker 1580. 1621.
Questions:
Which model has the best fit based on AIC?
Model B (Age + sex + BMI) has the lowest AIC (1576.5).
Is the added complexity of the full model justified?
The added complexity of the full model is not justified because it results in a higher AIC and BIC compared to Model B.
Which model would you choose for your final analysis? Why?
Model B (Age + sex + BMI) has the lowest AIC (1576.5), indicating the best model fit. The full model (Model C) has a higher AIC and BIC, suggesting that the additional predictors (physical activity and smoking) do not meaningfully improve model performance. Therefore, Model B would be selected for the final analysis because it provides the best balance between model fit and parsimony.
Write a brief report (1-2 pages) summarizing your findings:
Submission: Submit your completed R Markdown file and knitted HTML report.
Introduction
Hypertension is a major public health concern and a leading risk factor for cardiovascular disease. Identifying predictors of hypertension can help inform prevention strategies and clinical screening efforts. The objective of this analysis was to evaluate demographic and behavioral predictors of hypertension and determine which statistical model best predicts hypertension risk (Schmidt et al., 2020).
Methods
We conducted a cross-sectional analysis using a cleaned subset of BRFSS 2023 data (n = 1,281). The outcome variable was hypertension status (yes/no). Predictor variables included age (continuous), sex, BMI category, physical activity, and smoking status. Logistic regression models were used to estimate odds ratios (ORs) and 95% confidence intervals (CIs). Effect modification was assessed using an Age × BMI interaction term and likelihood ratio testing. Model diagnostics included variance inflation factors (VIF) and Cook’s distance. Model fit was compared using AIC and BIC.
Results
The analytic sample included 1,281 adults, and the overall prevalence of hypertension was 52.7%. Hypertension prevalence increased steadily with age.
In adjusted logistic regression models, age and BMI were the strongest predictors of hypertension. Each additional year of age increased the odds of hypertension by approximately 6%. Compared with individuals with normal BMI, overweight individuals had higher odds of hypertension, and obese individuals had more than three times the odds. Sex, smoking, and physical activity were not statistically significant predictors after adjustment.
Model comparison showed that the model including age, sex, and BMI had the lowest AIC and BIC, indicating the best fit.
Interpretation
Age and BMI were the strongest predictors of hypertension in this sample. Each additional year of age increased the odds of hypertension by approximately 6%, and obesity was associated with more than a threefold increase in odds compared with normal BMI. These findings are consistent with established epidemiologic evidence linking aging and adiposity to cardiovascular risk.
The lack of significant associations for smoking and physical activity in adjusted models suggests their effects may be weaker or mediated through other factors in this dataset.
The absence of interaction between age and BMI indicates that the effect of age on hypertension risk is similar across BMI groups.
Limitations
Several limitations should be considered. First, the analysis was cross-sectional, so causal relationships cannot be inferred. Second, BRFSS data rely on self-report, which may introduce misclassification or recall bias. Third, residual confounding may exist due to unmeasured factors such as diet, medication use, or genetic predisposition. Finally, some age groups had small sample sizes, which may reduce estimate precision.
Conclusion
This analysis demonstrates that age and BMI are strong predictors of hypertension, with obesity representing a particularly important risk factor. The most parsimonious and best-fitting model included age, sex, and BMI. These findings reinforce the importance of weight management and age-related screening strategies in hypertension prevention efforts.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.12.0 gtsummary_2.5.0 ggeffects_2.3.2 car_3.1-3
## [5] carData_3.0-5 broom_1.0.11 plotly_4.12.0 kableExtra_1.4.0
## [9] knitr_1.51 haven_2.5.5 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1 readr_2.1.6
## [17] tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.9.0
## [4] htmlwidgets_1.6.4 insight_1.4.4 lattice_0.22-7
## [7] tzdb_0.5.0 crosstalk_1.2.2 vctrs_0.7.1
## [10] tools_4.5.2 generics_0.1.4 datawizard_1.3.0
## [13] pkgconfig_2.0.3 Matrix_1.7-4 data.table_1.18.0
## [16] RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.5
## [19] compiler_4.5.2 farver_2.1.2 textshaping_1.0.4
## [22] codetools_0.2-20 htmltools_0.5.9 sass_0.4.10
## [25] yaml_2.3.12 lazyeval_0.2.2 Formula_1.2-5
## [28] pillar_1.11.1 jquerylib_0.1.4 broom.helpers_1.22.0
## [31] cachem_1.1.0 abind_1.4-8 nlme_3.1-168
## [34] tidyselect_1.2.1 digest_0.6.39 stringi_1.8.7
## [37] labeling_0.4.3 splines_4.5.2 labelled_2.16.0
## [40] fastmap_1.2.0 grid_4.5.2 cli_3.6.5
## [43] magrittr_2.0.4 cards_0.7.1 utf8_1.2.6
## [46] withr_3.0.2 scales_1.4.0 backports_1.5.0
## [49] timechange_0.3.0 rmarkdown_2.30 httr_1.4.7
## [52] otel_0.2.0 hms_1.1.4 evaluate_1.0.5
## [55] viridisLite_0.4.2 mgcv_1.9-3 rlang_1.1.7
## [58] glue_1.8.0 xml2_1.5.1 svglite_2.2.2
## [61] rstudioapi_0.18.0 jsonlite_2.0.0 R6_2.6.1
## [64] systemfonts_1.3.1