Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the full BRFSS 2023 dataset
brfss_full <- read_rds("~/R/site-library/Epi553/code/brfss_subset_2023.rds") %>%
janitor::clean_names()
# Display dataset dimensions
names(brfss_full)## [1] "diabetes" "age_group" "age_cont" "sex"
## [5] "race" "education" "income" "bmi_cat"
## [9] "phys_active" "current_smoker" "gen_health" "hypertension"
## [13] "high_chol"
# Select variables of interest and create analytic dataset
set.seed(553) # For reproducibility
brfss_subset <- brfss_full %>%
select(
# Outcome: Diabetes status
diabetes,
# Demographics
age_group, # Age category
age_cont,# Age ccontinous
sex, # Sex
race, # Race/ethnicity
education, # Education level
income, # Income category
# Health behaviors
bmi_cat, # BMI category
phys_active, # Physical activity
current_smoker, # Smoking frequency
# Health status
gen_health, # General health
hypertension, # High blood pressure
high_chol # High cholesterol
) %>%
# Filter to complete cases only
drop_na() %>%
# Sample 2000 observations for manageable analysis
slice_sample(n = 2000)
# Display subset dimensions
cat("Working subset dimensions:",
nrow(brfss_subset), "observations,",
ncol(brfss_subset), "variables\n")## Working subset dimensions: 1281 observations, 13 variables
In this lab, you will:
# YOUR CODE HERE: Create a frequency table of hypertension status
brfss_subset %>%
count(hypertension) %>%
mutate(percent = n / sum(n))## # A tibble: 2 × 3
## hypertension n percent
## <dbl> <int> <dbl>
## 1 0 606 0.473
## 2 1 675 0.527
brfss_subset %>%
group_by(age_group) %>%
summarise(
n = n(),
prevalence = mean(hypertension == 1, na.rm = TRUE)
)## # A tibble: 6 × 3
## age_group n prevalence
## <fct> <int> <dbl>
## 1 18-24 12 0.0833
## 2 25-34 77 0.195
## 3 35-44 138 0.304
## 4 45-54 161 0.379
## 5 55-64 266 0.515
## 6 65+ 627 0.668
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
brfss_subset %>%
group_by(age_group) %>%
summarise(
n = n(),
prevalence = mean(hypertension == 1, na.rm = TRUE)
)## # A tibble: 6 × 3
## age_group n prevalence
## <fct> <int> <dbl>
## 1 18-24 12 0.0833
## 2 25-34 77 0.195
## 3 35-44 138 0.304
## 4 45-54 161 0.379
## 5 55-64 266 0.515
## 6 65+ 627 0.668
Questions:
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
# Fit simple logistic regression model
model1 <- glm(hypertension ~ age_cont,
data = brfss_subset,
family = binomial)
summary(model1)##
## Call:
## glm(formula = hypertension ~ age_cont, family = binomial, data = brfss_subset)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.042577 0.295584 -10.29 <2e-16 ***
## age_cont 0.053119 0.004831 11.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1772.1 on 1280 degrees of freedom
## Residual deviance: 1632.6 on 1279 degrees of freedom
## AIC: 1636.6
##
## Number of Fisher Scoring iterations: 4
# YOUR CODE HERE: Display the results with odds ratios
exp(cbind(
OR = coef(model1),
confint(model1)
))## OR 2.5 % 97.5 %
## (Intercept) 0.04771176 0.02644276 0.08431815
## age_cont 1.05455475 1.04476526 1.06475213
Questions:
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
# Fit multiple logistic regression model
model2 <- glm(hypertension ~ age_cont + sex + bmi_cat +
phys_active + current_smoker,
data = brfss_subset,
family = binomial)
# YOUR CODE HERE: Display the results
summary(model2)##
## Call:
## glm(formula = hypertension ~ age_cont + sex + bmi_cat + phys_active +
## current_smoker, family = binomial, data = brfss_subset)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.806068 0.653465 -7.355 1.91e-13 ***
## age_cont 0.059453 0.005292 11.234 < 2e-16 ***
## sexMale 0.239129 0.122612 1.950 0.051141 .
## bmi_catNormal 0.740579 0.546292 1.356 0.175212
## bmi_catOverweight 1.175933 0.542839 2.166 0.030291 *
## bmi_catObese 1.884828 0.544866 3.459 0.000542 ***
## phys_active -0.105371 0.130457 -0.808 0.419260
## current_smoker 0.068533 0.138515 0.495 0.620763
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1772.1 on 1280 degrees of freedom
## Residual deviance: 1563.5 on 1273 degrees of freedom
## AIC: 1579.5
##
## Number of Fisher Scoring iterations: 4
## OR 2.5 % 97.5 %
## (Intercept) 0.008179959 0.002105268 0.02803472
## age_cont 1.061255783 1.050496837 1.07253490
## sexMale 1.270142112 0.998922794 1.61567286
## bmi_catNormal 2.097150060 0.759395421 6.75617644
## bmi_catOverweight 3.241164895 1.182648040 10.38462655
## bmi_catObese 6.585220088 2.394090483 21.17598499
## phys_active 0.899990714 0.696650987 1.16203458
## current_smoker 1.070935933 0.816955023 1.40654285
Questions:
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
levels(brfss_subset$bmi_cat)## [1] "Underweight" "Normal" "Overweight" "Obese"
## (Intercept) bmi_catNormal bmi_catOverweight bmi_catObese
## 1 1 0 1 0
## 2 1 0 0 1
## 3 1 0 1 0
## 4 1 0 0 1
## 5 1 0 0 1
## 6 1 1 0 0
## 7 1 1 0 0
## 8 1 0 1 0
## 9 1 0 1 0
## 10 1 1 0 0
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
exp(cbind(
OR = coef(model2)[grep("bmi_cat", names(coef(model2)))],
confint(model2)[grep("bmi_cat", rownames(confint(model2))), ]
))## OR 2.5 % 97.5 %
## bmi_catNormal 2.097150 0.7593954 6.756176
## bmi_catOverweight 3.241165 1.1826480 10.384627
## bmi_catObese 6.585220 2.3940905 21.175985
Questions:
# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
# Model WITH interaction
model_int <- glm(hypertension ~ age_cont * bmi_cat +
sex + phys_active + current_smoker,
data = brfss_subset,
family = binomial)
summary(model_int)##
## Call:
## glm(formula = hypertension ~ age_cont * bmi_cat + sex + phys_active +
## current_smoker, family = binomial, data = brfss_subset)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.449064 2.558038 -0.566 0.5711
## age_cont 0.004922 0.041980 0.117 0.9067
## bmi_catNormal -2.703080 2.650288 -1.020 0.3078
## bmi_catOverweight -2.623344 2.623875 -1.000 0.3174
## bmi_catObese -1.253018 2.590804 -0.484 0.6286
## sexMale 0.244929 0.123167 1.989 0.0467 *
## phys_active -0.112236 0.130761 -0.858 0.3907
## current_smoker 0.075878 0.138923 0.546 0.5849
## age_cont:bmi_catNormal 0.055910 0.043458 1.287 0.1983
## age_cont:bmi_catOverweight 0.061652 0.043089 1.431 0.1525
## age_cont:bmi_catObese 0.050616 0.042695 1.186 0.2358
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1772.1 on 1280 degrees of freedom
## Residual deviance: 1561.3 on 1270 degrees of freedom
## AIC: 1583.3
##
## Number of Fisher Scoring iterations: 4
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
anova(model2, model_int, test = "LRT")## Analysis of Deviance Table
##
## Model 1: hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker
## Model 2: hypertension ~ age_cont * bmi_cat + sex + phys_active + current_smoker
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1273 1563.5
## 2 1270 1561.3 3 2.2363 0.5248
#Create new data for prediction
new_data <- expand.grid(
age_cont = seq(min(brfss_subset$age_cont),
max(brfss_subset$age_cont),
length.out = 100),
bmi_cat = levels(brfss_subset$bmi_cat),
sex = "Male", # hold constant
phys_active = 1, # hold constant
current_smoker = 0 # hold constant
)
# Get predicted probabilities
new_data$predicted_prob <- predict(model_int,
newdata = new_data,
type = "response")
# Plot
ggplot(new_data,
aes(x = age_cont,
y = predicted_prob,
color = bmi_cat)) +
geom_line(size = 1) +
labs(
x = "Age",
y = "Predicted Probability of Hypertension",
color = "BMI Category"
) +
theme_minimal()Questions:
## GVIF Df GVIF^(1/(2*Df))
## age_cont 1.126628 1 1.061428
## sex 1.016509 1 1.008221
## bmi_cat 1.103045 3 1.016480
## phys_active 1.024820 1 1.012334
## current_smoker 1.073574 1 1.036134
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
# Calculate Cook's distance
cooks_d <- cooks.distance(model2)
# Plot
plot(cooks_d,
type = "h",
main = "Cook's Distance Plot",
ylab = "Cook's Distance")
abline(h = 4/length(cooks_d), col = "red", lty = 2)Questions:
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
# Model B: Age + sex + bmi_cat
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
# Model A: Age only
modelA <- glm(hypertension ~ age_cont,
data = brfss_subset,
family = binomial)
# Model B: Age + sex + BMI
modelB <- glm(hypertension ~ age_cont + sex + bmi_cat,
data = brfss_subset,
family = binomial)
# Model C: Full model (already created as model2)
modelC <- model2
# YOUR CODE HERE: Create a comparison table
AIC(modelA, modelB, modelC)## df AIC
## modelA 2 1636.613
## modelB 6 1576.487
## modelC 8 1579.496
## df BIC
## modelA 2 1646.924
## modelB 6 1607.419
## modelC 8 1620.739
Questions:
Write a brief report (1-2 pages) summarizing your findings:
Introduction: State your research question ## The objective of this analysis was to examine the association between age and hypertension using BRFSS data. Specifically, we assessed whether increasing age is associated with higher odds of hypertension and whether this relationship differs across BMI categories. Additional covariates including sex, physical activity, and smoking status were evaluated to assess potential confounding. Logistic regression modeling was used to estimate odds ratios and assess model fit.
Methods: Describe your analytic approach ## We first calculated the overall prevalence of hypertension and examined prevalence across age groups. A simple logistic regression model was fit with age as the predictor to estimate the crude association with hypertension. We then fit multiple logistic regression models adjusting for sex, BMI category, physical activity, and smoking status to assess confounding. Model comparison was performed using AIC and BIC to determine the most parsimonious model.
Results: Present key findings with tables and figures ## Hypertension prevalence increased steadily across age groups, with the highest prevalence observed among individuals aged 65 and older. In the unadjusted model, each one-year increase in age was associated with approximately a 5–6% increase in the odds of hypertension. After adjusting for covariates, age remained a statistically significant predictor. Model comparison indicated that the model including age, sex, and BMI category provided the best balance between fit and complexity.
Interpretation: Explain what your results mean ##These findings suggest that age is a strong and independent predictor of hypertension. BMI category also appears to be an important contributor, with higher BMI levels associated with greater odds of hypertension. There was no evidence of effect modification between age and BMI category, indicating that the association between age and hypertension is consistent across BMI groups. Overall, the results align with established epidemiologic evidence that hypertension risk increases with age and adiposity.
Limitations: Discuss potential issues with your analysis ## This analysis is based on cross-sectional BRFSS data, which limits causal inference. Hypertension status and other variables are self-reported, introducing potential misclassification bias. Residual confounding may still be present due to unmeasured variables such as diet, medication use, or socioeconomic factors. Finally, logistic regression assumes correct model specification and linearity of the logit for continuous predictors, which may not fully hold.
Submission: Submit your completed R Markdown file and knitted HTML report.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.5.1 gtsummary_1.7.2 ggeffects_1.5.0 car_3.1-2
## [5] carData_3.0-5 broom_1.0.5 plotly_4.10.4 kableExtra_1.4.0
## [9] knitr_1.45 haven_2.5.4 lubridate_1.9.3 forcats_1.0.0
## [13] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
## [17] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.4 xfun_0.42 bslib_0.6.1
## [4] htmlwidgets_1.6.4 insight_0.19.8 tzdb_0.4.0
## [7] vctrs_0.6.5 tools_4.3.2 generics_0.1.3
## [10] fansi_1.0.6 highr_0.10 pkgconfig_2.0.3
## [13] data.table_1.15.99 gt_0.10.1 lifecycle_1.0.4
## [16] farver_2.1.1 compiler_4.3.2 munsell_0.5.0
## [19] codetools_0.2-19 htmltools_0.5.7 sass_0.4.8
## [22] yaml_2.3.8 lazyeval_0.2.2 pillar_1.9.0
## [25] jquerylib_0.1.4 MASS_7.3-60 broom.helpers_1.14.0
## [28] cachem_1.0.8 abind_1.4-5 tidyselect_1.2.0
## [31] digest_0.6.34 stringi_1.8.3 labeling_0.4.3
## [34] fastmap_1.1.1 grid_4.3.2 colorspace_2.1-0
## [37] cli_3.6.2 magrittr_2.0.3 utf8_1.2.4
## [40] withr_3.0.0 scales_1.3.0 backports_1.4.1
## [43] timechange_0.3.0 rmarkdown_2.25 httr_1.4.7
## [46] hms_1.1.3 evaluate_0.23 viridisLite_0.4.2
## [49] rlang_1.1.3 glue_1.7.0 xml2_1.3.6
## [52] svglite_2.1.3 rstudioapi_0.15.0 jsonlite_1.8.8
## [55] R6_2.5.1 systemfonts_1.0.5