Introduction

Statistical modeling is a fundamental tool in epidemiology that allows us to:

Describe relationships between variables
Predict outcomes based on risk factors
Estimate associations while controlling for confounding

This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.

Setup and Data Preparation

# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)

Loading BRFSS 2023 Data

The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.

# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/brfss_2023.XPT") %>%
  janitor::clean_names()

# Display dataset dimensions
names(brfss_full)

##   [1] "state"    "fmonth"   "idate"    "imonth"   "iday"     "iyear"   
##   [7] "dispcode" "seqno"    "psu"      "ctelenm1" "pvtresd1" "colghous"
##  [13] "statere1" "celphon1" "ladult1"  "numadult" "respslc1" "landsex2"
##  [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1"  "cellsex2"
##  [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1"  "landline" "hhadult" 
##  [31] "sexvar"   "genhlth"  "physhlth" "menthlth" "poorhlth" "primins1"
##  [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
##  [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6" 
##  [49] "bpmeds1"  "cholchk3" "toldhi3"  "cholmed3" "cvdinfr4" "cvdcrhd4"
##  [55] "cvdstrk3" "asthma3"  "asthnow"  "chcscnc1" "chcocnc1" "chccopd3"
##  [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital" 
##  [67] "educa"    "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
##  [73] "employ1"  "children" "income3"  "pregnant" "weight2"  "height3" 
##  [79] "deaf"     "blind"    "decide"   "diffwalk" "diffdres" "diffalon"
##  [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3"  "ecignow2"
##  [91] "alcday4"  "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
##  [97] "pneuvac4" "shingle2" "hivtst7"  "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu"  "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast"  "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"  
## [127] "howlong"  "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage"  "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum" 
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry"  "cddiscu1" "cdhous1"  "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload"  "loadulk2" "hasymp1"  "hasymp2" 
## [193] "hasymp3"  "hasymp4"  "hasymp5"  "hasymp6"  "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin" 
## [205] "birthsex" "somale"   "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat"  "marjvape" "marjdab"  "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat"  "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver"   "qstlang" 
## [259] "metstat"  "urbstat"  "mscode"   "ststr"    "strwt"    "rawrake" 
## [265] "wt2rake"  "imprace"  "chispnc"  "crace1"   "cageg"    "cllcpwt" 
## [271] "dualuse"  "dualcor"  "llcpwt2"  "llcpwt"   "rfhlth"   "phys14d" 
## [277] "ment14d"  "hlthpl1"  "hcvu653"  "totinda"  "metvl12"  "metvl22" 
## [283] "maxvo21"  "fc601"    "actin13"  "actin23"  "padur1"   "padur2"  
## [289] "pafreq1"  "pafreq2"  "minac12"  "minac22"  "strfreq"  "pamiss3" 
## [295] "pamin13"  "pamin23"  "pa3min"   "pavig13"  "pavig23"  "pa3vigm" 
## [301] "pacat3"   "paindx3"  "pa150r4"  "pa300r4"  "pa30023"  "pastrng" 
## [307] "parec3"   "pastae3"  "rfhype6"  "cholch3"  "rfchol3"  "michd"   
## [313] "ltasth1"  "casthm1"  "asthms1"  "drdxar2"  "mrace1"   "hispanc" 
## [319] "race"     "raceg21"  "racegr3"  "raceprv"  "sex"      "ageg5yr" 
## [325] "age65yr"  "age80"    "age_g"    "htin4"    "htm4"     "wtkg3"   
## [331] "bmi5"     "bmi5cat"  "rfbmi5"   "chldcnt"  "educag"   "incomg1" 
## [337] "smoker3"  "rfsmok3"  "cureci2"  "drnkany6" "drocdy4"  "rfbing6" 
## [343] "drnkwk2"  "rfdrhv8"  "flshot7"  "pneumo3"  "aidtst4"  "rfseat2" 
## [349] "rfseat3"  "drnkdrv"

Creating a Working Subset

For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.

# Select variables of interest and create analytic dataset
set.seed(553)  # For reproducibility

brfss_subset <- brfss_full %>%
  select(
    # Outcome: Diabetes status
    diabete4,
    # Demographics
    age_g,      # Age category
    sex,         # Sex
    race,       # Race/ethnicity
    educag,     # Education level
    incomg1,    # Income category
    # Health behaviors
    bmi5cat,    # BMI category
    exerany2,     # Physical activity
    smokday2,     # Smoking frequency
    # Health status
    genhlth,      # General health
    rfhype6,    # High blood pressure
    rfchol3     # High cholesterol
  ) %>%
  # Filter to complete cases only
  drop_na() %>%
  # Sample 2000 observations for manageable analysis
  slice_sample(n = 2000)

# Display subset dimensions
cat("Working subset dimensions:",
    nrow(brfss_subset), "observations,",
    ncol(brfss_subset), "variables\n")

## Working subset dimensions: 2000 observations, 12 variables

Data Recoding and Cleaning

# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
  mutate(
    # Outcome: Diabetes (binary)
    diabetes = case_when(
      diabete4 == 1 ~ 1,  # Yes
      diabete4 %in% c(2, 3, 4) ~ 0,  # No, pre-diabetes, or gestational only
      TRUE ~ NA_real_
    ),

    # Age groups
    age_group = factor(case_when(
      age_g == 1 ~ "18-24",
      age_g == 2 ~ "25-34",
      age_g == 3 ~ "35-44",
      age_g == 4 ~ "45-54",
      age_g == 5 ~ "55-64",
      age_g == 6 ~ "65+"
    ), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),

    # Age continuous (midpoint of category)
    age_cont = case_when(
      age_g == 1 ~ 21,
      age_g == 2 ~ 29.5,
      age_g == 3 ~ 39.5,
      age_g == 4 ~ 49.5,
      age_g == 5 ~ 59.5,
      age_g == 6 ~ 70
    ),

    # Sex
    sex = factor(ifelse(sex == 1, "Male", "Female")),

    # Race/ethnicity
    race = factor(case_when(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Native American",
      race == 4 ~ "Asian",
      race == 5 ~ "Native Hawaiian/PI",
      race == 6 ~ "Other",
      race == 7 ~ "Multiracial",
      race == 8 ~ "Hispanic"
    )),

    # Education (simplified)
    education = factor(case_when(
      educag == 1 ~ "< High school",
      educag == 2 ~ "High school graduate",
      educag == 3 ~ "Some college",
      educag == 4 ~ "College graduate"
    ), levels = c("< High school", "High school graduate", "Some college", "College graduate")),

    # Income (simplified)
    income = factor(case_when(
      incomg1 == 1 ~ "< $25,000",
      incomg1 == 2 ~ "$25,000-$49,999",
      incomg1 == 3 ~ "$50,000-$74,999",
      incomg1 == 4 ~ "$75,000+",
      incomg1 == 5 ~ "Unknown"
    ), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),

    # BMI category
    bmi_cat = factor(case_when(
      bmi5cat == 1 ~ "Underweight",
      bmi5cat == 2 ~ "Normal",
      bmi5cat == 3 ~ "Overweight",
      bmi5cat == 4 ~ "Obese"
    ), levels = c("Underweight", "Normal", "Overweight", "Obese")),

    # Physical activity (binary)
    phys_active = ifelse(exerany2 == 1, 1, 0),

    # Current smoking
    current_smoker = case_when(
      smokday2 == 1 ~ 1,  # Every day
      smokday2 == 2 ~ 1,  # Some days
      smokday2 == 3 ~ 0,  # Not at all
      TRUE ~ 0
    ),

    # General health (simplified)
    gen_health = factor(case_when(
      genhlth %in% c(1, 2) ~ "Excellent/Very good",
      genhlth == 3 ~ "Good",
      genhlth %in% c(4, 5) ~ "Fair/Poor"
    ), levels = c("Excellent/Very good", "Good", "Fair/Poor")),

    # Hypertension
    hypertension = ifelse(rfhype6 == 2, 1, 0),

    # High cholesterol
    high_chol = ifelse(rfchol3 == 2, 1, 0)
  ) %>%
  # Select only the clean variables
  select(diabetes, age_group, age_cont, sex, race, education, income,
         bmi_cat, phys_active, current_smoker, gen_health,
         hypertension, high_chol) %>%
  # Remove any remaining missing values
  drop_na()

# Save the cleaned subset for future use
write_rds(brfss_clean,
          "C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/brfss_2023_clean.rds")

cat("Clean dataset saved with", nrow(brfss_clean), "complete observations\n")

## Clean dataset saved with 1281 complete observations

Descriptive Statistics

# Summary table by diabetes status
desc_table <- brfss_clean %>%
  group_by(hypertension) %>%
  summarise(
    N = n(),
    `Mean Age` = round(mean(age_cont), 1),
    `% Male` = round(100 * mean(sex == "Male"), 1),
    `% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
    `% Physically Active` = round(100 * mean(phys_active), 1),
    `% Current Smoker` = round(100 * mean(current_smoker), 1),
    `% Hypertension` = round(100 * mean(hypertension), 1),
    `% High Cholesterol` = round(100 * mean(high_chol), 1)
  ) %>%
  mutate(hypertension = ifelse(hypertension == 1, "Hypertension", "No Hypertension"))

desc_table %>%
  kable(caption = "Descriptive Statistics by Diabetes Status",
        align = "lrrrrrrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Descriptive Statistics by Diabetes Status
hypertension	N	Mean Age	% Male	% Obese	% Physically Active	% Current Smoker	% Hypertension	% High Cholesterol
No Hypertension	606	54.5	46.2	30.7	69.3	32.8	0	30.9
Hypertension	675	63.1	53.2	45.6	64.1	25.6	100	61.3

Student Lab Activity

Lab Overview

In this lab, you will:

Build your own logistic regression model predicting hypertension (high blood pressure)
Create dummy variables for categorical predictors
Interpret regression coefficients
Test for confounding and interaction
Perform model diagnostics

Lab Instructions

Task 1: Explore the Outcome Variable

# YOUR CODE HERE: Create a frequency table of hypertension status

table(brfss_clean$hypertension)

## 
##   0   1 
## 606 675

# YOUR CODE HERE: Calculate the prevalence of hypertension by age group

brfss_clean %>%
  group_by(age_group) %>%
  summarise(
    prevalence = mean(hypertension, na.rm = TRUE),
    n = n()
  )

## # A tibble: 6 × 3
##   age_group prevalence     n
##   <fct>          <dbl> <int>
## 1 18-24         0.0833    12
## 2 25-34         0.195     77
## 3 35-44         0.304    138
## 4 45-54         0.379    161
## 5 55-64         0.515    266
## 6 65+           0.668    627

Questions:

What is the overall prevalence of hypertension in the dataset?

The overall prevalence of hypertension in this data set is 52.7%.

How does hypertension prevalence vary by age group?

The prevalence of hypertension increases as age increases. This shows us that hypertension is strongly associated with aging. Older adults are more likely to have hypertension compared to younger adults.

Task 2: Build a Simple Logistic Regression Model

# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_regression <- glm(hypertension ~ age_cont,
                              data = brfss_clean,
                              family = binomial(link = "logit"))

# YOUR CODE HERE: Display the results with odds ratios

tidy(model_logistic_regression, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Simple logistic Regression: Hypertension ~ Age (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Simple logistic Regression: Hypertension ~ Age (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.048	0.296	-10.293	0	0.026	0.084
age_cont	1.055	0.005	10.996	0	1.045	1.065

Questions:

What is the odds ratio for age? Interpret this value.

The odds ratio for age is 1.055. For each 1-year increase in age, the odds of having hypertension increases by 5.5%.

Is the association statistically significant?

The association between hypertension and age is statistically significant. The p-value is less than 0.05. We can reject the null hypothesis.

What is the 95% confidence interval for the odds ratio?

The 95% confidence interval for the odds ratio is 1.045-1.065. We are 95% confident that the true odds ratio for age lies between 1.045 and 1.065. The confidence interval does not cross 1, which shows a positive significant association between age and hypertension.

Task 3: Create a Multiple Regression Model

# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker

model_logistic_multiple <- glm(hypertension ~ age_cont + sex + bmi_cat +
                                phys_active + current_smoker,
                               data = brfss_clean,
                               family = binomial(link = "logit"))

# YOUR CODE HERE: Display the results

tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Multiple Logistic Regression: hypertension ~ Age + Covariates (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  scroll_box(height = "400px")

Multiple Logistic Regression: hypertension ~ Age + Covariates (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.008	0.653	-7.355	0.000	0.002	0.028
age_cont	1.061	0.005	11.234	0.000	1.050	1.073
sexMale	1.270	0.123	1.950	0.051	0.999	1.616
bmi_catNormal	2.097	0.546	1.356	0.175	0.759	6.756
bmi_catOverweight	3.241	0.543	2.166	0.030	1.183	10.385
bmi_catObese	6.585	0.545	3.459	0.001	2.394	21.176
phys_active	0.900	0.130	-0.808	0.419	0.697	1.162
current_smoker	1.071	0.139	0.495	0.621	0.817	1.407

Questions:

How did the odds ratio for age change after adjusting for other variables? After adjusting for other variables, odds ratio increases from 1.055 to 1.061. This shows us that confounding factors negatively play a role in the association between hypertension and age, meaning the adjusted model underestimated the true association between age and hypertension.
What does this suggest about confounding? The increase in Odds Ratio after adjustment suggests that BMI, physical activity, sex, or smoking were negatively confounding the association between age and hypertension. These variables masked part of the true effect of age in the unadjusted model.
Which variables are the strongest predictors of hypertension? The strongest predictors of hypertension are age and obese BMI. Age and BMI had a p-value less than 0.05, making them statistically significant compared to the other confounding factors.

Task 4: Interpret Dummy Variables

# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat

dummy_table <- data.frame(
  bmi_cat = c("underweight","normal weight", "overweight", "obese"),
  `Dummy 1 (normal weight)` = c(0, 1, 0, 0),
  `Dummy 2 (overweight)` = c(0, 0, 1, 0),
  `Dummy 3 (obese)` = c(0, 0, 0, 1),
  check.names = FALSE
)

dummy_table %>%
  kable(caption = "Dummy Variable Coding for Education",
        align = "lccc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#ffe6e6")  # Highlight reference category

Dummy Variable Coding for Education
bmi_cat	Dummy 1 (normal weight)	Dummy 2 (overweight)	Dummy 3 (obese)
underweight	0	0	0
normal weight	1	0	0
overweight	0	1	0
obese	0	0	1

# YOUR CODE HERE: Extract and display the odds ratios for BMI categories

bmi_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "bmi_cat")) %>%
  mutate(
    bmi_level = str_remove(term, "bmi_cat"),
    bmi_level = factor(bmi_level,
                             levels = c("underweight","normal weight", "overweight", "obese"))
  )

# Add reference category
ref_row <- data.frame(
  term = "bmi_catUnderweight",
  estimate = 1.0,
  std.error = 0,
  statistic = NA,
  p.value = NA,
  conf.low = 1.0,
  conf.high = 1.0,
  bmi_level = factor("Underweight (Ref)",
                          levels = c("Underweight (Ref)", "Normal", "Overweight", "Obese"))
)

bmi_coefs_full <- bind_rows(ref_row, bmi_coefs) %>%
  mutate(bmi_level = factor(bmi_level,
                                 levels = c("Underweight (Ref)", "Normal", "Overweight", "Obese")))

# Plot
p3 <- ggplot(bmi_coefs_full, aes(x = bmi_level, y = estimate)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                  size = 0.8, color = "darkblue") +
  coord_flip() +
  labs(
    title = "Association Between BMI and Hypertension",
    subtitle = "Adjusted Odds Ratios (reference: Underweight)",
    x = "BMI Level",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal(base_size = 12)

ggplotly(p3)

# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
  include = c("bmi_cat"),
  variable_labels = c(
    bmi_cat = "bmi_cat"),
  facet_labeller = ggplot2::label_wrap_gen(10)
)

Questions:

What is the reference category for BMI?

The reference category for BMI is Underweight.

Interpret the odds ratio for “Obese” compared to the reference category.

The odds ratio for the obese category is approximately 6 compared to the underweight reference group, with a wide confidence interval that crosses 1. In addition, the p-value is greater than 0.05 with a value of 0.062, making it not statistically significant.

How would you explain this to a non-statistician?

Individuals who are obese may have a higher chance of experiencing hypertension compared to those who are underweight. The estimate indicates the odds could be about 6 times greater, but we are not certain, so we can’t say for sure that this difference really matters in clinical practice.

Task 5: Test for Interaction

# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category

model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
                         data = brfss_clean,
                         family = binomial(link = "logit")) 

# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "age_cont")) %>%
  kable(caption = "Age × BMI Interaction Model (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Age × BMI Interaction Model (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
age_cont	1.004	0.042	0.102	0.918	0.929	1.108
age_cont:bmi_catNormal	1.058	0.043	1.306	0.192	0.957	1.147
age_cont:bmi_catOverweight	1.063	0.043	1.423	0.155	0.962	1.151
age_cont:bmi_catObese	1.054	0.042	1.232	0.218	0.954	1.140

# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction

# Model without interaction
model_no_interaction <- glm(hypertension ~ age_cont + bmi_cat,
                            data = brfss_clean,
                            family = binomial(link = "logit"))

# Model with interaction
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
                         data = brfss_clean,
                         family = binomial(link = "logit"))

# Likelihood ratio test
anova(model_no_interaction, model_interaction, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: hypertension ~ age_cont + bmi_cat
## Model 2: hypertension ~ age_cont * bmi_cat
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      1276     1568.1                     
## 2      1273     1566.1  3   1.9645   0.5798

# Display interaction results 
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "age_cont")) %>%  
  kable(
    caption = "Age × BMI Interaction Model (Odds Ratios)",
    digits = 3,
    col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  )

Age × BMI Interaction Model (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
age_cont	1.004	0.042	0.102	0.918	0.929	1.108
age_cont:bmi_catNormal	1.058	0.043	1.306	0.192	0.957	1.147
age_cont:bmi_catOverweight	1.063	0.043	1.423	0.155	0.962	1.151
age_cont:bmi_catObese	1.054	0.042	1.232	0.218	0.954	1.140

Questions:

Is the interaction term statistically significant?

No, the interaction between age and BMI is not statistically significant. All of the p-values are greater than 0.05, making none of them statistically significant.

What does this mean in epidemiologic terms (effect modification)? In epidemiological terms, BMI does not change the relationship between age and hypertension in this dataset.
Create a visualization showing predicted probabilities by age and BMI category

# Generate predicted probabilities by age and BMI
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "bmi_cat"))

# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(linewidth = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
  labs(
    title = "Predicted Probability of Hypertension by Age and BMI",
    subtitle = "Testing for Age × BMI Interaction",
    x = "Age (years)",
    y = "Predicted Probability of Hypertension",
    color = "bmi_cat",
    fill = "bmi_cat"
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = c("underweight" = "#E64B35", "normal" = "#4DBBD5")) +
  scale_fill_manual(values = c("overweight" = "#E64B35", "obese" = "#4DBBD5")) +
  
  scale_fill_manual(values = c(
    "Underweight" = "#E64B35", 
    "Normal" = "#4DBBD5", 
    "Overweight" = "#00A087", 
    "Obese" = "#3C5488"
  )) + 
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

ggplotly(p4)

Task 6: Model Diagnostics

# YOUR CODE HERE: Calculate VIF for your multiple regression model

# Calculate VIF
vif_values <- vif(model_logistic_multiple)

# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
  # If matrix (categorical variables present), extract GVIF^(1/(2*Df))
  vif_df <- data.frame(
    Variable = rownames(vif_values),
    VIF = vif_values[, "GVIF^(1/(2*Df))"]
  )
} else {
  # If vector (only continuous variables)
  vif_df <- data.frame(
    Variable = names(vif_values),
    VIF = as.numeric(vif_values)
  )
}

# Add interpretation
vif_df <- vif_df %>%
  arrange(desc(VIF)) %>%
  mutate(
    Interpretation = case_when(
      VIF < 5 ~ "Low (No concern)",
      VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
      VIF >= 10 ~ "High (Problem)"
    )
  )

vif_df %>%
  kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
        digits = 2,
        align = "lrc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
  row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
  row_spec(which(vif_df$VIF < 5), background = "#90EE90")

Variance Inflation Factors (VIF) for Multiple Regression Model
	Variable	VIF	Interpretation
age_cont	age_cont	1.06	Low (No concern)
current_smoker	current_smoker	1.04	Low (No concern)
bmi_cat	bmi_cat	1.02	Low (No concern)
phys_active	phys_active	1.01	Low (No concern)
sex	sex	1.01	Low (No concern)

# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations

# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)

# Create data frame
influence_df <- data.frame(
  observation = 1:length(cooks_d),
  cooks_d = cooks_d
) %>%
  mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))

# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Cook's Distance: Identifying Influential Observations",
    subtitle = "Values > 1 indicate potentially influential observations",
    x = "Observation Number",
    y = "Cook's Distance",
    color = "Influential?"
  ) +
  scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
  theme_minimal(base_size = 12)

ggplotly(p5)

Questions:

Are there any concerns about multicollinearity?

All VIF values suggest that there is no evidence of multicollinearity.

Are there any influential observations that might affect your results?

There are no influential observations that might be affecting the results.

What would you do if you found serious violations?

If there were serious violations found, I would consider removing or combining correlated variables.

Task 7: Model Comparison

# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
# Model B: Age + sex + bmi_cat
# Model C: Age + sex + bmi_cat + phys_active + current_smoker

# Model 1: age only
model1 <- glm(hypertension ~ age_cont,
              data = brfss_clean,
              family = binomial)

# Model 2: Age + sex + bmi_cat
model2 <- glm(hypertension ~ age_cont + sex + bmi_cat,
              data = brfss_clean,
              family = binomial)

# Model 3: Age + sex + bmi_cat + phys_active + current_smoker
model3 <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
              data = brfss_clean,
              family = binomial)

# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")

# Create comparison table
model_comp <- data.frame(
  Model = c("Model 1: age only",
            "Model 2: Age + sex + bmi_cat",
            "Model 3: Age + sex + bmi_cat + phys_active + current_smoker"),
  AIC = c(AIC(model1), AIC(model2), AIC(model3)),
  BIC = c(BIC(model1), BIC(model2), BIC(model3)),
  `Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
  check.names = FALSE
)

model_comp %>%
  kable(caption = "Model Comparison: AIC, BIC, and Deviance",
        digits = 2,
        align = "lrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")

Model Comparison: AIC, BIC, and Deviance
Model	AIC	BIC	Deviance
Model 1: age only	1636.61	1646.92	1632.61
Model 2: Age + sex + bmi_cat	1576.49	1607.42	1564.49
Model 3: Age + sex + bmi_cat + phys_active + current_smoker	1579.50	1620.74	1563.50

Questions:

Which model has the best fit based on AIC?

Model 2 has the lowest AIC, suggesting it provides the best fit to the data.

Is the added complexity of the full model justified?

The AIC and BIC results indicate that model 2 fits the data much better than the other two models. The added predictors does not improve model fit, so the added complexity is not justified.

Which model would you choose for your final analysis? Why?

For my final analysis, I would choose model 2-age and sex because it has the lowest AIC/BIC which indicates a better model fit. Model 2 has the lowest AIC, suggesting it provides the best fit to the data.

Summary

Introduction The goal of this analysis was to examine whether BMI and age are associated with hypertension among U.S. adults. There has been extensive research done showing that BMI and age are well established predictors of elevated blood pressure. This analysis uses data from the Behavioral Risk factor Surveillance System (BRFSS) 2023, to analyze the association between age, BMI, and hypertension while accounting for potential confounding factors. Methods The analysis used a subset of BRFSS 2023 survey data consisting of 1,281 randomly sampled participants with completed information on the relevant variables. Hypertension was treated as a binary variable, age was analyzed as a continuous variable, and BMI was analyzed as a categorical variable. A series of logistic regression models were fitted to estimate odds ratio and 95% confidence intervals for the association between predictors and hypertension. The analytics approach included the following: 1. A simple logistic regression model: assessing the unadjusted association between age and hypertension. 2. A multiple logistic regression model: adjusting for sex, BMI, physical activity, and smoking status. 3. An interaction mode: testing whether the association between age and hypertension differed by BMI category. Model Diagnostics Included: 1. Variance inflation factors(VIFs) to assess multicollinearity 2. Cooks distance 3. Model fit using AIC, BIC,and likelihood ratio tests Results The results showed that hypertension prevalence increased steadily across age groups, older adults experienced higher rates of hypertension compared to younger adults. Individuals who experienced obesity also experienced higher rates of hypertension relative to those with lower BMI. These patterns are consistent with the current research literature. In the unadjusted logistic regression model, age was significantly associated with hypertension (OR= 1.055; 95% CI: 1.045-1.065), indicating that each additional year of age was associated with a 5.5% increase in the odds of hypertension. After adjusting for BMI, age, physical activity, and smoking status, the association between age and hypertension remained statistically significant with a slight increase in OR from 1.055 to 1.061. There was no statistically significant interaction observed between age and BMI category, indicating that the effect of age on hypertension did not differ meaningfully across BMI categories. The model diagnostics indicated no concerns regarding multicollinearity and no influential observations based on cook’s distance. Model comparison results favored model 2, which demonstrated the best fit based on AIC and BIC. Interpretation The results indicate that both age and BMI are important predictors of hypertension, even after adjusting for behavioral and demographic covariates. There is a strong and consistent association between age and hypertension, showing the biological aging process. Individuals in higher BMI categories, particularly obese, showed higher odds of hypertension, however, estimates were imprecise with wide confidence intervals, indicating uncertainty in the strength of this association. Limitations When conducting any type of research, you should always consider the limitations of your study. The first limitation to the study is that BRFSS data relies on self-reported data, which may introduce misclassification or recall bias. Second, there were additional confounding factors that should be considered like diet, medication use, or genetics. Finally, we did not use complex survey designs for the BRFSS data, which would include weighted sampling, clustering, or stratification. This may have an effect on generalizability of the results and lead to bias in the data.

Key Concepts Covered

Statistical modeling describes relationships between variables
Regression types depend on the outcome variable type
Logistic regression is appropriate for binary outcomes
Multiple regression controls for confounding
Dummy variables represent categorical predictors
Interactions test for effect modification
Model diagnostics check assumptions and identify problems
Model comparison helps select the best model

Important Formulas

Logistic Regression:

\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]

Odds Ratio:

\[\text{OR} = e^{\beta_i}\]

Predicted Probability:

\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]

References

Agresti, A. (2018). An Introduction to Categorical Data Analysis (3rd ed.). Wiley.
Hosmer, D. W., Lemeshow, S., & Sturdivant, R. X. (2013). Applied Logistic Regression (3rd ed.). Wiley.
Vittinghoff, E., Glidden, D. V., Shiboski, S. C., & McCulloch, C. E. (2012). Regression Methods in Biostatistics (2nd ed.). Springer.
Centers for Disease Control and Prevention. (2023). Behavioral Risk Factor Surveillance System.

Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggstats_0.12.0   gtsummary_2.5.0  ggeffects_2.3.2  car_3.1-3       
##  [5] carData_3.0-5    broom_1.0.11     plotly_4.12.0    kableExtra_1.4.0
##  [9] knitr_1.51       haven_2.5.5      lubridate_1.9.4  forcats_1.0.1   
## [13] stringr_1.6.0    dplyr_1.1.4      purrr_1.2.1      readr_2.1.6     
## [17] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.0    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         xfun_0.56            bslib_0.9.0         
##  [4] htmlwidgets_1.6.4    insight_1.4.6        tzdb_0.5.0          
##  [7] vctrs_0.6.5          tools_4.5.1          crosstalk_1.2.2     
## [10] generics_0.1.4       datawizard_1.3.0     pkgconfig_2.0.3     
## [13] data.table_1.18.0    RColorBrewer_1.1-3   S7_0.2.0            
## [16] lifecycle_1.0.5      compiler_4.5.1       farver_2.1.2        
## [19] textshaping_1.0.4    htmltools_0.5.9      sass_0.4.10         
## [22] yaml_2.3.12          lazyeval_0.2.2       Formula_1.2-5       
## [25] pillar_1.11.1        jquerylib_0.1.4      broom.helpers_1.22.0
## [28] cachem_1.1.0         abind_1.4-8          tidyselect_1.2.1    
## [31] digest_0.6.39        stringi_1.8.7        labeling_0.4.3      
## [34] labelled_2.16.0      fastmap_1.2.0        grid_4.5.1          
## [37] cli_3.6.5            magrittr_2.0.4       cards_0.7.1         
## [40] utf8_1.2.6           withr_3.0.2          scales_1.4.0        
## [43] backports_1.5.0      timechange_0.3.0     rmarkdown_2.30      
## [46] httr_1.4.7           otel_0.2.0           hms_1.1.4           
## [49] evaluate_1.0.5       viridisLite_0.4.2    rlang_1.1.6         
## [52] glue_1.8.0           xml2_1.5.1           svglite_2.2.2       
## [55] rstudioapi_0.18.0    jsonlite_2.0.0       R6_2.6.1            
## [58] systemfonts_1.3.1

Statistical Modeling in Epidemiology

EPI 553 - Advanced Epidemiologic Methods

Grace Beal

2026-02-24

Introduction

Setup and Data Preparation

Loading BRFSS 2023 Data

Creating a Working Subset

Data Recoding and Cleaning

Descriptive Statistics

Student Lab Activity

Lab Overview

Lab Instructions

Task 1: Explore the Outcome Variable

Task 2: Build a Simple Logistic Regression Model

Task 3: Create a Multiple Regression Model

Task 4: Interpret Dummy Variables

Task 5: Test for Interaction

Task 6: Model Diagnostics

If there were serious violations found, I would consider removing or combining correlated variables.

Task 7: Model Comparison

Summary

Key Concepts Covered

Important Formulas

References