Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)
The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("/Users/sarah/OneDrive/Documents/EPI 553/data/LLCP2023.XPT") %>%
janitor::clean_names()
# Display dataset dimensions
names(brfss_full)
## [1] "state" "fmonth" "idate" "imonth" "iday" "iyear"
## [7] "dispcode" "seqno" "psu" "ctelenm1" "pvtresd1" "colghous"
## [13] "statere1" "celphon1" "ladult1" "numadult" "respslc1" "landsex2"
## [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1" "cellsex2"
## [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1" "landline" "hhadult"
## [31] "sexvar" "genhlth" "physhlth" "menthlth" "poorhlth" "primins1"
## [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
## [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6"
## [49] "bpmeds1" "cholchk3" "toldhi3" "cholmed3" "cvdinfr4" "cvdcrhd4"
## [55] "cvdstrk3" "asthma3" "asthnow" "chcscnc1" "chcocnc1" "chccopd3"
## [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital"
## [67] "educa" "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
## [73] "employ1" "children" "income3" "pregnant" "weight2" "height3"
## [79] "deaf" "blind" "decide" "diffwalk" "diffdres" "diffalon"
## [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3" "ecignow2"
## [91] "alcday4" "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
## [97] "pneuvac4" "shingle2" "hivtst7" "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu" "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"
## [127] "howlong" "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage" "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum"
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry" "cddiscu1" "cdhous1" "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload" "loadulk2" "hasymp1" "hasymp2"
## [193] "hasymp3" "hasymp4" "hasymp5" "hasymp6" "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin"
## [205] "birthsex" "somale" "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat" "marjvape" "marjdab" "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat" "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver" "qstlang"
## [259] "metstat" "urbstat" "mscode" "ststr" "strwt" "rawrake"
## [265] "wt2rake" "imprace" "chispnc" "crace1" "cageg" "cllcpwt"
## [271] "dualuse" "dualcor" "llcpwt2" "llcpwt" "rfhlth" "phys14d"
## [277] "ment14d" "hlthpl1" "hcvu653" "totinda" "metvl12" "metvl22"
## [283] "maxvo21" "fc601" "actin13" "actin23" "padur1" "padur2"
## [289] "pafreq1" "pafreq2" "minac12" "minac22" "strfreq" "pamiss3"
## [295] "pamin13" "pamin23" "pa3min" "pavig13" "pavig23" "pa3vigm"
## [301] "pacat3" "paindx3" "pa150r4" "pa300r4" "pa30023" "pastrng"
## [307] "parec3" "pastae3" "rfhype6" "cholch3" "rfchol3" "michd"
## [313] "ltasth1" "casthm1" "asthms1" "drdxar2" "mrace1" "hispanc"
## [319] "race" "raceg21" "racegr3" "raceprv" "sex" "ageg5yr"
## [325] "age65yr" "age80" "age_g" "htin4" "htm4" "wtkg3"
## [331] "bmi5" "bmi5cat" "rfbmi5" "chldcnt" "educag" "incomg1"
## [337] "smoker3" "rfsmok3" "cureci2" "drnkany6" "drocdy4" "rfbing6"
## [343] "drnkwk2" "rfdrhv8" "flshot7" "pneumo3" "aidtst4" "rfseat2"
## [349] "rfseat3" "drnkdrv"
For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.
# Select variables of interest and create analytic dataset
set.seed(553) # For reproducibility
brfss_subset <- brfss_full %>%
select(
# Outcome: Diabetes status
diabete4,
# Demographics
age_g, # Age category
sex, # Sex
race, # Race/ethnicity
educag, # Education level
incomg1, # Income category
# Health behaviors
bmi5cat, # BMI category
exerany2, # Physical activity
smokday2, # Smoking frequency
# Health status
genhlth, # General health
rfhype6, # High blood pressure
rfchol3 # High cholesterol
) %>%
# Filter to complete cases only
drop_na() %>%
# Sample 2000 observations for manageable analysis
slice_sample(n = 2000)
# Display subset dimensions
cat("Working subset dimensions:",
nrow(brfss_subset), "observations,",
ncol(brfss_subset), "variables\n")
## Working subset dimensions: 2000 observations, 12 variables
# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
mutate(
# Outcome: Diabetes (binary)
diabetes = case_when(
diabete4 == 1 ~ 1, # Yes
diabete4 %in% c(2, 3, 4) ~ 0, # No, pre-diabetes, or gestational only
TRUE ~ NA_real_
),
# Age groups
age_group = factor(case_when(
age_g == 1 ~ "18-24",
age_g == 2 ~ "25-34",
age_g == 3 ~ "35-44",
age_g == 4 ~ "45-54",
age_g == 5 ~ "55-64",
age_g == 6 ~ "65+"
), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),
# Age continuous (midpoint of category)
age_cont = case_when(
age_g == 1 ~ 21,
age_g == 2 ~ 29.5,
age_g == 3 ~ 39.5,
age_g == 4 ~ 49.5,
age_g == 5 ~ 59.5,
age_g == 6 ~ 70
),
# Sex
sex = factor(ifelse(sex == 1, "Male", "Female")),
# Race/ethnicity
race = factor(case_when(
race == 1 ~ "White",
race == 2 ~ "Black",
race == 3 ~ "Native American",
race == 4 ~ "Asian",
race == 5 ~ "Native Hawaiian/PI",
race == 6 ~ "Other",
race == 7 ~ "Multiracial",
race == 8 ~ "Hispanic"
)),
# Education (simplified)
education = factor(case_when(
educag == 1 ~ "< High school",
educag == 2 ~ "High school graduate",
educag == 3 ~ "Some college",
educag == 4 ~ "College graduate"
), levels = c("< High school", "High school graduate", "Some college", "College graduate")),
# Income (simplified)
income = factor(case_when(
incomg1 == 1 ~ "< $25,000",
incomg1 == 2 ~ "$25,000-$49,999",
incomg1 == 3 ~ "$50,000-$74,999",
incomg1 == 4 ~ "$75,000+",
incomg1 == 5 ~ "Unknown"
), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),
# BMI category
bmi_cat = factor(case_when(
bmi5cat == 1 ~ "Underweight",
bmi5cat == 2 ~ "Normal",
bmi5cat == 3 ~ "Overweight",
bmi5cat == 4 ~ "Obese"
), levels = c("Underweight", "Normal", "Overweight", "Obese")),
# Physical activity (binary)
phys_active = ifelse(exerany2 == 1, 1, 0),
# Current smoking
current_smoker = case_when(
smokday2 == 1 ~ 1, # Every day
smokday2 == 2 ~ 1, # Some days
smokday2 == 3 ~ 0, # Not at all
TRUE ~ 0
),
# General health (simplified)
gen_health = factor(case_when(
genhlth %in% c(1, 2) ~ "Excellent/Very good",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor"
), levels = c("Excellent/Very good", "Good", "Fair/Poor")),
# Hypertension
hypertension = ifelse(rfhype6 == 2, 1, 0),
# High cholesterol
high_chol = ifelse(rfchol3 == 2, 1, 0)
) %>%
# Select only the clean variables
select(diabetes, age_group, age_cont, sex, race, education, income,
bmi_cat, phys_active, current_smoker, gen_health,
hypertension, high_chol) %>%
# Remove any remaining missing values
drop_na()
In this lab, you will:
# YOUR CODE HERE: Create a frequency table of hypertension status
brfss_clean %>%
count(hypertension) %>%
mutate(
hypertension = ifelse(hypertension == 1, "Hypertension", "No Hypertension"),
Percent = round(n / sum(n) * 100, 1)
) %>%
kable(
caption = "Frequency of Hypertension in BRFSS Analytic Subset",
col.names = c("Hypertension Status", "Count", "Percent")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
)
| Hypertension Status | Count | Percent |
|---|---|---|
| No Hypertension | 606 | 47.3 |
| Hypertension | 675 | 52.7 |
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
brfss_clean %>%
group_by(age_group) %>%
summarise(
N = n(),
Hypertension_Prev = round(mean(hypertension) * 100, 1)
) %>%
kable(
caption = "Prevalence of Hypertension by Age Group",
col.names = c("Age Group", "Sample Size", "Hypertension (%)")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
)
| Age Group | Sample Size | Hypertension (%) |
|---|---|---|
| 18-24 | 12 | 8.3 |
| 25-34 | 77 | 19.5 |
| 35-44 | 138 | 30.4 |
| 45-54 | 161 | 37.9 |
| 55-64 | 266 | 51.5 |
| 65+ | 627 | 66.8 |
Questions:
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
simple_logistic<- glm(hypertension~ age_cont, data = brfss_clean, family = binomial (link = "logit"))
# YOUR CODE HERE: Display the results with odds ratios
tidy(simple_logistic, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
Questions:
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple1 <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean, family = binomial(link = "logit")
)
# YOUR CODE HERE: Display the results
tidy(model_logistic_multiple1, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")
| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.008 | 0.653 | -7.355 | 0.000 | 0.002 | 0.028 |
| age_cont | 1.061 | 0.005 | 11.234 | 0.000 | 1.050 | 1.073 |
| sexMale | 1.270 | 0.123 | 1.950 | 0.051 | 0.999 | 1.616 |
| bmi_catNormal | 2.097 | 0.546 | 1.356 | 0.175 | 0.759 | 6.756 |
| bmi_catOverweight | 3.241 | 0.543 | 2.166 | 0.030 | 1.183 | 10.385 |
| bmi_catObese | 6.585 | 0.545 | 3.459 | 0.001 | 2.394 | 21.176 |
| phys_active | 0.900 | 0.130 | -0.808 | 0.419 | 0.697 | 1.162 |
| current_smoker | 1.071 | 0.139 | 0.495 | 0.621 | 0.817 | 1.407 |
Questions:
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
dummy_table <- data.frame(
bmi_cat = c("Underweight", "Normal", "Overweight", "Obese"),
`Dummy 1 (Normal)` = c(0, 1, 0, 0),
`Dummy 2 (Overweight)` = c(0, 0, 1, 0),
`Dummy 3 (Obese)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for BMI category",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6")
| bmi_cat | Dummy 1 (Normal) | Dummy 2 (Overweight) | Dummy 3 (Obese) |
|---|---|---|---|
| Underweight | 0 | 0 | 0 |
| Normal | 1 | 0 | 0 |
| Overweight | 0 | 1 | 0 |
| Obese | 0 | 0 | 1 |
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
bmi_coefs <- tidy(model_logistic_multiple1, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "bmi_cat")) %>%
mutate(
bmi_level = str_remove(term, "bmi_cat"),
bmi_level = factor(bmi_level,
levels = c("Normal",
"Overweight",
"Obese"))
)
bmi_coefs
## # A tibble: 3 × 8
## term estimate std.error statistic p.value conf.low conf.high bmi_level
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 bmi_catNorm… 2.10 0.546 1.36 1.75e-1 0.759 6.76 Normal
## 2 bmi_catOver… 3.24 0.543 2.17 3.03e-2 1.18 10.4 Overweig…
## 3 bmi_catObese 6.59 0.545 3.46 5.42e-4 2.39 21.2 Obese
Questions:
# YOUR CODE HERE: Fit a model with Age × BMI interaction
model_interaction <- glm(hypertension ~ age_cont * bmi_cat + sex + phys_active + current_smoker,
data = brfss_clean, family = binomial(link = "logit")
)
# Test if the effect of age on hypertension differs by BMI category
model_no_interaction <- glm(hypertension ~ age_cont + bmi_cat + sex + phys_active + current_smoker,
data = brfss_clean, family = binomial(link = "logit")
)
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
lrt <- anova(model_no_interaction, model_interaction, test = "LRT")
#Predicted probabilities
pred_interact <- ggpredict(
model_interaction,
terms = c("age_cont [18:80]", "bmi_cat")
)
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Hypertension by Age and BMI Category",
subtitle = "From Age × BMI Interaction Model",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "BMI Category",
fill = "BMI Category"
) +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
p4
Questions:
# YOUR CODE HERE: Calculate VIF for your multiple regression model
vif <- vif(model_logistic_multiple1)
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_distance <- cooks.distance(model_logistic_multiple1)
influence_df <- data.frame(
observation = 1:length(cooks_distance),
cooks_distance = cooks_distance
) %>%
mutate(influential = ifelse(cooks_distance > 1, "Yes", "No"))
# Plot
ggplot(influence_df, aes(x = observation, y = cooks_distance, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")
## Number of potentially influential observations: 0
Questions:
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
ModelA <- glm(hypertension ~age_cont,
data = brfss_clean,
family = binomial)
# Model B: Age + sex + bmi_cat
ModelB <- glm(hypertension ~ age_cont + sex + bmi_cat,
data=brfss_clean,
family=binomial)
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
ModelC <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean,
family= binomial)
# YOUR CODE HERE: Create a comparison table
model_comparison <- data.frame(
Model = c("Model A: Age only",
"Model B: Age + sex + bmi_cat",
"Model C:Age + sex + bmi_cat + phys_active + current_smoker"),
AIC = c(AIC(ModelA), AIC(ModelB), AIC(ModelC)),
BIC = c (BIC(ModelA), BIC(ModelB), BIC(ModelC))
)
model_comparison %>%
kable(caption = "Model Comparison: AIC, BIC",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comparison$AIC), bold = TRUE, background = "#d4edda")
| Model | AIC | BIC |
|---|---|---|
| Model A: Age only | 1636.61 | 1646.92 |
| Model B: Age + sex + bmi_cat | 1576.49 | 1607.42 |
| Model C:Age + sex + bmi_cat + phys_active + current_smoker | 1579.50 | 1620.74 |
Questions:
Write a brief report (1-2 pages) summarizing your findings:
Submission: Submit your completed R Markdown file and knitted HTML report.