Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/brfss_2023.XPT") %>%
janitor::clean_names()
# Display dataset dimensions
names(brfss_full)## [1] "state" "fmonth" "idate" "imonth" "iday" "iyear"
## [7] "dispcode" "seqno" "psu" "ctelenm1" "pvtresd1" "colghous"
## [13] "statere1" "celphon1" "ladult1" "numadult" "respslc1" "landsex2"
## [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1" "cellsex2"
## [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1" "landline" "hhadult"
## [31] "sexvar" "genhlth" "physhlth" "menthlth" "poorhlth" "primins1"
## [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
## [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6"
## [49] "bpmeds1" "cholchk3" "toldhi3" "cholmed3" "cvdinfr4" "cvdcrhd4"
## [55] "cvdstrk3" "asthma3" "asthnow" "chcscnc1" "chcocnc1" "chccopd3"
## [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital"
## [67] "educa" "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
## [73] "employ1" "children" "income3" "pregnant" "weight2" "height3"
## [79] "deaf" "blind" "decide" "diffwalk" "diffdres" "diffalon"
## [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3" "ecignow2"
## [91] "alcday4" "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
## [97] "pneuvac4" "shingle2" "hivtst7" "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu" "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"
## [127] "howlong" "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage" "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum"
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry" "cddiscu1" "cdhous1" "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload" "loadulk2" "hasymp1" "hasymp2"
## [193] "hasymp3" "hasymp4" "hasymp5" "hasymp6" "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin"
## [205] "birthsex" "somale" "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat" "marjvape" "marjdab" "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat" "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver" "qstlang"
## [259] "metstat" "urbstat" "mscode" "ststr" "strwt" "rawrake"
## [265] "wt2rake" "imprace" "chispnc" "crace1" "cageg" "cllcpwt"
## [271] "dualuse" "dualcor" "llcpwt2" "llcpwt" "rfhlth" "phys14d"
## [277] "ment14d" "hlthpl1" "hcvu653" "totinda" "metvl12" "metvl22"
## [283] "maxvo21" "fc601" "actin13" "actin23" "padur1" "padur2"
## [289] "pafreq1" "pafreq2" "minac12" "minac22" "strfreq" "pamiss3"
## [295] "pamin13" "pamin23" "pa3min" "pavig13" "pavig23" "pa3vigm"
## [301] "pacat3" "paindx3" "pa150r4" "pa300r4" "pa30023" "pastrng"
## [307] "parec3" "pastae3" "rfhype6" "cholch3" "rfchol3" "michd"
## [313] "ltasth1" "casthm1" "asthms1" "drdxar2" "mrace1" "hispanc"
## [319] "race" "raceg21" "racegr3" "raceprv" "sex" "ageg5yr"
## [325] "age65yr" "age80" "age_g" "htin4" "htm4" "wtkg3"
## [331] "bmi5" "bmi5cat" "rfbmi5" "chldcnt" "educag" "incomg1"
## [337] "smoker3" "rfsmok3" "cureci2" "drnkany6" "drocdy4" "rfbing6"
## [343] "drnkwk2" "rfdrhv8" "flshot7" "pneumo3" "aidtst4" "rfseat2"
## [349] "rfseat3" "drnkdrv"
For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.
# Select variables of interest and create analytic dataset
set.seed(553) # For reproducibility
brfss_subset <- brfss_full %>%
select(
# Outcome: Diabetes status
diabete4,
# Demographics
age_g, # Age category
sex, # Sex
race, # Race/ethnicity
educag, # Education level
incomg1, # Income category
# Health behaviors
bmi5cat, # BMI category
exerany2, # Physical activity
smokday2, # Smoking frequency
# Health status
genhlth, # General health
rfhype6, # High blood pressure
rfchol3 # High cholesterol
) %>%
# Filter to complete cases only
drop_na() %>%
# Sample 2000 observations for manageable analysis
slice_sample(n = 2000)
# Display subset dimensions
cat("Working subset dimensions:",
nrow(brfss_subset), "observations,",
ncol(brfss_subset), "variables\n")## Working subset dimensions: 2000 observations, 12 variables
# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
mutate(
# Outcome: Diabetes (binary)
diabetes = case_when(
diabete4 == 1 ~ 1, # Yes
diabete4 %in% c(2, 3, 4) ~ 0, # No, pre-diabetes, or gestational only
TRUE ~ NA_real_
),
# Age groups
age_group = factor(case_when(
age_g == 1 ~ "18-24",
age_g == 2 ~ "25-34",
age_g == 3 ~ "35-44",
age_g == 4 ~ "45-54",
age_g == 5 ~ "55-64",
age_g == 6 ~ "65+"
), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),
# Age continuous (midpoint of category)
age_cont = case_when(
age_g == 1 ~ 21,
age_g == 2 ~ 29.5,
age_g == 3 ~ 39.5,
age_g == 4 ~ 49.5,
age_g == 5 ~ 59.5,
age_g == 6 ~ 70
),
# Sex
sex = factor(ifelse(sex == 1, "Male", "Female")),
# Race/ethnicity
race = factor(case_when(
race == 1 ~ "White",
race == 2 ~ "Black",
race == 3 ~ "Native American",
race == 4 ~ "Asian",
race == 5 ~ "Native Hawaiian/PI",
race == 6 ~ "Other",
race == 7 ~ "Multiracial",
race == 8 ~ "Hispanic"
)),
# Education (simplified)
education = factor(case_when(
educag == 1 ~ "< High school",
educag == 2 ~ "High school graduate",
educag == 3 ~ "Some college",
educag == 4 ~ "College graduate"
), levels = c("< High school", "High school graduate", "Some college", "College graduate")),
# Income (simplified)
income = factor(case_when(
incomg1 == 1 ~ "< $25,000",
incomg1 == 2 ~ "$25,000-$49,999",
incomg1 == 3 ~ "$50,000-$74,999",
incomg1 == 4 ~ "$75,000+",
incomg1 == 5 ~ "Unknown"
), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),
# BMI category
bmi_cat = factor(case_when(
bmi5cat == 1 ~ "Underweight",
bmi5cat == 2 ~ "Normal",
bmi5cat == 3 ~ "Overweight",
bmi5cat == 4 ~ "Obese"
), levels = c("Underweight", "Normal", "Overweight", "Obese")),
# Physical activity (binary)
phys_active = ifelse(exerany2 == 1, 1, 0),
# Current smoking
current_smoker = case_when(
smokday2 == 1 ~ 1, # Every day
smokday2 == 2 ~ 1, # Some days
smokday2 == 3 ~ 0, # Not at all
TRUE ~ 0
),
# General health (simplified)
gen_health = factor(case_when(
genhlth %in% c(1, 2) ~ "Excellent/Very good",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor"
), levels = c("Excellent/Very good", "Good", "Fair/Poor")),
# Hypertension
hypertension = ifelse(rfhype6 == 2, 1, 0),
# High cholesterol
high_chol = ifelse(rfchol3 == 2, 1, 0)
) %>%
# Select only the clean variables
select(diabetes, age_group, age_cont, sex, race, education, income,
bmi_cat, phys_active, current_smoker, gen_health,
hypertension, high_chol) %>%
# Remove any remaining missing values
drop_na()
# Save the cleaned subset for future use
write_rds(brfss_clean,
"C:/Users/graci/OneDrive/Documents/UA GRAD SCHOOL/2nd Semester/EPI553/brfss_2023_clean.rds")
cat("Clean dataset saved with", nrow(brfss_clean), "complete observations\n")## Clean dataset saved with 1281 complete observations
# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(hypertension) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(hypertension = ifelse(hypertension == 1, "Hypertension", "No Hypertension"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| hypertension | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Hypertension | 606 | 54.5 | 46.2 | 30.7 | 69.3 | 32.8 | 0 | 30.9 |
| Hypertension | 675 | 63.1 | 53.2 | 45.6 | 64.1 | 25.6 | 100 | 61.3 |
In this lab, you will:
##
## 0 1
## 606 675
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
brfss_clean %>%
group_by(age_group) %>%
summarise(
prevalence = mean(hypertension, na.rm = TRUE),
n = n()
)## # A tibble: 6 × 3
## age_group prevalence n
## <fct> <dbl> <int>
## 1 18-24 0.0833 12
## 2 25-34 0.195 77
## 3 35-44 0.304 138
## 4 45-54 0.379 161
## 5 55-64 0.515 266
## 6 65+ 0.668 627
Questions:
The overall prevalence of hypertension in this data set is 52.7%.
The prevalence of hypertension increases as age increases. This shows us that hypertension is strongly associated with aging. Older adults are more likely to have hypertension compared to younger adults.
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_regression <- glm(hypertension ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results with odds ratios
tidy(model_logistic_regression, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
Questions:
The odds ratio for age is 1.055. For each 1-year increase in age, the odds of having hypertension increases by 5.5%.
The association between hypertension and age is statistically significant. The p-value is less than 0.05. We can reject the null hypothesis.
The 95% confidence interval for the odds ratio is 1.045-1.065. We are 95% confident that the true odds ratio for age lies between 1.045 and 1.065. The confidence interval does not cross 1, which shows a positive significant association between age and hypertension.
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple <- glm(hypertension ~ age_cont + sex + bmi_cat +
phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: hypertension ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.008 | 0.653 | -7.355 | 0.000 | 0.002 | 0.028 |
| age_cont | 1.061 | 0.005 | 11.234 | 0.000 | 1.050 | 1.073 |
| sexMale | 1.270 | 0.123 | 1.950 | 0.051 | 0.999 | 1.616 |
| bmi_catNormal | 2.097 | 0.546 | 1.356 | 0.175 | 0.759 | 6.756 |
| bmi_catOverweight | 3.241 | 0.543 | 2.166 | 0.030 | 1.183 | 10.385 |
| bmi_catObese | 6.585 | 0.545 | 3.459 | 0.001 | 2.394 | 21.176 |
| phys_active | 0.900 | 0.130 | -0.808 | 0.419 | 0.697 | 1.162 |
| current_smoker | 1.071 | 0.139 | 0.495 | 0.621 | 0.817 | 1.407 |
Questions:
How did the odds ratio for age change after adjusting for other variables? After adjusting for other variables, odds ratio increases from 1.055 to 1.061. This shows us that confounding factors negatively play a role in the association between hypertension and age, meaning the adjusted model underestimated the true association between age and hypertension.
What does this suggest about confounding? The increase in Odds Ratio after adjustment suggests that BMI, physical activity, sex, or smoking were negatively confounding the association between age and hypertension. These variables masked part of the true effect of age in the unadjusted model.
Which variables are the strongest predictors of hypertension? The strongest predictors of hypertension are age and obese BMI. Age and BMI had a p-value less than 0.05, making them statistically significant compared to the other confounding factors.
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
dummy_table <- data.frame(
bmi_cat = c("underweight","normal weight", "overweight", "obese"),
`Dummy 1 (normal weight)` = c(0, 1, 0, 0),
`Dummy 2 (overweight)` = c(0, 0, 1, 0),
`Dummy 3 (obese)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for Education",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| bmi_cat | Dummy 1 (normal weight) | Dummy 2 (overweight) | Dummy 3 (obese) |
|---|---|---|---|
| underweight | 0 | 0 | 0 |
| normal weight | 1 | 0 | 0 |
| overweight | 0 | 1 | 0 |
| obese | 0 | 0 | 1 |
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
bmi_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "bmi_cat")) %>%
mutate(
bmi_level = str_remove(term, "bmi_cat"),
bmi_level = factor(bmi_level,
levels = c("underweight","normal weight", "overweight", "obese"))
)
# Add reference category
ref_row <- data.frame(
term = "bmi_catUnderweight",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
bmi_level = factor("Underweight (Ref)",
levels = c("Underweight (Ref)", "Normal", "Overweight", "Obese"))
)
bmi_coefs_full <- bind_rows(ref_row, bmi_coefs) %>%
mutate(bmi_level = factor(bmi_level,
levels = c("Underweight (Ref)", "Normal", "Overweight", "Obese")))
# Plot
p3 <- ggplot(bmi_coefs_full, aes(x = bmi_level, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between BMI and Hypertension",
subtitle = "Adjusted Odds Ratios (reference: Underweight)",
x = "BMI Level",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
include = c("bmi_cat"),
variable_labels = c(
bmi_cat = "bmi_cat"),
facet_labeller = ggplot2::label_wrap_gen(10)
)Questions:
The reference category for BMI is Underweight.
The odds ratio for the obese category is approximately 6 compared to the underweight reference group, with a wide confidence interval that crosses 1. In addition, the p-value is greater than 0.05 with a value of 0.062, making it not statistically significant.
Individuals who are obese may have a higher chance of experiencing hypertension compared to those who are underweight. The estimate indicates the odds could be about 6 times greater, but we are not certain, so we can’t say for sure that this difference really matters in clinical practice.
# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × BMI Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.004 | 0.042 | 0.102 | 0.918 | 0.929 | 1.108 |
| age_cont:bmi_catNormal | 1.058 | 0.043 | 1.306 | 0.192 | 0.957 | 1.147 |
| age_cont:bmi_catOverweight | 1.063 | 0.043 | 1.423 | 0.155 | 0.962 | 1.151 |
| age_cont:bmi_catObese | 1.054 | 0.042 | 1.232 | 0.218 | 0.954 | 1.140 |
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
# Model without interaction
model_no_interaction <- glm(hypertension ~ age_cont + bmi_cat,
data = brfss_clean,
family = binomial(link = "logit"))
# Model with interaction
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
data = brfss_clean,
family = binomial(link = "logit"))
# Likelihood ratio test
anova(model_no_interaction, model_interaction, test = "Chisq")## Analysis of Deviance Table
##
## Model 1: hypertension ~ age_cont + bmi_cat
## Model 2: hypertension ~ age_cont * bmi_cat
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1276 1568.1
## 2 1273 1566.1 3 1.9645 0.5798
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(
caption = "Age × BMI Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")
) %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.004 | 0.042 | 0.102 | 0.918 | 0.929 | 1.108 |
| age_cont:bmi_catNormal | 1.058 | 0.043 | 1.306 | 0.192 | 0.957 | 1.147 |
| age_cont:bmi_catOverweight | 1.063 | 0.043 | 1.423 | 0.155 | 0.962 | 1.151 |
| age_cont:bmi_catObese | 1.054 | 0.042 | 1.232 | 0.218 | 0.954 | 1.140 |
Questions:
No, the interaction between age and BMI is not statistically significant. All of the p-values are greater than 0.05, making none of them statistically significant.
# Generate predicted probabilities by age and BMI
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "bmi_cat"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Hypertension by Age and BMI",
subtitle = "Testing for Age × BMI Interaction",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "bmi_cat",
fill = "bmi_cat"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("underweight" = "#E64B35", "normal" = "#4DBBD5")) +
scale_fill_manual(values = c("overweight" = "#E64B35", "obese" = "#4DBBD5")) +
scale_fill_manual(values = c(
"Underweight" = "#E64B35",
"Normal" = "#4DBBD5",
"Overweight" = "#00A087",
"Obese" = "#3C5488"
)) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)# YOUR CODE HERE: Calculate VIF for your multiple regression model
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.06 | Low (No concern) |
| current_smoker | current_smoker | 1.04 | Low (No concern) |
| bmi_cat | bmi_cat | 1.02 | Low (No concern) |
| phys_active | phys_active | 1.01 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)Questions:
All VIF values suggest that there is no evidence of multicollinearity.
There are no influential observations that might be affecting the results.
Introduction The goal of this analysis was to examine whether BMI and age are associated with hypertension among U.S. adults. There has been extensive research done showing that BMI and age are well established predictors of elevated blood pressure. This analysis uses data from the Behavioral Risk factor Surveillance System (BRFSS) 2023, to analyze the association between age, BMI, and hypertension while accounting for potential confounding factors. Methods The analysis used a subset of BRFSS 2023 survey data consisting of 1,281 randomly sampled participants with completed information on the relevant variables. Hypertension was treated as a binary variable, age was analyzed as a continuous variable, and BMI was analyzed as a categorical variable. A series of logistic regression models were fitted to estimate odds ratio and 95% confidence intervals for the association between predictors and hypertension. The analytics approach included the following: 1. A simple logistic regression model: assessing the unadjusted association between age and hypertension. 2. A multiple logistic regression model: adjusting for sex, BMI, physical activity, and smoking status. 3. An interaction mode: testing whether the association between age and hypertension differed by BMI category. Model Diagnostics Included: 1. Variance inflation factors(VIFs) to assess multicollinearity 2. Cooks distance 3. Model fit using AIC, BIC,and likelihood ratio tests Results The results showed that hypertension prevalence increased steadily across age groups, older adults experienced higher rates of hypertension compared to younger adults. Individuals who experienced obesity also experienced higher rates of hypertension relative to those with lower BMI. These patterns are consistent with the current research literature. In the unadjusted logistic regression model, age was significantly associated with hypertension (OR= 1.055; 95% CI: 1.045-1.065), indicating that each additional year of age was associated with a 5.5% increase in the odds of hypertension. After adjusting for BMI, age, physical activity, and smoking status, the association between age and hypertension remained statistically significant with a slight increase in OR from 1.055 to 1.061. There was no statistically significant interaction observed between age and BMI category, indicating that the effect of age on hypertension did not differ meaningfully across BMI categories. The model diagnostics indicated no concerns regarding multicollinearity and no influential observations based on cook’s distance. Model comparison results favored model 2, which demonstrated the best fit based on AIC and BIC. Interpretation The results indicate that both age and BMI are important predictors of hypertension, even after adjusting for behavioral and demographic covariates. There is a strong and consistent association between age and hypertension, showing the biological aging process. Individuals in higher BMI categories, particularly obese, showed higher odds of hypertension, however, estimates were imprecise with wide confidence intervals, indicating uncertainty in the strength of this association. Limitations When conducting any type of research, you should always consider the limitations of your study. The first limitation to the study is that BRFSS data relies on self-reported data, which may introduce misclassification or recall bias. Second, there were additional confounding factors that should be considered like diet, medication use, or genetics. Finally, we did not use complex survey designs for the BRFSS data, which would include weighted sampling, clustering, or stratification. This may have an effect on generalizability of the results and lead to bias in the data.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.12.0 gtsummary_2.5.0 ggeffects_2.3.2 car_3.1-3
## [5] carData_3.0-5 broom_1.0.11 plotly_4.12.0 kableExtra_1.4.0
## [9] knitr_1.51 haven_2.5.5 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.6.0 dplyr_1.1.4 purrr_1.2.1 readr_2.1.6
## [17] tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.9.0
## [4] htmlwidgets_1.6.4 insight_1.4.6 tzdb_0.5.0
## [7] vctrs_0.6.5 tools_4.5.1 crosstalk_1.2.2
## [10] generics_0.1.4 datawizard_1.3.0 pkgconfig_2.0.3
## [13] data.table_1.18.0 RColorBrewer_1.1-3 S7_0.2.0
## [16] lifecycle_1.0.5 compiler_4.5.1 farver_2.1.2
## [19] textshaping_1.0.4 htmltools_0.5.9 sass_0.4.10
## [22] yaml_2.3.12 lazyeval_0.2.2 Formula_1.2-5
## [25] pillar_1.11.1 jquerylib_0.1.4 broom.helpers_1.22.0
## [28] cachem_1.1.0 abind_1.4-8 tidyselect_1.2.1
## [31] digest_0.6.39 stringi_1.8.7 labeling_0.4.3
## [34] labelled_2.16.0 fastmap_1.2.0 grid_4.5.1
## [37] cli_3.6.5 magrittr_2.0.4 cards_0.7.1
## [40] utf8_1.2.6 withr_3.0.2 scales_1.4.0
## [43] backports_1.5.0 timechange_0.3.0 rmarkdown_2.30
## [46] httr_1.4.7 otel_0.2.0 hms_1.1.4
## [49] evaluate_1.0.5 viridisLite_0.4.2 rlang_1.1.6
## [52] glue_1.8.0 xml2_1.5.1 svglite_2.2.2
## [55] rstudioapi_0.18.0 jsonlite_2.0.0 R6_2.6.1
## [58] systemfonts_1.3.1