Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/LLCP2023XPT/LLCP2023.XPT") %>%
janitor::clean_names()
# Display dataset dimensions
names(brfss_full)## [1] "state" "fmonth" "idate" "imonth" "iday" "iyear"
## [7] "dispcode" "seqno" "psu" "ctelenm1" "pvtresd1" "colghous"
## [13] "statere1" "celphon1" "ladult1" "numadult" "respslc1" "landsex2"
## [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1" "cellsex2"
## [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1" "landline" "hhadult"
## [31] "sexvar" "genhlth" "physhlth" "menthlth" "poorhlth" "primins1"
## [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
## [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6"
## [49] "bpmeds1" "cholchk3" "toldhi3" "cholmed3" "cvdinfr4" "cvdcrhd4"
## [55] "cvdstrk3" "asthma3" "asthnow" "chcscnc1" "chcocnc1" "chccopd3"
## [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital"
## [67] "educa" "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
## [73] "employ1" "children" "income3" "pregnant" "weight2" "height3"
## [79] "deaf" "blind" "decide" "diffwalk" "diffdres" "diffalon"
## [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3" "ecignow2"
## [91] "alcday4" "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
## [97] "pneuvac4" "shingle2" "hivtst7" "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu" "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"
## [127] "howlong" "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage" "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum"
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry" "cddiscu1" "cdhous1" "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload" "loadulk2" "hasymp1" "hasymp2"
## [193] "hasymp3" "hasymp4" "hasymp5" "hasymp6" "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin"
## [205] "birthsex" "somale" "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat" "marjvape" "marjdab" "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat" "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver" "qstlang"
## [259] "metstat" "urbstat" "mscode" "ststr" "strwt" "rawrake"
## [265] "wt2rake" "imprace" "chispnc" "crace1" "cageg" "cllcpwt"
## [271] "dualuse" "dualcor" "llcpwt2" "llcpwt" "rfhlth" "phys14d"
## [277] "ment14d" "hlthpl1" "hcvu653" "totinda" "metvl12" "metvl22"
## [283] "maxvo21" "fc601" "actin13" "actin23" "padur1" "padur2"
## [289] "pafreq1" "pafreq2" "minac12" "minac22" "strfreq" "pamiss3"
## [295] "pamin13" "pamin23" "pa3min" "pavig13" "pavig23" "pa3vigm"
## [301] "pacat3" "paindx3" "pa150r4" "pa300r4" "pa30023" "pastrng"
## [307] "parec3" "pastae3" "rfhype6" "cholch3" "rfchol3" "michd"
## [313] "ltasth1" "casthm1" "asthms1" "drdxar2" "mrace1" "hispanc"
## [319] "race" "raceg21" "racegr3" "raceprv" "sex" "ageg5yr"
## [325] "age65yr" "age80" "age_g" "htin4" "htm4" "wtkg3"
## [331] "bmi5" "bmi5cat" "rfbmi5" "chldcnt" "educag" "incomg1"
## [337] "smoker3" "rfsmok3" "cureci2" "drnkany6" "drocdy4" "rfbing6"
## [343] "drnkwk2" "rfdrhv8" "flshot7" "pneumo3" "aidtst4" "rfseat2"
## [349] "rfseat3" "drnkdrv"
For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.
# Select variables of interest and create analytic dataset
set.seed(553) # For reproducibility
brfss_subset <- brfss_full %>%
select(
# Outcome: Diabetes status
diabete4,
# Demographics
age_g, # Age category
sex, # Sex
race, # Race/ethnicity
educag, # Education level
incomg1, # Income category
# Health behaviors
bmi5cat, # BMI category
exerany2, # Physical activity
smokday2, # Smoking frequency
# Health status
genhlth, # General health
rfhype6, # High blood pressure
rfchol3 # High cholesterol
) %>%
# Filter to complete cases only
drop_na() %>%
# Sample 2000 observations for manageable analysis
slice_sample(n = 2000)
# Display subset dimensions
cat("Working subset dimensions:",
nrow(brfss_subset), "observations,",
ncol(brfss_subset), "variables\n")## Working subset dimensions: 2000 observations, 12 variables
# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
mutate(
# Outcome: Diabetes (binary)
diabetes = case_when(
diabete4 == 1 ~ 1, # Yes
diabete4 %in% c(2, 3, 4) ~ 0, # No, pre-diabetes, or gestational only
TRUE ~ NA_real_
),
# Age groups
age_group = factor(case_when(
age_g == 1 ~ "18-24",
age_g == 2 ~ "25-34",
age_g == 3 ~ "35-44",
age_g == 4 ~ "45-54",
age_g == 5 ~ "55-64",
age_g == 6 ~ "65+"
), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),
# Age continuous (midpoint of category)
age_cont = case_when(
age_g == 1 ~ 21,
age_g == 2 ~ 29.5,
age_g == 3 ~ 39.5,
age_g == 4 ~ 49.5,
age_g == 5 ~ 59.5,
age_g == 6 ~ 70
),
# Sex
sex = factor(ifelse(sex == 1, "Male", "Female")),
# Race/ethnicity
race = factor(case_when(
race == 1 ~ "White",
race == 2 ~ "Black",
race == 3 ~ "Native American",
race == 4 ~ "Asian",
race == 5 ~ "Native Hawaiian/PI",
race == 6 ~ "Other",
race == 7 ~ "Multiracial",
race == 8 ~ "Hispanic"
)),
# Education (simplified)
education = factor(case_when(
educag == 1 ~ "< High school",
educag == 2 ~ "High school graduate",
educag == 3 ~ "Some college",
educag == 4 ~ "College graduate"
), levels = c("< High school", "High school graduate", "Some college", "College graduate")),
# Income (simplified)
income = factor(case_when(
incomg1 == 1 ~ "< $25,000",
incomg1 == 2 ~ "$25,000-$49,999",
incomg1 == 3 ~ "$50,000-$74,999",
incomg1 == 4 ~ "$75,000+",
incomg1 == 5 ~ "Unknown"
), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),
# BMI category
bmi_cat = factor(case_when(
bmi5cat == 1 ~ "Underweight",
bmi5cat == 2 ~ "Normal",
bmi5cat == 3 ~ "Overweight",
bmi5cat == 4 ~ "Obese"
), levels = c("Underweight", "Normal", "Overweight", "Obese")),
# Physical activity (binary)
phys_active = ifelse(exerany2 == 1, 1, 0),
# Current smoking
current_smoker = case_when(
smokday2 == 1 ~ 1, # Every day
smokday2 == 2 ~ 1, # Some days
smokday2 == 3 ~ 0, # Not at all
TRUE ~ 0
),
# General health (simplified)
gen_health = factor(case_when(
genhlth %in% c(1, 2) ~ "Excellent/Very good",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor"
), levels = c("Excellent/Very good", "Good", "Fair/Poor")),
# Hypertension
hypertension = ifelse(rfhype6 == 2, 1, 0),
# High cholesterol
high_chol = ifelse(rfchol3 == 2, 1, 0)
) %>%
# Select only the clean variables
select(diabetes, age_group, age_cont, sex, race, education, income,
bmi_cat, phys_active, current_smoker, gen_health,
hypertension, high_chol) %>%
# Remove any remaining missing values
drop_na()
# Save the cleaned subset for future use
write_rds(brfss_clean,
"C:/Users/tahia/OneDrive/Desktop/UAlbany PhD/Epi 553/Lab 4/brfss_subset_2023.rds")
cat("Clean dataset saved with", nrow(brfss_clean), "complete observations\n")## Clean dataset saved with 1281 complete observations
# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |
A statistical model is a mathematical representation of the relationship between:
\[f(Y) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon\]
Where:
The choice of regression model depends on the type of outcome variable:
| Outcome Type | Regression Type | Link Function | Example |
|---|---|---|---|
| Continuous | Linear | Identity: Y | Blood pressure, BMI |
| Binary | Logistic | Logit: log(p/(1-p)) | Disease status, mortality |
| Count | Poisson/Negative Binomial | Log: log(Y) | Number of infections |
| Time-to-event | Cox Proportional Hazards | Log: log(h(t)) | Survival time |
Let’s model the relationship between age and diabetes prevalence.
# Simple linear regression: diabetes ~ age
model_linear_simple <- lm(diabetes ~ age_cont, data = brfss_clean)
# Display results
tidy(model_linear_simple, conf.int = TRUE) %>%
kable(caption = "Simple Linear Regression: Diabetes ~ Age",
digits = 4,
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | -0.0632 | 0.0481 | -1.3125 | 0.1896 | -0.1576 | 0.0312 |
| age_cont | 0.0041 | 0.0008 | 5.1368 | 0.0000 | 0.0025 | 0.0056 |
Interpretation:
# Create scatter plot with regression line
p1 <- ggplot(brfss_clean, aes(x = age_cont, y = diabetes)) +
geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(
title = "Relationship Between Age and Diabetes",
subtitle = "Simple Linear Regression",
x = "Age (years)",
y = "Probability of Diabetes"
) +
theme_minimal(base_size = 12)
ggplotly(p1) %>%
layout(hovermode = "closest")Diabetes Prevalence by Age
Problem with linear regression for binary outcomes:
Solution: Logistic Regression
Uses the logit link function to ensure predicted probabilities stay between 0 and 1:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
# Simple logistic regression: diabetes ~ age
model_logistic_simple <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.029 | 0.423 | -8.390 | 0 | 0.012 | 0.064 |
| age_cont | 1.034 | 0.007 | 4.978 | 0 | 1.021 | 1.048 |
Interpretation:
Predicted Diabetes Probability by Age
# Generate predicted probabilities
pred_data <- data.frame(age_cont = seq(18, 80, by = 1))
pred_data$predicted_prob <- predict(model_logistic_simple,
newdata = pred_data,
type = "response")
# Plot
p2 <- ggplot(pred_data, aes(x = age_cont, y = predicted_prob)) +
geom_line(color = "darkred", linewidth = 1.5) +
geom_ribbon(aes(ymin = predicted_prob - 0.02,
ymax = predicted_prob + 0.02),
alpha = 0.2, fill = "darkred") +
labs(
title = "Predicted Probability of Diabetes by Age",
subtitle = "Simple Logistic Regression",
x = "Age (years)",
y = "Predicted Probability of Diabetes"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.6)) +
theme_minimal(base_size = 12)
ggplotly(p2)Predicted Diabetes Probability by Age
A confounder is a variable that:
Example: The relationship between age and diabetes may be confounded by BMI, physical activity, and other factors.
# Multiple logistic regression with potential confounders
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
phys_active + current_smoker + education,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.009 | 1.177 | -4.001 | 0.000 | 0.000 | 0.065 |
| age_cont | 1.041 | 0.007 | 5.515 | 0.000 | 1.027 | 1.057 |
| sexMale | 1.191 | 0.154 | 1.133 | 0.257 | 0.880 | 1.613 |
| bmi_catNormal | 1.971 | 1.052 | 0.645 | 0.519 | 0.378 | 36.309 |
| bmi_catOverweight | 3.155 | 1.044 | 1.101 | 0.271 | 0.621 | 57.679 |
| bmi_catObese | 6.834 | 1.041 | 1.845 | 0.065 | 1.354 | 124.675 |
| phys_active | 0.589 | 0.157 | -3.373 | 0.001 | 0.433 | 0.802 |
| current_smoker | 1.213 | 0.178 | 1.085 | 0.278 | 0.852 | 1.716 |
| educationHigh school graduate | 0.634 | 0.288 | -1.579 | 0.114 | 0.364 | 1.131 |
| educationSome college | 0.542 | 0.294 | -2.081 | 0.037 | 0.307 | 0.977 |
| educationCollege graduate | 0.584 | 0.305 | -1.763 | 0.078 | 0.324 | 1.074 |
Interpretation:
Categorical variables with \(k\) levels are represented using \(k-1\) dummy variables (indicator variables).
Education has 4 levels: 1. < High school (reference category) 2. High school graduate 3. Some college 4. College graduate
R automatically creates 3 dummy variables:
# Extract dummy variable coding
dummy_table <- data.frame(
Education = c("< High school", "High school graduate", "Some college", "College graduate"),
`Dummy 1 (HS grad)` = c(0, 1, 0, 0),
`Dummy 2 (Some college)` = c(0, 0, 1, 0),
`Dummy 3 (College grad)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for Education",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| Education | Dummy 1 (HS grad) | Dummy 2 (Some college) | Dummy 3 (College grad) |
|---|---|---|---|
| < High school | 0 | 0 | 0 |
| High school graduate | 1 | 0 | 0 |
| Some college | 0 | 1 | 0 |
| College graduate | 0 | 0 | 1 |
Reference Category: The category with all zeros (< High school) is the reference group. All other categories are compared to this reference.
# Extract education coefficients
educ_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "education")) %>%
mutate(
education_level = str_remove(term, "education"),
education_level = factor(education_level,
levels = c("High school graduate",
"Some college",
"College graduate"))
)
# Add reference category
ref_row <- data.frame(
term = "education< High school",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
education_level = factor("< High school (Ref)",
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate"))
)
educ_coefs_full <- bind_rows(ref_row, educ_coefs) %>%
mutate(education_level = factor(education_level,
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate")))
# Plot
p3 <- ggplot(educ_coefs_full, aes(x = education_level, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between Education and Diabetes",
subtitle = "Adjusted Odds Ratios (reference: < High school)",
x = "Education Level",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)Odds Ratios for Education Levels
# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
include = c("education"),
variable_labels = c(
education = "Education"),
facet_labeller = ggplot2::label_wrap_gen(10)
)An interaction exists when the effect of one variable on the outcome differs across levels of another variable.
Epidemiologic term: Effect modification
Does the effect of age on diabetes differ between males and females?
# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * sex + bmi_cat + phys_active,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × Sex Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.031 | 0.009 | 3.178 | 0.001 | 1.012 | 1.051 |
| age_cont:sexMale | 1.015 | 0.014 | 1.084 | 0.278 | 0.988 | 1.044 |
Interpretation:
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "sex"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and Sex",
subtitle = "Testing for Age × Sex Interaction",
x = "Age (years)",
y = "Predicted Probability of Diabetes",
color = "Sex",
fill = "Sex"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
scale_fill_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)Age-Diabetes Relationship by Sex
Every regression model makes assumptions about the data. If assumptions are violated, results may be invalid.
Variance Inflation Factor (VIF): Measures how much the variance of a coefficient is inflated due to correlation with other predictors.
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.05 | Low (No concern) |
| current_smoker | current_smoker | 1.05 | Low (No concern) |
| phys_active | phys_active | 1.02 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| education | education | 1.01 | Low (No concern) |
| bmi_cat | bmi_cat | 1.01 | Low (No concern) |
Cook’s Distance: Measures how much the model would change if an observation were removed.
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)Cook’s Distance for Influential Observations
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Use Likelihood Ratio Test to compare nested models:
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial)
# Model 2: Age + Sex
model2 <- glm(diabetes ~ age_cont + sex,
data = brfss_clean,
family = binomial)
# Model 3: Full model
model3 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + Sex",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + Sex | 1175.85 | 1191.32 | 1169.85 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |
Interpretation:
All statistical models include an error term (\(\epsilon\)) to account for:
\[Y = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon\]
Key points:
In this lab, you will:
# YOUR CODE HERE: Create a frequency table of hypertension status
# Summary table by hypertension status
table(brfss_clean$hypertension)##
## 0 1
## 606 675
## [1] 52.69321
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
prev_by_age <- brfss_clean %>%
group_by(age_group) %>%
summarise(
n = n(),
ht_cases = sum(hypertension == 1, na.rm = TRUE),
prevalence = mean(hypertension == 1, na.rm = TRUE) * 100,
.groups = "drop"
)
prev_by_age## # A tibble: 6 × 4
## age_group n ht_cases prevalence
## <fct> <int> <int> <dbl>
## 1 18-24 12 1 8.33
## 2 25-34 77 15 19.5
## 3 35-44 138 42 30.4
## 4 45-54 161 61 37.9
## 5 55-64 266 137 51.5
## 6 65+ 627 419 66.8
Questions:
Ans: The overall prevalence of hypertension is 52.69%.
Ans: As per the prevalence of hypertension by age group, it can be seen that the prevalence of hypertension increases by the age group. the youngest age group has the lowest prevalence rate compared to the older age group.
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_simple1 <- glm(hypertension ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results with odds ratios
tidy(model_logistic_simple1, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
# Create scatter plot with regression line
p1 <- ggplot(model_logistic_simple1, aes(x = age_cont, y = hypertension)) +
geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(
title = "Relationship Between Age and Hypertension",
subtitle = "Simple Linear Regression",
x = "Age (years)",
y = "Odds Ratio of Hypertension"
) +
theme_minimal(base_size = 12)
ggplotly(p1) %>%
layout(hovermode = "closest")Hypertension Prevalence by Age
Questions:
What is the odds ratio for age? Interpret this value. Ans: The odds ratio of 1.034 indicates that for each one-year increase in age, the odds of hypertension increase by approximately 3.4%, holding sex, BMI, physical activity, and smoking constant.
Is the association statistically significant?
Ans:Yes. The association is statistically significant because the 95% confidence interval does not include 1.0, and the p-value is less than 0.05.
Ans: 1.021-1.048, This means we are 95% confident that the true increase in odds of hypertension per one-year increase in age lies between 2.1% and 4.8%.
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple1 <- glm(hypertension ~ age_cont + sex + bmi_cat +
phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple1, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.008 | 0.653 | -7.355 | 0.000 | 0.002 | 0.028 |
| age_cont | 1.061 | 0.005 | 11.234 | 0.000 | 1.050 | 1.073 |
| sexMale | 1.270 | 0.123 | 1.950 | 0.051 | 0.999 | 1.616 |
| bmi_catNormal | 2.097 | 0.546 | 1.356 | 0.175 | 0.759 | 6.756 |
| bmi_catOverweight | 3.241 | 0.543 | 2.166 | 0.030 | 1.183 | 10.385 |
| bmi_catObese | 6.585 | 0.545 | 3.459 | 0.001 | 2.394 | 21.176 |
| phys_active | 0.900 | 0.130 | -0.808 | 0.419 | 0.697 | 1.162 |
| current_smoker | 1.071 | 0.139 | 0.495 | 0.621 | 0.817 | 1.407 |
Questions:
Ans: with out adjusting the confounders, the odds ratio for age was 1.03 and after adjusting the confounders, it is 1.06, indicating not much effect of the counfounders are impacting the association.
Ans:After adjusting the confounder, there is not difference in the odds ratio of age categories. so it can be said that these confounders does nt have much impact of the association of age with hypertension.
And: From the multiple regression moedl, the BMI has the most high number as a predistor of hypertention.
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
# Extract dummy variable coding
dummy_table1 <- data.frame(
BMI = c("Normal", "Overweight", "Obese"),
`Dummy 1 (Overweight)` = c(0, 1, 0),
`Dummy 2 (Obese)` = c(0, 0, 1),
check.names = FALSE
)
dummy_table1 %>%
kable(caption = "Dummy Variable Coding for BMI", align = "lcc") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6")| BMI | Dummy 1 (Overweight) | Dummy 2 (Obese) |
|---|---|---|
| Normal | 0 | 0 |
| Overweight | 1 | 0 |
| Obese | 0 | 1 |
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
# Extract BMI odds ratios
bmi_coefs <- tidy(model_logistic_multiple1, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "^bmi_cat")) %>%
mutate(
bmi_cat = str_remove(term, "^bmi_cat_?"),
bmi_cat = factor(bmi_cat, levels = c("Overweight", "Obese"))
)
# Add reference category
ref_row1 <- data.frame(
term = "bmi_catNormal",
estimate = 1.0,
std.error = NA,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
bmi_cat = factor("Normal (Ref)", levels = c("Normal (Ref)", "Overweight", "Obese"))
)
bmi_coefs_full <- bind_rows(ref_row1, bmi_coefs) %>%
mutate(bmi_cat = factor(bmi_cat, levels = c("Normal (Ref)", "Overweight", "Obese")))
# Plot
p3 <- ggplot(bmi_coefs_full, aes(x = bmi_cat, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high)) +
coord_flip() +
labs(
title = "Association Between BMI and Hypertension",
subtitle = "Adjusted Odds Ratios (reference: Normal)",
x = "BMI",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)Questions:
Ans: Normal
Ans: Compared to individuals with normal BMI, obese individuals have higher odds of hypertension. Specifically, the odds ratio indicates that obese individuals have approximately 6.58 times the odds of having hypertension relative to those with normal BMI, after adjusting for age, sex, physical activity, and smoking status.
# YOUR CODE HERE: Fit a model with Age × BMI interaction
model_interaction1 <- glm(diabetes ~ age_cont * bmi_cat,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction1, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × BMI Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 3.530 | 40.609 | 0.031 | 0.975 | 853637484 | 84628801 |
| age_cont:bmi_catNormal | 0.291 | 40.609 | -0.030 | 0.976 | 0 | 0 |
| age_cont:bmi_catOverweight | 0.296 | 40.609 | -0.030 | 0.976 | 0 | 0 |
| age_cont:bmi_catObese | 0.294 | 40.609 | -0.030 | 0.976 | 0 | 0 |
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
# Reduced model (without interaction)
model_reduced <- glm(
hypertension ~ age_cont + bmi_cat + sex + phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit")
)
# Full model (with Age × BMI interaction)
model_full <- glm(
hypertension ~ age_cont * bmi_cat + sex + phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit")
)
# Perform Likelihood Ratio Test
lrt_result <- anova(model_reduced, model_full, test = "LRT")
# Display clean table
broom::tidy(lrt_result) %>%
knitr::kable(
caption = "Likelihood Ratio Test Comparing Models With and Without Age × BMI Interaction",
digits = 4
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
)| term | df.residual | residual.deviance | df | deviance | p.value |
|---|---|---|---|---|---|
| hypertension ~ age_cont + bmi_cat + sex + phys_active + current_smoker | 1273 | 1563.496 | NA | NA | NA |
| hypertension ~ age_cont * bmi_cat + sex + phys_active + current_smoker | 1270 | 1561.260 | 3 | 2.2363 | 0.5248 |
# Test if the effect of age on hypertension differs by BMI category
pred_interact <- ggpredict(
model_full,
terms = c("age_cont [18:80]", "bmi_cat")
)
# Plot
p4 <- ggplot(pred_interact,
aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high),
alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Hypertension by Age and BMI Category",
subtitle = "Age × BMI Interaction Model",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "BMI Category",
fill = "BMI Category"
) +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
p4Questions:
Ans: No, The likelihood ratio test comparing the model with and without the Age × BMI interaction yielded a p-value of 0.5248, which is greater than 0.05. Therefore, the interaction term is not statistically significant.
Ans: In epidemiologic terms, this indicates no evidence of effect modification by BMI category.
# YOUR CODE HERE: Calculate VIF for your multiple regression model
# Calculate VIF
vif_values <- car::vif(model_logistic_multiple1)
# Handle categorical variables (GVIF adjustment)
if (is.matrix(vif_values)) {
vif_df <- data.frame(
Variable = rownames(vif_values),
GVIF = vif_values[, "GVIF"],
Df = vif_values[, "Df"],
Adjusted_VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
vif_df <- data.frame(
Variable = names(vif_values),
Adjusted_VIF = as.numeric(vif_values)
)
}
# Display table
knitr::kable(
vif_df,
caption = "Variance Inflation Factors (VIF) for Hypertension Logistic Model",
digits = 3
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
)| Variable | GVIF | Df | Adjusted_VIF | |
|---|---|---|---|---|
| age_cont | age_cont | 1.127 | 1 | 1.061 |
| sex | sex | 1.017 | 1 | 1.008 |
| bmi_cat | bmi_cat | 1.103 | 3 | 1.016 |
| phys_active | phys_active | 1.025 | 1 | 1.012 |
| current_smoker | current_smoker | 1.074 | 1 | 1.036 |
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple1)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
)
# Common threshold
threshold <- 4 / nrow(brfss_clean)
influence_df <- influence_df %>%
dplyr::mutate(
influential = ifelse(cooks_d > threshold, "Yes", "No")
)
# Plot
p_cook <- ggplot(influence_df, aes(x = observation, y = cooks_d)) +
geom_point(aes(color = influential), alpha = 0.6) +
geom_hline(yintercept = threshold, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance Plot",
subtitle = paste("Threshold =", round(threshold, 4)),
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
p_cook## [1] 22
Questions:
Ans: No. All adjusted VIF values are close to 1 and well below the common threshold of 5, indicating no evidence of problematic multicollinearity among the predictors.
Ans: A small number of observations exceed the Cook’s distance threshold (4/n), suggesting potential mild influence.
Ans: If serious multicollinearity were present, I would consider removing or combining correlated variables.
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
model_A <- glm(
hypertension ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit")
)
# Model B: Age + sex + bmi_cat
model_B <- glm(
hypertension ~ age_cont + sex + bmi_cat,
data = brfss_clean,
family = binomial(link = "logit")
)
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
model_C <- glm(
hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit")
)
# Create comparison table
model_comp <- tibble::tibble(
Model = c("Model A: Age only",
"Model B: Age + sex + BMI",
"Model C: Age + sex + BMI + activity + smoking"),
AIC = c(AIC(model_A), AIC(model_B), AIC(model_C)),
BIC = c(BIC(model_A), BIC(model_B), BIC(model_C)),
Deviance = c(deviance(model_A), deviance(model_B), deviance(model_C))
) %>%
dplyr::arrange(AIC)
# Display table
model_comp %>%
knitr::kable(
caption = "Model Comparison for Hypertension (Lower AIC/BIC = Better Fit)",
digits = 2
) %>%
kableExtra::kable_styling(
bootstrap_options = c("striped", "hover"),
full_width = FALSE
)| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model B: Age + sex + BMI | 1576.49 | 1607.42 | 1564.49 |
| Model C: Age + sex + BMI + activity + smoking | 1579.50 | 1620.74 | 1563.50 |
| Model A: Age only | 1636.61 | 1646.92 | 1632.61 |
Questions:
Ans: Model C (Age + Sex + BMI + Physical Activity + Smoking) has the lowest AIC and BIC, indicating the best balance between model fit and model complexity.
Ans:There is no evidence of problematic multicollinearity among predictors in the multiple logistic regression model.
Ans: I would choose Model C for the final analysis because it provides the best model fit based on AIC and BIC while also adjusting for important confounders such as BMI, physical activity, and smoking.
Write a brief report (1-2 pages) summarizing your findings:
Submission: Submit your completed R Markdown file and knitted HTML report.
The goal of this analysis was to examine factors associated with hypertension among adults using BRFSS 2023 data. Specifically, I wanted to assess whether age, sex, BMI, physical activity, and smoking are related to hypertension. I also tested whether the effect of age on hypertension differs across BMI categories (i.e., effect modification).
Hypertension (yes/no) was the binary outcome. I used logistic regression to estimate odds ratios (ORs) and 95% confidence intervals (CIs).
Three models were compared:
Model A: Age only
Model B: Age + Sex + BMI
Model C: Age + Sex + BMI + Physical Activity + Smoking
Model fit was evaluated using AIC and BIC. I tested effect modification by including an Age × BMI interaction term and performing a likelihood ratio test (LRT). Model diagnostics included variance inflation factors (VIF) for multicollinearity and Cook’s distance for influential observations.
The overall prevalence of hypertension in the sample was about 53%.
Age was significantly associated with hypertension (OR = 1.034, 95% CI: 1.021–1.048), meaning each additional year of age increased the odds of hypertension by about 3.4%. BMI showed a strong association, with obese individuals having substantially higher odds of hypertension compared to those with normal BMI. Physical activity appeared protective. Smoking showed weaker associations after adjustment.
The Age × BMI interaction was not statistically significant (LRT p = 0.525), indicating no evidence that the age–hypertension relationship differs by BMI category.
Model C had the lowest AIC and BIC, suggesting the best overall fit.
Diagnostic checks showed no multicollinearity concerns (all VIFs close to 1). Although a few observations exceeded the Cook’s distance threshold, none appeared highly influential.
Age and BMI are the strongest predictors of hypertension in this analysis. The risk of hypertension increases steadily with age and is much higher among overweight and obese individuals. The lack of a significant interaction suggests that the age-related increase in hypertension is fairly consistent across BMI categories.
This analysis is cross-sectional, so causality cannot be established. Hypertension and other variables are self-reported, which may introduce misclassification. In addition, survey weights were not applied, which may limit generalizability to the U.S. population.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.12.0 gtsummary_2.5.0 ggeffects_2.3.2 car_3.1-3
## [5] carData_3.0-5 broom_1.0.11 plotly_4.12.0 kableExtra_1.4.0
## [9] knitr_1.51 haven_2.5.5 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1 readr_2.1.6
## [17] tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.9.0
## [4] htmlwidgets_1.6.4 insight_1.4.4 lattice_0.22-7
## [7] tzdb_0.5.0 crosstalk_1.2.2 vctrs_0.7.1
## [10] tools_4.5.2 generics_0.1.4 datawizard_1.3.0
## [13] pkgconfig_2.0.3 Matrix_1.7-4 data.table_1.18.0
## [16] RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.5
## [19] compiler_4.5.2 farver_2.1.2 textshaping_1.0.4
## [22] janitor_2.2.1 codetools_0.2-20 snakecase_0.11.1
## [25] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12
## [28] lazyeval_0.2.2 Formula_1.2-5 pillar_1.11.1
## [31] jquerylib_0.1.4 broom.helpers_1.22.0 cachem_1.1.0
## [34] abind_1.4-8 nlme_3.1-168 tidyselect_1.2.1
## [37] digest_0.6.39 stringi_1.8.7 labeling_0.4.3
## [40] splines_4.5.2 labelled_2.16.0 fastmap_1.2.0
## [43] grid_4.5.2 cli_3.6.5 magrittr_2.0.4
## [46] cards_0.7.1 utf8_1.2.6 withr_3.0.2
## [49] scales_1.4.0 backports_1.5.0 timechange_0.3.0
## [52] rmarkdown_2.30 httr_1.4.7 otel_0.2.0
## [55] hms_1.1.4 evaluate_1.0.5 viridisLite_0.4.2
## [58] mgcv_1.9-3 rlang_1.1.7 glue_1.8.0
## [61] xml2_1.5.2 svglite_2.2.2 rstudioapi_0.18.0
## [64] jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1