Statistical modeling is a fundamental tool in epidemiology that allows us to:

- Quantify associations between exposures and health outcomes
- Adjust for potential confounders
- Assess effect modification (interaction)
- Predict outcomes from individual-level characteristics
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("/Users/mm992584/Library/CloudStorage/OneDrive-UniversityatAlbany-SUNY/Spring 2026/Epi 553/Lectures Notes/Modeling/LLCP2023.XPT") %>%
janitor::clean_names()
# Display variable names in the full dataset
names(brfss_full)
## [1] "state" "fmonth" "idate" "imonth" "iday" "iyear"
## [7] "dispcode" "seqno" "psu" "ctelenm1" "pvtresd1" "colghous"
## [13] "statere1" "celphon1" "ladult1" "numadult" "respslc1" "landsex2"
## [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1" "cellsex2"
## [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1" "landline" "hhadult"
## [31] "sexvar" "genhlth" "physhlth" "menthlth" "poorhlth" "primins1"
## [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
## [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6"
## [49] "bpmeds1" "cholchk3" "toldhi3" "cholmed3" "cvdinfr4" "cvdcrhd4"
## [55] "cvdstrk3" "asthma3" "asthnow" "chcscnc1" "chcocnc1" "chccopd3"
## [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital"
## [67] "educa" "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
## [73] "employ1" "children" "income3" "pregnant" "weight2" "height3"
## [79] "deaf" "blind" "decide" "diffwalk" "diffdres" "diffalon"
## [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3" "ecignow2"
## [91] "alcday4" "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
## [97] "pneuvac4" "shingle2" "hivtst7" "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu" "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"
## [127] "howlong" "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage" "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum"
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry" "cddiscu1" "cdhous1" "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload" "loadulk2" "hasymp1" "hasymp2"
## [193] "hasymp3" "hasymp4" "hasymp5" "hasymp6" "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin"
## [205] "birthsex" "somale" "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat" "marjvape" "marjdab" "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat" "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver" "qstlang"
## [259] "metstat" "urbstat" "mscode" "ststr" "strwt" "rawrake"
## [265] "wt2rake" "imprace" "chispnc" "crace1" "cageg" "cllcpwt"
## [271] "dualuse" "dualcor" "llcpwt2" "llcpwt" "rfhlth" "phys14d"
## [277] "ment14d" "hlthpl1" "hcvu653" "totinda" "metvl12" "metvl22"
## [283] "maxvo21" "fc601" "actin13" "actin23" "padur1" "padur2"
## [289] "pafreq1" "pafreq2" "minac12" "minac22" "strfreq" "pamiss3"
## [295] "pamin13" "pamin23" "pa3min" "pavig13" "pavig23" "pa3vigm"
## [301] "pacat3" "paindx3" "pa150r4" "pa300r4" "pa30023" "pastrng"
## [307] "parec3" "pastae3" "rfhype6" "cholch3" "rfchol3" "michd"
## [313] "ltasth1" "casthm1" "asthms1" "drdxar2" "mrace1" "hispanc"
## [319] "race" "raceg21" "racegr3" "raceprv" "sex" "ageg5yr"
## [325] "age65yr" "age80" "age_g" "htin4" "htm4" "wtkg3"
## [331] "bmi5" "bmi5cat" "rfbmi5" "chldcnt" "educag" "incomg1"
## [337] "smoker3" "rfsmok3" "cureci2" "drnkany6" "drocdy4" "rfbing6"
## [343] "drnkwk2" "rfdrhv8" "flshot7" "pneumo3" "aidtst4" "rfseat2"
## [349] "rfseat3" "drnkdrv"
For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.
# Select variables of interest and create analytic dataset
set.seed(553) # For reproducibility
brfss_subset <- brfss_full %>%
select(
# Outcome: Diabetes status
diabete4,
# Demographics
age_g, # Age category
sex, # Sex
race, # Race/ethnicity
educag, # Education level
incomg1, # Income category
# Health behaviors
bmi5cat, # BMI category
exerany2, # Physical activity
smokday2, # Smoking frequency
# Health status
genhlth, # General health
rfhype6, # High blood pressure
rfchol3 # High cholesterol
) %>%
# Filter to complete cases only
drop_na() %>%
# Sample 2000 observations for manageable analysis
slice_sample(n = 2000)
# Display subset dimensions
cat("Working subset dimensions:",
nrow(brfss_subset), "observations,",
ncol(brfss_subset), "variables\n")
## Working subset dimensions: 2000 observations, 12 variables
# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
mutate(
# Outcome: Diabetes (binary)
diabetes = case_when(
diabete4 == 1 ~ 1, # Yes
diabete4 %in% c(2, 3, 4) ~ 0, # No, pre-diabetes, or gestational only
TRUE ~ NA_real_
),
# Age groups
age_group = factor(case_when(
age_g == 1 ~ "18-24",
age_g == 2 ~ "25-34",
age_g == 3 ~ "35-44",
age_g == 4 ~ "45-54",
age_g == 5 ~ "55-64",
age_g == 6 ~ "65+"
), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),
# Age continuous (midpoint of category)
age_cont = case_when(
age_g == 1 ~ 21,
age_g == 2 ~ 29.5,
age_g == 3 ~ 39.5,
age_g == 4 ~ 49.5,
age_g == 5 ~ 59.5,
age_g == 6 ~ 70
),
# Sex
sex = factor(ifelse(sex == 1, "Male", "Female")),
# Race/ethnicity
race = factor(case_when(
race == 1 ~ "White",
race == 2 ~ "Black",
race == 3 ~ "Native American",
race == 4 ~ "Asian",
race == 5 ~ "Native Hawaiian/PI",
race == 6 ~ "Other",
race == 7 ~ "Multiracial",
race == 8 ~ "Hispanic"
)),
# Education (simplified)
education = factor(case_when(
educag == 1 ~ "< High school",
educag == 2 ~ "High school graduate",
educag == 3 ~ "Some college",
educag == 4 ~ "College graduate"
), levels = c("< High school", "High school graduate", "Some college", "College graduate")),
# Income (simplified)
income = factor(case_when(
incomg1 == 1 ~ "< $25,000",
incomg1 == 2 ~ "$25,000-$49,999",
incomg1 == 3 ~ "$50,000-$74,999",
incomg1 == 4 ~ "$75,000+",
incomg1 == 5 ~ "Unknown"
), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),
# BMI category
bmi_cat = factor(case_when(
bmi5cat == 1 ~ "Underweight",
bmi5cat == 2 ~ "Normal",
bmi5cat == 3 ~ "Overweight",
bmi5cat == 4 ~ "Obese"
), levels = c("Underweight", "Normal", "Overweight", "Obese")),
# Physical activity (binary)
phys_active = ifelse(exerany2 == 1, 1, 0),
# Current smoking
current_smoker = case_when(
smokday2 == 1 ~ 1, # Every day
smokday2 == 2 ~ 1, # Some days
smokday2 == 3 ~ 0, # Not at all
TRUE ~ 0
),
# General health (simplified)
gen_health = factor(case_when(
genhlth %in% c(1, 2) ~ "Excellent/Very good",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor"
), levels = c("Excellent/Very good", "Good", "Fair/Poor")),
# Hypertension (_RFHYPE6 coding: 1 = No, 2 = Yes)
hypertension = ifelse(rfhype6 == 2, 1, 0),
# High cholesterol (_RFCHOL3 coding: 1 = No, 2 = Yes)
high_chol = ifelse(rfchol3 == 2, 1, 0)
) %>%
# Select only the clean variables
select(diabetes, age_group, age_cont, sex, race, education, income,
bmi_cat, phys_active, current_smoker, gen_health,
hypertension, high_chol) %>%
# Remove any remaining missing values
drop_na()
# Save the cleaned subset for future use
write_rds(brfss_clean,
"/Users/mm992584/Library/CloudStorage/OneDrive-UniversityatAlbany-SUNY/Spring 2026/Epi 553/Lectures Notes/Modeling/brfss_subset_2023.rds")
cat("Clean dataset saved with", nrow(brfss_clean), "complete observations\n")## Clean dataset saved with 1281 complete observations
# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)
| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 52.3 | 57.0 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 22.8 | 32.5 |
A statistical model is a mathematical representation of the relationship between an outcome (dependent variable) \(Y\) and one or more predictors (independent variables) \(X_1, X_2, \ldots, X_p\):
\[f(Y) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon\]
Where:

- \(f(\cdot)\) is a link function appropriate to the outcome type (identity for linear regression, logit for logistic regression)
- \(\beta_0\) is the intercept
- \(\beta_1, \ldots, \beta_p\) are the regression coefficients for the predictors
- \(\epsilon\) is the error term, capturing variation in the outcome not explained by the predictors
The choice of regression model depends on the type of outcome variable:
| Outcome Type | Regression Type | Link Function | Example |
|---|---|---|---|
| Continuous | Linear | Identity: Y | Blood pressure, BMI |
| Binary | Logistic | Logit: log(p/(1-p)) | Disease status, mortality |
| Count | Poisson/Negative Binomial | Log: log(Y) | Number of infections |
| Time-to-event | Cox Proportional Hazards | Log: log(h(t)) | Survival time |
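For orientation, the sketch below shows how each of these outcome types maps onto model-fitting syntax in R. It uses a small simulated dataset (`sim`, introduced here only for illustration) rather than the BRFSS data, and the Cox model assumes the `survival` package is available.

# Illustrative model syntax for each outcome type (simulated data, not BRFSS)
library(survival)
set.seed(1)
sim <- data.frame(
  x     = rnorm(100),            # a continuous predictor
  y_num = rnorm(100),            # continuous outcome
  y_bin = rbinom(100, 1, 0.3),   # binary outcome
  y_cnt = rpois(100, 2),         # count outcome
  time  = rexp(100),             # follow-up time
  event = rbinom(100, 1, 0.5)    # event indicator
)
lm(y_num ~ x, data = sim)                        # linear regression (identity link)
glm(y_bin ~ x, data = sim, family = binomial)    # logistic regression (logit link)
glm(y_cnt ~ x, data = sim, family = poisson)     # Poisson regression (log link)
coxph(Surv(time, event) ~ x, data = sim)         # Cox proportional hazards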
Let’s model the relationship between age and diabetes prevalence.
# Simple linear regression: diabetes ~ age
model_linear_simple <- lm(diabetes ~ age_cont, data = brfss_clean)
# Display results
tidy(model_linear_simple, conf.int = TRUE) %>%
kable(caption = "Simple Linear Regression: Diabetes ~ Age",
digits = 4,
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | -0.0632 | 0.0481 | -1.3125 | 0.1896 | -0.1576 | 0.0312 |
| age_cont | 0.0041 | 0.0008 | 5.1368 | 0.0000 | 0.0025 | 0.0056 |
Interpretation:

- The coefficient for `age_cont` (0.0041) means each additional year of age is associated with a 0.41 percentage-point increase in the predicted probability of diabetes.
- The intercept (-0.0632) is the predicted probability at age 0, which is not meaningful here; the fact that it is negative is a first hint that a linear fit is awkward for a binary outcome.
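To make this limitation concrete, we can generate predictions from the linear fit at a few ages; nothing in this model constrains them to the 0-1 probability range. A quick illustrative check using the fitted `model_linear_simple`:

# Predicted "probabilities" from the linear probability model at selected ages;
# a linear fit is not constrained to the [0, 1] range
predict(model_linear_simple, newdata = data.frame(age_cont = c(20, 40, 60, 80)))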
# Create scatter plot with regression line
p1 <- ggplot(brfss_clean, aes(x = age_cont, y = diabetes)) +
geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(
title = "Relationship Between Age and Diabetes",
subtitle = "Simple Linear Regression",
x = "Age (years)",
y = "Probability of Diabetes"
) +
theme_minimal(base_size = 12)
ggplotly(p1) %>%
layout(hovermode = "closest")Diabetes Prevalence by Age
Problem with linear regression for binary outcomes:

- Predicted values are not constrained to the 0-1 probability range
- The residuals cannot be normally distributed when the outcome takes only the values 0 and 1
- The variance of the outcome changes with the predicted probability (heteroscedasticity)
Solution: Logistic Regression
Uses the logit link function to ensure predicted probabilities stay between 0 and 1:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
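In base R, `qlogis()` and `plogis()` implement the logit and inverse-logit (logistic) transformations, a convenient way to move between probabilities and log-odds; a small standalone illustration:

# Logit and inverse-logit in base R
p <- 0.25
qlogis(p)         # log-odds: log(0.25 / 0.75) = -1.099
plogis(qlogis(p)) # back-transforming recovers the probability, 0.25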
# Simple logistic regression: diabetes ~ age
model_logistic_simple <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.029 | 0.423 | -8.390 | 0 | 0.012 | 0.064 |
| age_cont | 1.034 | 0.007 | 4.978 | 0 | 1.021 | 1.048 |
Interpretation:

- The odds ratio for `age_cont` is 1.034: each additional year of age is associated with approximately 3.4% higher odds of diabetes (95% CI: 1.021-1.048).
- The intercept (OR 0.029) corresponds to the odds of diabetes at age 0 and is not directly interpretable for this sample.
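Because the coefficient is on the log-odds scale, odds ratios for other age increments follow by exponentiating multiples of the coefficient. A short sketch using the fitted `model_logistic_simple`:

# Odds ratios per 1-year and per 10-year increase in age
beta_age <- coef(model_logistic_simple)["age_cont"]
exp(beta_age)      # OR per 1-year increase (matches the table above)
exp(10 * beta_age) # OR per 10-year increase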
# Generate predicted probabilities
pred_data <- data.frame(age_cont = seq(18, 80, by = 1))
pred_data$predicted_prob <- predict(model_logistic_simple,
newdata = pred_data,
type = "response")
# Plot
p2 <- ggplot(pred_data, aes(x = age_cont, y = predicted_prob)) +
geom_line(color = "darkred", linewidth = 1.5) +
geom_ribbon(aes(ymin = predicted_prob - 0.02,
ymax = predicted_prob + 0.02),
alpha = 0.2, fill = "darkred") +
labs(
title = "Predicted Probability of Diabetes by Age",
subtitle = "Simple Logistic Regression",
x = "Age (years)",
y = "Predicted Probability of Diabetes"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.6)) +
theme_minimal(base_size = 12)
ggplotly(p2)

Predicted Diabetes Probability by Age
A confounder is a variable that:

- Is associated with the exposure of interest
- Is an independent risk factor for (or cause of) the outcome
- Is not on the causal pathway between the exposure and the outcome
Example: The relationship between age and diabetes may be confounded by BMI, physical activity, and other factors.
# Multiple logistic regression with potential confounders
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
phys_active + current_smoker + education,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.009 | 1.177 | -4.001 | 0.000 | 0.000 | 0.065 |
| age_cont | 1.041 | 0.007 | 5.515 | 0.000 | 1.027 | 1.057 |
| sexMale | 1.191 | 0.154 | 1.133 | 0.257 | 0.880 | 1.613 |
| bmi_catNormal | 1.971 | 1.052 | 0.645 | 0.519 | 0.378 | 36.309 |
| bmi_catOverweight | 3.155 | 1.044 | 1.101 | 0.271 | 0.621 | 57.679 |
| bmi_catObese | 6.834 | 1.041 | 1.845 | 0.065 | 1.354 | 124.675 |
| phys_active | 0.589 | 0.157 | -3.373 | 0.001 | 0.433 | 0.802 |
| current_smoker | 1.213 | 0.178 | 1.085 | 0.278 | 0.852 | 1.716 |
| educationHigh school graduate | 0.634 | 0.288 | -1.579 | 0.114 | 0.364 | 1.131 |
| educationSome college | 0.542 | 0.294 | -2.081 | 0.037 | 0.307 | 0.977 |
| educationCollege graduate | 0.584 | 0.305 | -1.763 | 0.078 | 0.324 | 1.074 |
Interpretation:

- After adjusting for sex, BMI category, physical activity, smoking, and education, each additional year of age is associated with about 4.1% higher odds of diabetes (OR 1.041, 95% CI: 1.027-1.057).
- Physical activity is associated with substantially lower odds of diabetes (OR 0.589, 95% CI: 0.433-0.802).
- The BMI odds ratios are large (e.g., OR 6.83 for obesity) but have very wide confidence intervals because the reference category, Underweight, contains few observations.
- Higher education levels show lower odds than < High school, reaching statistical significance for "Some college" (OR 0.542, p = 0.037); sex and current smoking are not statistically significant after adjustment.
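One practical way to gauge confounding of the age-diabetes association is to compare the crude (unadjusted) and adjusted estimates; a noticeable shift suggests confounding. A minimal sketch using the two models fit above (this change-in-estimate comparison is an informal guide, not a formal test):

# Compare crude vs. adjusted odds ratio for age (change-in-estimate check)
or_crude    <- unname(exp(coef(model_logistic_simple)["age_cont"]))
or_adjusted <- unname(exp(coef(model_logistic_multiple)["age_cont"]))
c(crude = or_crude, adjusted = or_adjusted,
  pct_change = 100 * (or_adjusted - or_crude) / or_crude)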
Categorical variables with \(k\) levels are represented using \(k-1\) dummy variables (indicator variables).
Education has 4 levels:

1. < High school (reference category)
2. High school graduate
3. Some college
4. College graduate
R automatically creates 3 dummy variables:
# Extract dummy variable coding
dummy_table <- data.frame(
Education = c("< High school", "High school graduate", "Some college", "College graduate"),
`Dummy 1 (HS grad)` = c(0, 1, 0, 0),
`Dummy 2 (Some college)` = c(0, 0, 1, 0),
`Dummy 3 (College grad)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for Education",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category
| Education | Dummy 1 (HS grad) | Dummy 2 (Some college) | Dummy 3 (College grad) |
|---|---|---|---|
| < High school | 0 | 0 | 0 |
| High school graduate | 1 | 0 | 0 |
| Some college | 0 | 1 | 0 |
| College graduate | 0 | 0 | 1 |
Reference Category: The category with all zeros (< High school) is the reference group. All other categories are compared to this reference.
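If a different reference category is preferred (for example, comparing all groups to college graduates), the factor can be releveled before refitting. A brief sketch, assuming we want `College graduate` as the new reference (`brfss_releveled` and `model_releveled` are introduced here only for illustration):

# Change the reference category for education and refit the model
brfss_releveled <- brfss_clean %>%
  mutate(education = relevel(education, ref = "College graduate"))
model_releveled <- glm(diabetes ~ age_cont + sex + bmi_cat + phys_active +
                         current_smoker + education,
                       data = brfss_releveled,
                       family = binomial(link = "logit"))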
# Extract education coefficients
educ_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "education")) %>%
mutate(
education_level = str_remove(term, "education"),
education_level = factor(education_level,
levels = c("High school graduate",
"Some college",
"College graduate"))
)
# Add reference category
ref_row <- data.frame(
term = "education< High school",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
education_level = factor("< High school (Ref)",
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate"))
)
educ_coefs_full <- bind_rows(ref_row, educ_coefs) %>%
mutate(education_level = factor(education_level,
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate")))
# Plot
p3 <- ggplot(educ_coefs_full, aes(x = education_level, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between Education and Diabetes",
subtitle = "Adjusted Odds Ratios (reference: < High school)",
x = "Education Level",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)

Odds Ratios for Education Levels
An interaction exists when the effect of one variable on the outcome differs across levels of another variable.
Epidemiologic term: Effect modification
Does the effect of age on diabetes differ between males and females?
# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * sex + bmi_cat + phys_active,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × Sex Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)
| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.031 | 0.009 | 3.178 | 0.001 | 1.012 | 1.051 |
| age_cont:sexMale | 1.015 | 0.014 | 1.084 | 0.278 | 0.988 | 1.044 |
Interpretation:

- Among females (the reference group), each additional year of age is associated with about 3.1% higher odds of diabetes (OR 1.031, 95% CI: 1.012-1.051).
- The interaction term (OR 1.015, p = 0.278) suggests the age effect may be slightly stronger in males, but the difference is not statistically significant, so there is little evidence of effect modification by sex.
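With the interaction in the model, the age odds ratio is sex-specific: \(e^{\beta_{age}}\) for females (the reference level) and \(e^{\beta_{age} + \beta_{interaction}}\) for males. A short sketch extracting both from `model_interaction`:

# Sex-specific odds ratios for a 1-year increase in age
b <- coef(model_interaction)
exp(b["age_cont"])                         # OR for females (reference group)
exp(b["age_cont"] + b["age_cont:sexMale"]) # OR for males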
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "sex"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and Sex",
subtitle = "Testing for Age × Sex Interaction",
x = "Age (years)",
y = "Predicted Probability of Diabetes",
color = "Sex",
fill = "Sex"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
scale_fill_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
theme_minimal(base_size = 12) +
theme(legend.position = "top")
ggplotly(p4)

Age-Diabetes Relationship by Sex
Every regression model makes assumptions about the data. If assumptions are violated, results may be invalid.
Variance Inflation Factor (VIF): Measures how much the variance of a coefficient is inflated due to correlation with other predictors.
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = as.numeric(vif_values[, "GVIF^(1/(2*Df))"]) # drop names so kable shows a single Variable column
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")
| Variable | VIF | Interpretation |
|---|---|---|
| age_cont | 1.05 | Low (No concern) |
| current_smoker | 1.05 | Low (No concern) |
| phys_active | 1.02 | Low (No concern) |
| sex | 1.01 | Low (No concern) |
| education | 1.01 | Low (No concern) |
| bmi_cat | 1.01 | Low (No concern) |
Cook’s Distance: Measures how much the model would change if an observation were removed.
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)

Cook's Distance for Influential Observations
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Use Likelihood Ratio Test to compare nested models:
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial)
# Model 2: Age + Sex
model2 <- glm(diabetes ~ age_cont + sex,
data = brfss_clean,
family = binomial)
# Model 3: Full model
model3 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + Sex",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")
| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + Sex | 1175.85 | 1191.32 | 1169.85 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |
Interpretation:

- Model 3 (full model) has the lowest AIC, the lowest BIC, and the smallest deviance, so the additional covariates improve fit even after penalizing model complexity.
- Adding sex alone (Model 2) barely reduces the deviance and slightly worsens the AIC relative to Model 1, suggesting sex contributes little once age is in the model.
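The likelihood ratio tests computed above (`lrt_1_2` and `lrt_2_3`) can also be printed to see the formal nested-model comparisons; a minimal sketch:

# Display the likelihood ratio test results
lrt_1_2 # Model 1 (age only) vs. Model 2 (age + sex)
lrt_2_3 # Model 2 (age + sex) vs. Model 3 (full model)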
All statistical models include an error term (\(\epsilon\)) to account for:

- Variation in the outcome due to unmeasured or unmeasurable factors
- Measurement error in the outcome
- Inherent random (stochastic) variation between individuals
\[Y = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon\]
Key points:

- The error term represents everything affecting \(Y\) that is not captured by the predictors.
- Assumptions about \(\epsilon\) (e.g., mean zero, constant variance, independence) underlie the standard errors, confidence intervals, and p-values reported by the model.
In this lab, you will:

- Explore the distribution of hypertension in the BRFSS subset
- Fit simple and multiple logistic regression models with hypertension as the outcome
- Interpret dummy-coded categorical predictors
- Test for interaction (effect modification)
- Check model diagnostics (multicollinearity and influential observations)
- Compare models using AIC, BIC, and likelihood ratio tests
# YOUR CODE HERE: Create a frequency table of hypertension status
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group

Questions:
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
# YOUR CODE HERE: Display the results with odds ratios

Questions:
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
# YOUR CODE HERE: Display the results

Questions:
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories

Questions:
# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction

Questions:
# YOUR CODE HERE: Calculate VIF for your multiple regression model
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations

Questions:
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
# Model B: Age + sex + bmi_cat
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
# YOUR CODE HERE: Create a comparison table

Questions:
Write a brief report (1-2 pages) summarizing your findings:

- Your research question and the variables you used
- Key results from your models (odds ratios with 95% confidence intervals)
- Whether you found evidence of confounding or effect modification
- Any diagnostic concerns and how you addressed them
Submission: Submit your completed R Markdown file and knitted HTML report.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
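This is the quantity returned by `predict(..., type = "response")`. As a check, the predicted probability for a given age can be reproduced by hand from the coefficients of `model_logistic_simple` (a small illustrative sketch; age 50 is an arbitrary choice):

# Reproduce a predicted probability by hand for a 50-year-old
b <- coef(model_logistic_simple)
eta <- b["(Intercept)"] + b["age_cont"] * 50 # linear predictor (log-odds)
exp(eta) / (1 + exp(eta))                    # manual inverse-logit
plogis(eta)                                  # same result using plogis()
predict(model_logistic_simple, newdata = data.frame(age_cont = 50), type = "response")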
Session Info
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggeffects_2.3.2 car_3.1-5 carData_3.0-6 broom_1.0.12
## [5] plotly_4.12.0 kableExtra_1.4.0 knitr_1.51 haven_2.5.5
## [9] lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0 dplyr_1.2.0
## [13] purrr_1.2.1 readr_2.1.6 tidyr_1.3.2 tibble_3.3.1
## [17] ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56.2 bslib_0.10.0
## [4] htmlwidgets_1.6.4 insight_1.4.5 lattice_0.22-7
## [7] tzdb_0.5.0 crosstalk_1.2.2 vctrs_0.7.1
## [10] tools_4.5.2 generics_0.1.4 datawizard_1.3.0
## [13] curl_7.0.0 pkgconfig_2.0.3 Matrix_1.7-4
## [16] telegram.bot_3.0.2 data.table_1.18.2.1 RColorBrewer_1.1-3
## [19] S7_0.2.1 lifecycle_1.0.5 compiler_4.5.2
## [22] farver_2.1.2 textshaping_1.0.4 snakecase_0.11.1
## [25] httpuv_1.6.16 htmltools_0.5.9 sass_0.4.10
## [28] yaml_2.3.12 lazyeval_0.2.2 Formula_1.2-5
## [31] later_1.4.5 pillar_1.11.1 jquerylib_0.1.4
## [34] openssl_2.3.4 cachem_1.1.0 abind_1.4-8
## [37] nlme_3.1-168 sjlabelled_1.2.0 tidyselect_1.2.1
## [40] digest_0.6.39 stringi_1.8.7 labeling_0.4.3
## [43] splines_4.5.2 fastmap_1.2.0 grid_4.5.2
## [46] cli_3.6.5 magrittr_2.0.4 withr_3.0.2
## [49] backports_1.5.0 scales_1.4.0 promises_1.5.0
## [52] timechange_0.4.0 rmarkdown_2.30 httr_1.4.7
## [55] otel_0.2.0 askpass_1.2.1 hms_1.1.4
## [58] evaluate_1.0.5 viridisLite_0.4.2 mgcv_1.9-4
## [61] rlang_1.1.7 Rcpp_1.1.1 glue_1.8.0
## [64] xml2_1.5.2 svglite_2.2.2 rstudioapi_0.18.0
## [67] jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1.9000