Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/data/LLCP2023.XPT") %>%
janitor::clean_names()
# Display dataset dimensions
names(brfss_full)## [1] "state" "fmonth" "idate" "imonth" "iday" "iyear"
## [7] "dispcode" "seqno" "psu" "ctelenm1" "pvtresd1" "colghous"
## [13] "statere1" "celphon1" "ladult1" "numadult" "respslc1" "landsex2"
## [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1" "cellsex2"
## [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1" "landline" "hhadult"
## [31] "sexvar" "genhlth" "physhlth" "menthlth" "poorhlth" "primins1"
## [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
## [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6"
## [49] "bpmeds1" "cholchk3" "toldhi3" "cholmed3" "cvdinfr4" "cvdcrhd4"
## [55] "cvdstrk3" "asthma3" "asthnow" "chcscnc1" "chcocnc1" "chccopd3"
## [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital"
## [67] "educa" "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
## [73] "employ1" "children" "income3" "pregnant" "weight2" "height3"
## [79] "deaf" "blind" "decide" "diffwalk" "diffdres" "diffalon"
## [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3" "ecignow2"
## [91] "alcday4" "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
## [97] "pneuvac4" "shingle2" "hivtst7" "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu" "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"
## [127] "howlong" "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage" "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum"
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry" "cddiscu1" "cdhous1" "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload" "loadulk2" "hasymp1" "hasymp2"
## [193] "hasymp3" "hasymp4" "hasymp5" "hasymp6" "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin"
## [205] "birthsex" "somale" "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat" "marjvape" "marjdab" "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat" "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver" "qstlang"
## [259] "metstat" "urbstat" "mscode" "ststr" "strwt" "rawrake"
## [265] "wt2rake" "imprace" "chispnc" "crace1" "cageg" "cllcpwt"
## [271] "dualuse" "dualcor" "llcpwt2" "llcpwt" "rfhlth" "phys14d"
## [277] "ment14d" "hlthpl1" "hcvu653" "totinda" "metvl12" "metvl22"
## [283] "maxvo21" "fc601" "actin13" "actin23" "padur1" "padur2"
## [289] "pafreq1" "pafreq2" "minac12" "minac22" "strfreq" "pamiss3"
## [295] "pamin13" "pamin23" "pa3min" "pavig13" "pavig23" "pa3vigm"
## [301] "pacat3" "paindx3" "pa150r4" "pa300r4" "pa30023" "pastrng"
## [307] "parec3" "pastae3" "rfhype6" "cholch3" "rfchol3" "michd"
## [313] "ltasth1" "casthm1" "asthms1" "drdxar2" "mrace1" "hispanc"
## [319] "race" "raceg21" "racegr3" "raceprv" "sex" "ageg5yr"
## [325] "age65yr" "age80" "age_g" "htin4" "htm4" "wtkg3"
## [331] "bmi5" "bmi5cat" "rfbmi5" "chldcnt" "educag" "incomg1"
## [337] "smoker3" "rfsmok3" "cureci2" "drnkany6" "drocdy4" "rfbing6"
## [343] "drnkwk2" "rfdrhv8" "flshot7" "pneumo3" "aidtst4" "rfseat2"
## [349] "rfseat3" "drnkdrv"
For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.
# Select variables of interest and create analytic dataset
set.seed(553) # For reproducibility
brfss_subset <- brfss_full %>%
select(
# Outcome: Diabetes status
diabete4,
# Demographics
age_g, # Age category
sex, # Sex
race, # Race/ethnicity
educag, # Education level
incomg1, # Income category
# Health behaviors
bmi5cat, # BMI category
exerany2, # Physical activity
smokday2, # Smoking frequency
# Health status
genhlth, # General health
rfhype6, # High blood pressure
rfchol3 # High cholesterol
) %>%
# Filter to complete cases only
drop_na() %>%
# Sample 2000 observations for manageable analysis
slice_sample(n = 2000)
# Display subset dimensions
cat("Working subset dimensions:",
nrow(brfss_subset), "observations,",
ncol(brfss_subset), "variables\n")## Working subset dimensions: 2000 observations, 12 variables
# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
mutate(
# Outcome: Diabetes (binary)
diabetes = case_when(
diabete4 == 1 ~ 1, # Yes
diabete4 %in% c(2, 3, 4) ~ 0, # No, pre-diabetes, or gestational only
TRUE ~ NA_real_
),
# Age groups
age_group = factor(case_when(
age_g == 1 ~ "18-24",
age_g == 2 ~ "25-34",
age_g == 3 ~ "35-44",
age_g == 4 ~ "45-54",
age_g == 5 ~ "55-64",
age_g == 6 ~ "65+"
), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),
# Age continuous (midpoint of category)
age_cont = case_when(
age_g == 1 ~ 21,
age_g == 2 ~ 29.5,
age_g == 3 ~ 39.5,
age_g == 4 ~ 49.5,
age_g == 5 ~ 59.5,
age_g == 6 ~ 70
),
# Sex
sex = factor(ifelse(sex == 1, "Male", "Female")),
# Race/ethnicity
race = factor(case_when(
race == 1 ~ "White",
race == 2 ~ "Black",
race == 3 ~ "Native American",
race == 4 ~ "Asian",
race == 5 ~ "Native Hawaiian/PI",
race == 6 ~ "Other",
race == 7 ~ "Multiracial",
race == 8 ~ "Hispanic"
)),
# Education (simplified)
education = factor(case_when(
educag == 1 ~ "< High school",
educag == 2 ~ "High school graduate",
educag == 3 ~ "Some college",
educag == 4 ~ "College graduate"
), levels = c("< High school", "High school graduate", "Some college", "College graduate")),
# Income (simplified)
income = factor(case_when(
incomg1 == 1 ~ "< $25,000",
incomg1 == 2 ~ "$25,000-$49,999",
incomg1 == 3 ~ "$50,000-$74,999",
incomg1 == 4 ~ "$75,000+",
incomg1 == 5 ~ "Unknown"
), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),
# BMI category
bmi_cat = factor(case_when(
bmi5cat == 1 ~ "Underweight",
bmi5cat == 2 ~ "Normal",
bmi5cat == 3 ~ "Overweight",
bmi5cat == 4 ~ "Obese"
), levels = c("Underweight", "Normal", "Overweight", "Obese")),
# Physical activity (binary)
phys_active = ifelse(exerany2 == 1, 1, 0),
# Current smoking
current_smoker = case_when(
smokday2 == 1 ~ 1, # Every day
smokday2 == 2 ~ 1, # Some days
smokday2 == 3 ~ 0, # Not at all
TRUE ~ 0
),
# General health (simplified)
gen_health = factor(case_when(
genhlth %in% c(1, 2) ~ "Excellent/Very good",
genhlth == 3 ~ "Good",
genhlth %in% c(4, 5) ~ "Fair/Poor"
), levels = c("Excellent/Very good", "Good", "Fair/Poor")),
# Hypertension
hypertension = ifelse(rfhype6 == 2, 1, 0),
# High cholesterol
high_chol = ifelse(rfchol3 == 2, 1, 0)
) %>%
# Select only the clean variables
select(diabetes, age_group, age_cont, sex, race, education, income,
bmi_cat, phys_active, current_smoker, gen_health,
hypertension, high_chol) %>%
# Remove any remaining missing values
drop_na()
# Save the cleaned subset for future use
write_rds (brfss_clean, "C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/data/LLCP2023_clean/brfss_subset.rds")
cat("Clean dataset saved with", nrow(brfss_clean), "complete observations\n")## Clean dataset saved with 1281 complete observations
# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |
A statistical model is a mathematical representation of the relationship between:
\[f(Y) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon\]
Where:
The choice of regression model depends on the type of outcome variable:
| Outcome Type | Regression Type | Link Function | Example |
|---|---|---|---|
| Continuous | Linear | Identity: Y | Blood pressure, BMI |
| Binary | Logistic | Logit: log(p/(1-p)) | Disease status, mortality |
| Count | Poisson/Negative Binomial | Log: log(Y) | Number of infections |
| Time-to-event | Cox Proportional Hazards | Log: log(h(t)) | Survival time |
Let’s model the relationship between age and diabetes prevalence.
# Simple linear regression: diabetes ~ age
model_linear_simple <- lm(diabetes ~ age_cont, data = brfss_clean)
# Display results
tidy(model_linear_simple, conf.int = TRUE) %>%
kable(caption = "Simple Linear Regression: Diabetes ~ Age",
digits = 4,
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | -0.0632 | 0.0481 | -1.3125 | 0.1896 | -0.1576 | 0.0312 |
| age_cont | 0.0041 | 0.0008 | 5.1368 | 0.0000 | 0.0025 | 0.0056 |
Interpretation:
# Create scatter plot with regression line
p1 <- ggplot(brfss_clean, aes(x = age_cont, y = diabetes)) +
geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(
title = "Relationship Between Age and Diabetes",
subtitle = "Simple Linear Regression",
x = "Age (years)",
y = "Probability of Diabetes"
) +
theme_minimal(base_size = 12)
ggplotly(p1) %>%
layout(hovermode = "closest")Diabetes Prevalence by Age
Problem with linear regression for binary outcomes:
Solution: Logistic Regression
Uses the logit link function to ensure predicted probabilities stay between 0 and 1:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
# Simple logistic regression: diabetes ~ age
model_logistic_simple <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.029 | 0.423 | -8.390 | 0 | 0.012 | 0.064 |
| age_cont | 1.034 | 0.007 | 4.978 | 0 | 1.021 | 1.048 |
Interpretation:
Predicted Diabetes Probability by Age
# Generate predicted probabilities
pred_data <- data.frame(age_cont = seq(18, 80, by = 1))
pred_data$predicted_prob <- predict(model_logistic_simple,
newdata = pred_data,
type = "response")
# Plot
p2 <- ggplot(pred_data, aes(x = age_cont, y = predicted_prob)) +
geom_line(color = "darkred", linewidth = 1.5) +
geom_ribbon(aes(ymin = predicted_prob - 0.02,
ymax = predicted_prob + 0.02),
alpha = 0.2, fill = "darkred") +
labs(
title = "Predicted Probability of Diabetes by Age",
subtitle = "Simple Logistic Regression",
x = "Age (years)",
y = "Predicted Probability of Diabetes"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.6)) +
theme_minimal(base_size = 12)
ggplotly(p2)Predicted Diabetes Probability by Age
A confounder is a variable that:
Example: The relationship between age and diabetes may be confounded by BMI, physical activity, and other factors.
# Multiple logistic regression with potential confounders
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
phys_active + current_smoker + education,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.009 | 1.177 | -4.001 | 0.000 | 0.000 | 0.065 |
| age_cont | 1.041 | 0.007 | 5.515 | 0.000 | 1.027 | 1.057 |
| sexMale | 1.191 | 0.154 | 1.133 | 0.257 | 0.880 | 1.613 |
| bmi_catNormal | 1.971 | 1.052 | 0.645 | 0.519 | 0.378 | 36.309 |
| bmi_catOverweight | 3.155 | 1.044 | 1.101 | 0.271 | 0.621 | 57.679 |
| bmi_catObese | 6.834 | 1.041 | 1.845 | 0.065 | 1.354 | 124.675 |
| phys_active | 0.589 | 0.157 | -3.373 | 0.001 | 0.433 | 0.802 |
| current_smoker | 1.213 | 0.178 | 1.085 | 0.278 | 0.852 | 1.716 |
| educationHigh school graduate | 0.634 | 0.288 | -1.579 | 0.114 | 0.364 | 1.131 |
| educationSome college | 0.542 | 0.294 | -2.081 | 0.037 | 0.307 | 0.977 |
| educationCollege graduate | 0.584 | 0.305 | -1.763 | 0.078 | 0.324 | 1.074 |
Interpretation:
Categorical variables with \(k\) levels are represented using \(k-1\) dummy variables (indicator variables).
Education has 4 levels: 1. < High school (reference category) 2. High school graduate 3. Some college 4. College graduate
R automatically creates 3 dummy variables:
# Extract dummy variable coding
dummy_table <- data.frame(
Education = c("< High school", "High school graduate", "Some college", "College graduate"),
`Dummy 1 (HS grad)` = c(0, 1, 0, 0),
`Dummy 2 (Some college)` = c(0, 0, 1, 0),
`Dummy 3 (College grad)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for Education",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| Education | Dummy 1 (HS grad) | Dummy 2 (Some college) | Dummy 3 (College grad) |
|---|---|---|---|
| < High school | 0 | 0 | 0 |
| High school graduate | 1 | 0 | 0 |
| Some college | 0 | 1 | 0 |
| College graduate | 0 | 0 | 1 |
Reference Category: The category with all zeros (< High school) is the reference group. All other categories are compared to this reference.
# Extract education coefficients
educ_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "education")) %>%
mutate(
education_level = str_remove(term, "education"),
education_level = factor(education_level,
levels = c("High school graduate",
"Some college",
"College graduate"))
)
# Add reference category
ref_row <- data.frame(
term = "education< High school",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
education_level = factor("< High school (Ref)",
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate"))
)
educ_coefs_full <- bind_rows(ref_row, educ_coefs) %>%
mutate(education_level = factor(education_level,
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate")))
# Plot
p3 <- ggplot(educ_coefs_full, aes(x = education_level, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between Education and Diabetes",
subtitle = "Adjusted Odds Ratios (reference: < High school)",
x = "Education Level",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)Odds Ratios for Education Levels
# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
include = c("education"),
variable_labels = c(
education = "Education"),
facet_labeller = ggplot2::label_wrap_gen(10)
)An interaction exists when the effect of one variable on the outcome differs across levels of another variable.
Epidemiologic term: Effect modification
Does the effect of age on diabetes differ between males and females?
# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * sex + bmi_cat + phys_active,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × Sex Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.031 | 0.009 | 3.178 | 0.001 | 1.012 | 1.051 |
| age_cont:sexMale | 1.015 | 0.014 | 1.084 | 0.278 | 0.988 | 1.044 |
Interpretation:
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "sex"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and Sex",
subtitle = "Testing for Age × Sex Interaction",
x = "Age (years)",
y = "Predicted Probability of Diabetes",
color = "Sex",
fill = "Sex"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
scale_fill_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)Age-Diabetes Relationship by Sex
Every regression model makes assumptions about the data. If assumptions are violated, results may be invalid.
Variance Inflation Factor (VIF): Measures how much the variance of a coefficient is inflated due to correlation with other predictors.
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.05 | Low (No concern) |
| current_smoker | current_smoker | 1.05 | Low (No concern) |
| phys_active | phys_active | 1.02 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| education | education | 1.01 | Low (No concern) |
| bmi_cat | bmi_cat | 1.01 | Low (No concern) |
Cook’s Distance: Measures how much the model would change if an observation were removed.
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)Cook’s Distance for Influential Observations
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Use Likelihood Ratio Test to compare nested models:
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial)
# Model 2: Age + Sex
model2 <- glm(diabetes ~ age_cont + sex,
data = brfss_clean,
family = binomial)
# Model 3: Full model
model3 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + Sex",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + Sex | 1175.85 | 1191.32 | 1169.85 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |
Interpretation:
All statistical models include an error term (\(\epsilon\)) to account for:
\[Y = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon\]
Key points:
In this lab, you will:
# YOUR CODE HERE: Create a frequency table of hypertension status
hyper_freq <- brfss_clean %>%
count(hypertension) %>%
mutate(
Status = ifelse(hypertension == 1, "Hypertension", "No Hypertension"),
Percent = round(100 * n / sum(n), 1)
) %>%
select(Status, n, Percent)
hyper_freq %>%
kable(caption = "Frequency Table: Hypertension Status",
col.names = c("Status", "Count", "Percent (%)"),
align = "lrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Status | Count | Percent (%) |
|---|---|---|
| No Hypertension | 606 | 47.3 |
| Hypertension | 675 | 52.7 |
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
hyper_by_age <- brfss_clean %>%
group_by(age_group) %>%
summarise(
N = n(),
Cases = sum(hypertension),
Prevalence_pct = round(100 * mean(hypertension), 1)
)
hyper_by_age %>%
kable(caption = "Hypertension Prevalence by Age Group",
col.names = c("Age Group", "N", "Cases", "Prevalence (%)"),
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Age Group | N | Cases | Prevalence (%) |
|---|---|---|---|
| 18-24 | 12 | 1 | 8.3 |
| 25-34 | 77 | 15 | 19.5 |
| 35-44 | 138 | 42 | 30.4 |
| 45-54 | 161 | 61 | 37.9 |
| 55-64 | 266 | 137 | 51.5 |
| 65+ | 627 | 419 | 66.8 |
p_age <- ggplot(hyper_by_age, aes(x = age_group, y = Prevalence_pct, fill = age_group)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = paste0(Prevalence_pct, "%")), vjust = -0.4, size = 3.5) +
labs(
title = "Hypertension Prevalence by Age Group",
subtitle = "BRFSS 2023 Sample (n = 2,000)",
x = "Age Group",
y = "Prevalence (%)"
) +
scale_fill_brewer(palette = "Blues") +
theme_minimal(base_size = 12)
# Save the static ggplot figure
ggsave(
filename = "C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/Modeling_Lecture_files/figure-html/Hypertension_Prevalence_by_Age.png",
plot = p_age,
width = 10,
height = 6,
dpi = 300
)
# Then display interactive version
ggplotly(p_age)Questions:
Answer:
The overall prevalence of hypertension is approximately 40–45% in this sample, which is consistent with national estimates for U.S. adults.This indicates that roughly 4 out of every 10 adults in the dataset have hypertension.
Answer:
Hypertension prevalence increases steadily with age. It is lowest among adults aged 18–24 (under 10%) and rises progressively across each age group, exceeding 50% among those aged 55–64 and reaching roughly two-thirds among adults aged 65 and older. This pattern shows a strong and consistent age gradient in hypertension risk.
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_simple <- glm(hypertension ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results with odds ratios
tidy(model_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error",
"z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
# Predicted probability curve
pred_simple <- data.frame(age_cont = seq(18, 80, by = 1))
pred_simple$prob <- predict(model_simple, newdata = pred_simple, type = "response")
p_simple <- ggplot(pred_simple, aes(x = age_cont, y = prob)) +
geom_line(color = "steelblue", linewidth = 1.5) +
labs(
title = "Predicted Probability of Hypertension by Age",
subtitle = "Simple Logistic Regression",
x = "Age (years)",
y = "Predicted Probability"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1)) +
theme_minimal(base_size = 12)
ggplotly(p_simple)Questions:
The odds ratio for age will be approximately 1.055, meaning for each 1-year increase in age, the odds of hypertension increase by 5%.
The p-value IS < 0.001, confirming this is highly statistically significanT association.
The 95% CI does not cross 1.0, which supports a true positive association.
Adding more predictors allows us to control for confounding. For example, older people also tend to have higher BMI and be less physically active; both of which increase hypertension risk. Without adjusting for these, we might overestimate the effect of age alone.
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_multiple <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results
tidy(model_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Hypertension ~ Age + Covariates",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error",
"z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
save_kable(
file = "C:/Users/God's Icon/Desktop/553-Statistical Inference/553 Lab Codes/Modeling_Lecture_files/figure-html/Hypertension_Regression_Table.html"
)This is the adjusted odds ratio
Questions:
Age: The odds ratio for age changed slightly from 1.055 in the unadjusted simple logistic regression model to 1.061 in the adjusted multiple logistic regression model. For each 1-year increase in age, the odds of hypertension increase by 6.1%, after adjusting for sex, BMI, physical activity, and smoking.
SexMale: males have 27% higher odds of hypertension compared to females, after adjusting for other variables. However, the p-value is 0.051, just above the conventional 0.05 threshold and the lower bound of the 95% CI just touches 0.999, which barely includes 1.0. This result sits right on the boundary of statistical significance.
BMI (Obese): is typically the strongest predictor, obese individuals have substantially higher odds of hypertension compared to normal-weight individuals.
Physical activity: is usually protective (OR < 1), meaning active individuals have lower odds of hypertension. Sex differences in hypertension are also evident, males tend to have higher odds than females in younger age groups.
The minimal change in the age odds ratio from 1.055 to 1.061, a difference of less than 1%, suggests that very little confounding is present. The variables added to the model (sex, BMI, physical activity, and smoking) did not meaningfully distort the crude age–hypertension relationship. In epidemiology, a change of 10% or more in the odds ratio after adjustment is typically considered evidence of meaningful confounding. Since the change here falls well below that threshold, we can conclude that the unadjusted and adjusted estimates tell essentially the same story, age independently and robustly predicts hypertension in this sample.
Based on the magnitude of the odds ratios and statistical significance, the variables rank as follows.
Obesity is the strongest predictor — obese individuals have 6.6 times the odds of hypertension compared to underweight individuals (OR = 6.585, p = 0.001), which is by far the largest effect in the model.
Age is the second strongest and most statistically robust predictor, with a highly significant association (OR = 1.061, p < 0.001) that compounds meaningfully over decades.
Overweight BMI is the third significant predictor (OR = 3.241, p = 0.030), showing a clear dose-response pattern alongside the obese category, the heavier the BMI the higher the odds.
Sex (male) approaches significance (OR = 1.270, p = 0.051) and may be a meaningful predictor in a larger sample.
Physical activity (OR = 0.900, p = 0.419) and current smoking (OR = 1.071, p = 0.621) were not statistically significant in this model, though physical activity showed the expected protective direction.
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
dummy_bmi <- data.frame(
`BMI Category` = c("Underweight", "Normal", "Overweight", "Obese"),
`Dummy 1 (Underweight)` = c(1, 0, 0, 0),
`Dummy 2 (Overweight)` = c(0, 0, 1, 0),
`Dummy 3 (Obese)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_bmi %>%
kable(caption = "Dummy Variable Coding for BMI (Reference: Normal Weight)",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(2, bold = TRUE, background = "#ffe6e6") # highlight reference row| BMI Category | Dummy 1 (Underweight) | Dummy 2 (Overweight) | Dummy 3 (Obese) |
|---|---|---|---|
| Underweight | 1 | 0 | 0 |
| Normal | 0 | 0 | 0 |
| Overweight | 0 | 1 | 0 |
| Obese | 0 | 0 | 1 |
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
# Extract and display BMI odds ratios
bmi_ors <- tidy(model_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "bmi_cat"))
# adding the reference row for completeness
ref_row <- data.frame(
term = "bmi_catNormal (Reference)",
estimate = 1.00, std.error = NA, statistic = NA,
p.value = NA, conf.low = 1.00, conf.high = 1.00
)
bmi_ors_full <- bind_rows(ref_row, bmi_ors) %>%
mutate(term = str_remove(term, "bmi_cat"))
bmi_ors_full %>%
kable(caption = "Odds Ratios for BMI Categories (Reference: Normal Weight)",
digits = 3,
col.names = c("BMI Category", "Odds Ratio", "Std. Error",
"z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| BMI Category | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| Normal (Reference) | 1.000 | NA | NA | NA | 1.000 | 1.000 |
| Normal | 2.097 | 0.546 | 1.356 | 0.175 | 0.759 | 6.756 |
| Overweight | 3.241 | 0.543 | 2.166 | 0.030 | 1.183 | 10.385 |
| Obese | 6.585 | 0.545 | 3.459 | 0.001 | 2.394 | 21.176 |
Questions:
The reference category is Normal weight, this is the group to which all other BMI groups are compared. Its OR is fixed at 1.0 by definition.
If the OR for Obese is, say, 2.5, this means obese individuals have 2.5 times the odds of hypertension compared to normal-weight individuals, after adjusting for age, sex, physical activity, and smoking.
If the OR for Obese is, say, 2.5, this means obese individuals have 2.5 times the odds of hypertension compared to normal-weight individuals, after adjusting for age, sex, physical activity, and smoking.
People who are obese are about 2.5 times more likely to have high blood pressure than people with a healthy weight, even after accounting for their age, gender, and lifestyle habits.
# Model WITHOUT interaction (main effects only)
model_no_interact <- glm(hypertension ~ age_cont + bmi_cat + sex + phys_active,
data = brfss_clean,
family = binomial)
# Model WITH Age × BMI interaction
model_interact <- glm(hypertension ~ age_cont * bmi_cat + sex + phys_active,
data = brfss_clean,
family = binomial)
# Display interaction coefficients
tidy(model_interact, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, ":")) %>%
kable(caption = "Interaction Terms: Age × BMI (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error",
"z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont:bmi_catNormal | 1.056 | 0.043 | 1.261 | 0.207 | 0.955 | 1.145 |
| age_cont:bmi_catOverweight | 1.062 | 0.043 | 1.407 | 0.159 | 0.961 | 1.151 |
| age_cont:bmi_catObese | 1.051 | 0.043 | 1.162 | 0.245 | 0.951 | 1.137 |
# Likelihood Ratio Test: interaction vs no interaction
lrt_interact <- anova(model_no_interact, model_interact, test = "LRT")
print(lrt_interact)## Analysis of Deviance Table
##
## Model 1: hypertension ~ age_cont + bmi_cat + sex + phys_active
## Model 2: hypertension ~ age_cont * bmi_cat + sex + phys_active
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1274 1563.7
## 2 1271 1561.6 3 2.1826 0.5354
# Visualization: predicted probabilities by age and BMI category
pred_interact <- ggpredict(model_interact, terms = c("age_cont [18:80]", "bmi_cat"))
ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.15, color = NA) +
labs(
title = "Predicted Probability of Hypertension by Age and BMI",
subtitle = "Age × BMI Interaction Model",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "BMI Category",
fill = "BMI Category"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1)) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")Questions:
The interaction is not statistically significant (p = 0.54), as the LRT showed a deviance difference of only 2.18 across 3 degrees of freedom, all three individual interaction terms had non-significant p-values (0.207, 0.159, and 0.245) with confidence intervals crossing 1.0, meaning adding the Age × BMI interaction terms does not improve model fit over the main effects model alone.
BMI is not an effect modifier of the age–hypertension relationship. In plain language, the effect of age on hypertension risk is consistent across all BMI categories whether someone is underweight, normal weight, overweight, or obese, age increases their odds of hypertension at roughly the same rate. The relationship between age and hypertension does not get stronger or weaker depending on a person’s BMI.
This means you should retain the simpler main effects model (Model B: hypertension ~ age_cont + sex + bmi_cat) rather than the interaction model. In epidemiology, when an interaction is not statistically significant, adding it unnecessarily complicates interpretation without providing any meaningful additional insight. The principle of parsimony keeping the simplest model that adequately explains the data applies here.
pred_plot <- ggpredict(model_no_interact,
terms = c("age_cont [18:80]", "bmi_cat"))
ggplot(pred_plot, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.15, color = NA) +
labs(
title = "Predicted Probability of Hypertension by Age and BMI Category",
subtitle = "No significant Age × BMI interaction (p = 0.54) — main effects model shown",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "BMI Category",
fill = "BMI Category"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 1)) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")Interpretation: The four BMI groups showed nearly parallel increases in hypertension probability with age, confirming the non-significant Age × BMI interaction (χ² = 2.18, df = 3, p = 0.54) and indicating that BMI does not modify the relationship between age and hypertension, it only shifts the baseline risk level, with obese individuals consistently showing the highest probability across all ages.
# YOUR CODE HERE: Calculate VIF for your multiple regression model
vif_vals <- vif(model_multiple)
# Handle both matrix output (categorical vars) and vector output
if (is.matrix(vif_vals)) {
vif_df <- data.frame(
Variable = rownames(vif_vals),
VIF = vif_vals[, "GVIF^(1/(2*Df))"]
)
} else {
vif_df <- data.frame(
Variable = names(vif_vals),
VIF = as.numeric(vif_vals)
)
}
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low — No concern",
VIF >= 5 & VIF < 10 ~ "Moderate — Monitor",
VIF >= 10 ~ "High — Problem"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) — Hypertension Model",
digits = 2, align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.06 | Low — No concern |
| current_smoker | current_smoker | 1.04 | Low — No concern |
| bmi_cat | bmi_cat | 1.02 | Low — No concern |
| phys_active | phys_active | 1.01 | Low — No concern |
| sex | sex | 1.01 | Low — No concern |
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model_multiple)
influence_df <- data.frame(
observation = seq_along(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance — Influential Observations",
subtitle = "Red dashed line = threshold of 1.0",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)# Count influential points
cat("Influential observations (Cook's D > 1):", sum(influence_df$influential == "Yes"), "\n")## Influential observations (Cook's D > 1): 0
Questions:
Answer:
No. All VIF values ranged from 1.01 to 1.06, which are essentially perfect scores. A VIF of 1.0 means zero correlation between that predictor and the others in the model. Since all five predictors; age, sex, BMI, physical activity, and smoking, fall well below the concern threshold of 5.0, there is absolutely no multicollinearity problem. Each variable is contributing completely independent information to the model, and the coefficients and standard errors are reliable and stable.
Answer:
No. The Cook’s Distance plot showed that every observation fell far below the threshold of 1.0, with the maximum value barely reaching 0.05–0.08. The legend displayed only “No”- meaning not a single data point was flagged as influential. Removing any individual observation from this dataset would not meaningfully change the model’s estimates or conclusions. The results are stable across the entire sample.
Answer:
If I found influential observations (Cook’s D > 1):
I would begin by investigating those flagged data points to determine whether they represent genuine data entry errors, recording mistakes, or truly extreme but valid cases. If I discovered they were errors, I would correct or remove them before re-running the model. However, if the flagged observations turned out to be valid but extreme cases, I would conduct a sensitivity analysis by fitting the model twice once with all observations included and once with the influential observations removed and then compare the results from both runs.
If the conclusions changed substantially between the two versions, I would report both sets of results and acknowledge the influence of those observations as a limitation of my analysis, being transparent about how much those few data points were affecting my estimates. On the other hand, if the conclusions remained consistent across both runs, I would report that my results were robust to the removal of the influential observations, which would strengthen confidence in my findings.
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
# Model B: Age + sex + bmi_cat
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
# Model A: Age only
model_A <- glm(hypertension ~ age_cont,
data = brfss_clean,
family = binomial)
# Model B: Age + Sex + BMI
model_B <- glm(hypertension ~ age_cont + sex + bmi_cat,
data = brfss_clean,
family = binomial)
# Model C: Full model
model_C <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_clean,
family = binomial)
# YOUR CODE HERE: Create a comparison table
# Comparison table
model_comp <- data.frame(
Model = c("Model A: Age only",
"Model B: Age + Sex + BMI",
"Model C: Full model"),
AIC = round(c(AIC(model_A), AIC(model_B), AIC(model_C)), 2),
BIC = round(c(BIC(model_A), BIC(model_B), BIC(model_C)), 2),
Deviance = round(c(deviance(model_A), deviance(model_B), deviance(model_C)), 2)
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model A: Age only | 1636.61 | 1646.92 | 1632.61 |
| Model B: Age + Sex + BMI | 1576.49 | 1607.42 | 1564.49 |
| Model C: Full model | 1579.50 | 1620.74 | 1563.50 |
Questions:
Model B (Age + Sex + BMI) has the lowest AIC of 1576.49, making it the best fitting model. It also has the lowest BIC of 1607.42, which confirms this conclusion under the stricter penalty for complexity.
No. Moving from Model B to Model C, the AIC actually increases by 3.01 points (1576.49 → 1579.50) and BIC increases by 13.32 points (1607.42 → 1620.74). Although the deviance drops slightly (1564.49 → 1563.50), the improvement in raw fit is too small to justify adding two extra variables. Both AIC and BIC are penalizing the unnecessary complexity of Model C.
I would choose Model B as my final model. It achieves the best balance between model fit and parsimony age, sex, and BMI together explain hypertension risk well, and adding physical activity and smoking does not meaningfully improve the model by any metric. This is also consistent with the Task 5 finding that no interaction exists, and the Task 3 finding that physical activity and smoking were not statistically significant predictors. Retaining only variables that contribute meaningfully to the model produces cleaner, more interpretable results and avoids overfitting.
Write a brief report (1-2 pages) summarizing your findings:
Submission: Submit your completed R Markdown file and knitted HTML report.
A statistical model is a mathematical equation that summarizes how one or more predictor variables relate to an outcome. In this lab, we built models to describe how age, sex, BMI, physical activity, and smoking relate to the probability of having hypertension. Rather than just observing that older people tend to have higher blood pressure, the model quantifies exactly how much the odds change per year of age and whether that relationship holds after accounting for other factors.
The type of outcome determines which regression model is appropriate. Because hypertension is a binary outcome (yes or no), logistic regression was the correct choice. If the outcome had been a continuous measurement like systolic blood pressure in mmHg, linear regression would have been used instead. Using the wrong model for your outcome type produces invalid results, which is why identifying the outcome type is always the first step in any regression analysis.
Logistic regression models the log-odds of the outcome rather than the outcome directly, which ensures predicted probabilities always fall between 0 and 1. In our analysis, the model estimated the probability of hypertension as a smooth S-shaped curve across age, rising from near 0% at age 18 to above 75% at age 80. Linear regression applied to the same data would have produced impossible predicted probabilities below 0 or above 1 at the extremes of age.
A confounder is a variable associated with both the predictor and the outcome that distorts the true relationship between them. By including age, sex, BMI, physical activity, and smoking simultaneously in a single model, multiple logistic regression isolates the independent contribution of each variable. In this lab, the age odds ratio changed only minimally from 1.055 (unadjusted) to 1.061 (adjusted), suggesting little confounding was present — but this can only be confirmed by actually fitting the multiple regression model and comparing the two estimates.
Categorical variables cannot be entered into a regression model as raw numbers because that would falsely imply a numerical order or equal spacing between categories. Instead, a categorical variable with k categories is split into k−1 binary dummy variables, each comparing one category to a chosen reference group. For BMI in this lab, Normal weight was the reference, and three dummy variables were created for Underweight, Overweight, and Obese. Each dummy variable’s odds ratio then tells you how much more or less likely hypertension is in that group compared to normal weight individuals specifically.
An interaction term tests whether the effect of one predictor on the outcome changes depending on the level of another predictor in epidemiology this is called effect modification. In this lab, we tested whether the relationship between age and hypertension differed across BMI categories. The likelihood ratio test returned p = 0.54, meaning the interaction was not statistically significant. The predicted probability plot confirmed this with four nearly parallel lines, age increases hypertension risk at the same rate regardless of BMI group. Had the lines diverged, it would have indicated that BMI modifies how strongly age affects hypertension risk.
Before trusting any model’s results, its assumptions must be verified. Two key diagnostics were applied in this lab. Variance inflation factors (VIF) checked for multicollinearity, all values fell between 1.01 and 1.06, confirming that each predictor was contributing independent information with no redundancy. Cook’s Distance checked for influential observations, no data point exceeded the threshold of 1.0, confirming the model was stable and not being distorted by any unusual cases. Without these checks, a model could produce biased or unstable estimates without any obvious warning signs in the output.
Fitting multiple models and comparing them using AIC, BIC, and likelihood ratio tests allows you to identify the model that best balances fit and parsimony, explaining the data well without unnecessary complexity. In this lab, Model B (Age + Sex + BMI) had the lowest AIC of 1576.49 and the lowest BIC of 1607.42, outperforming both the simpler Model A (age only) and the more complex Model C (which added physical activity and smoking). The principle is that a more complex model is only worth keeping if it meaningfully improves fit and in this case it did not, making Model B the final recommended model.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.12.0 gtsummary_2.5.0 ggeffects_2.3.2 car_3.1-3
## [5] carData_3.0-5 broom_1.0.11 plotly_4.12.0 kableExtra_1.4.0
## [9] knitr_1.51 haven_2.5.5 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1 readr_2.1.6
## [17] tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.9.0
## [4] htmlwidgets_1.6.4 insight_1.4.4 lattice_0.22-7
## [7] tzdb_0.5.0 crosstalk_1.2.2 vctrs_0.7.1
## [10] tools_4.5.2 generics_0.1.4 datawizard_1.3.0
## [13] pkgconfig_2.0.3 Matrix_1.7-4 data.table_1.18.0
## [16] RColorBrewer_1.1-3 S7_0.2.0 lifecycle_1.0.5
## [19] compiler_4.5.2 farver_2.1.2 textshaping_1.0.4
## [22] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12
## [25] lazyeval_0.2.2 Formula_1.2-5 pillar_1.11.1
## [28] jquerylib_0.1.4 broom.helpers_1.22.0 cachem_1.1.0
## [31] abind_1.4-8 nlme_3.1-168 tidyselect_1.2.1
## [34] digest_0.6.39 stringi_1.8.7 labeling_0.4.3
## [37] splines_4.5.2 labelled_2.16.0 fastmap_1.2.0
## [40] grid_4.5.2 cli_3.6.5 magrittr_2.0.4
## [43] cards_0.7.1 withr_3.0.2 scales_1.4.0
## [46] backports_1.5.0 timechange_0.3.0 rmarkdown_2.30
## [49] httr_1.4.7 otel_0.2.0 ragg_1.5.0
## [52] hms_1.1.4 evaluate_1.0.5 viridisLite_0.4.2
## [55] mgcv_1.9-3 rlang_1.1.7 glue_1.8.0
## [58] xml2_1.5.2 svglite_2.2.2 rstudioapi_0.18.0
## [61] jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1