2-24-26Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)# Summary table by diabetes status
desc_table <- brfss_subset_2023 %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |
A statistical model is a mathematical representation of the relationship between:
\[f(Y) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon\]
Where:
The choice of regression model depends on the type of outcome variable:
| Outcome Type | Regression Type | Link Function | Example |
|---|---|---|---|
| Continuous | Linear | Identity: Y | Blood pressure, BMI |
| Binary | Logistic | Logit: log(p/(1-p)) | Disease status, mortality |
| Count | Poisson/Negative Binomial | Log: log(Y) | Number of infections |
| Time-to-event | Cox Proportional Hazards | Log: log(h(t)) | Survival time |
Let’s model the relationship between age and diabetes prevalence.
# Simple linear regression: diabetes ~ age
model_linear_simple <- lm(diabetes ~ age_cont, data = brfss_subset_2023 )
# Display results
tidy(model_linear_simple, conf.int = TRUE) %>%
kable(caption = "Simple Linear Regression: Diabetes ~ Age",
digits = 4,
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | -0.0632 | 0.0481 | -1.3125 | 0.1896 | -0.1576 | 0.0312 |
| age_cont | 0.0041 | 0.0008 | 5.1368 | 0.0000 | 0.0025 | 0.0056 |
Interpretation:
# Create scatter plot with regression line
p1 <- ggplot(brfss_subset_2023, aes(x = age_cont, y = diabetes)) +
geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(
title = "Relationship Between Age and Diabetes",
subtitle = "Simple Linear Regression",
x = "Age (years)",
y = "Probability of Diabetes"
) +
theme_minimal(base_size = 12)
ggplotly(p1) %>%
layout(hovermode = "closest")Diabetes Prevalence by Age
Problem with linear regression for binary outcomes:
Solution: Logistic Regression
Uses the logit link function to ensure predicted probabilities stay between 0 and 1:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
# Simple logistic regression: diabetes ~ age
model_logistic_simple <- glm(diabetes ~ age_cont,
data = brfss_subset_2023,
family = binomial(link = "logit"))
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.029 | 0.423 | -8.390 | 0 | 0.012 | 0.064 |
| age_cont | 1.034 | 0.007 | 4.978 | 0 | 1.021 | 1.048 |
Interpretation:
Predicted Diabetes Probability by Age
# Generate predicted probabilities
pred_data <- data.frame(age_cont = seq(18, 80, by = 1))
pred_data$predicted_prob <- predict(model_logistic_simple,
newdata = pred_data,
type = "response")
# Plot
p2 <- ggplot(pred_data, aes(x = age_cont, y = predicted_prob)) +
geom_line(color = "darkred", linewidth = 1.5) +
geom_ribbon(aes(ymin = predicted_prob - 0.02,
ymax = predicted_prob + 0.02),
alpha = 0.2, fill = "darkred") +
labs(
title = "Predicted Probability of Diabetes by Age",
subtitle = "Simple Logistic Regression",
x = "Age (years)",
y = "Predicted Probability of Diabetes"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.6)) +
theme_minimal(base_size = 12)
ggplotly(p2)Predicted Diabetes Probability by Age
A confounder is a variable that:
Example: The relationship between age and diabetes may be confounded by BMI, physical activity, and other factors.
# Multiple logistic regression with potential confounders
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
phys_active + current_smoker + education,
data = brfss_subset_2023,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.009 | 1.177 | -4.001 | 0.000 | 0.000 | 0.065 |
| age_cont | 1.041 | 0.007 | 5.515 | 0.000 | 1.027 | 1.057 |
| sexMale | 1.191 | 0.154 | 1.133 | 0.257 | 0.880 | 1.613 |
| bmi_catNormal | 1.971 | 1.052 | 0.645 | 0.519 | 0.378 | 36.309 |
| bmi_catOverweight | 3.155 | 1.044 | 1.101 | 0.271 | 0.621 | 57.679 |
| bmi_catObese | 6.834 | 1.041 | 1.845 | 0.065 | 1.354 | 124.675 |
| phys_active | 0.589 | 0.157 | -3.373 | 0.001 | 0.433 | 0.802 |
| current_smoker | 1.213 | 0.178 | 1.085 | 0.278 | 0.852 | 1.716 |
| educationHigh school graduate | 0.634 | 0.288 | -1.579 | 0.114 | 0.364 | 1.131 |
| educationSome college | 0.542 | 0.294 | -2.081 | 0.037 | 0.307 | 0.977 |
| educationCollege graduate | 0.584 | 0.305 | -1.763 | 0.078 | 0.324 | 1.074 |
Interpretation:
Categorical variables with \(k\) levels are represented using \(k-1\) dummy variables (indicator variables).
Education has 4 levels: 1. < High school (reference category) 2. High school graduate 3. Some college 4. College graduate
R automatically creates 3 dummy variables:
# Extract dummy variable coding
dummy_table <- data.frame(
Education = c("< High school", "High school graduate", "Some college", "College graduate"),
`Dummy 1 (HS grad)` = c(0, 1, 0, 0),
`Dummy 2 (Some college)` = c(0, 0, 1, 0),
`Dummy 3 (College grad)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for Education",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| Education | Dummy 1 (HS grad) | Dummy 2 (Some college) | Dummy 3 (College grad) |
|---|---|---|---|
| < High school | 0 | 0 | 0 |
| High school graduate | 1 | 0 | 0 |
| Some college | 0 | 1 | 0 |
| College graduate | 0 | 0 | 1 |
Reference Category: The category with all zeros (< High school) is the reference group. All other categories are compared to this reference.
# Extract education coefficients
educ_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "education")) %>%
mutate(
education_level = str_remove(term, "education"),
education_level = factor(education_level,
levels = c("High school graduate",
"Some college",
"College graduate"))
)
# Add reference category
ref_row <- data.frame(
term = "education< High school",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
education_level = factor("< High school (Ref)",
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate"))
)
educ_coefs_full <- bind_rows(ref_row, educ_coefs) %>%
mutate(education_level = factor(education_level,
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate")))
# Plot
p3 <- ggplot(educ_coefs_full, aes(x = education_level, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between Education and Diabetes",
subtitle = "Adjusted Odds Ratios (reference: < High school)",
x = "Education Level",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)Odds Ratios for Education Levels
# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
include = c("education"),
variable_labels = c(
education = "Education"),
facet_labeller = ggplot2::label_wrap_gen(10)
)An interaction exists when the effect of one variable on the outcome differs across levels of another variable.
Epidemiologic term: Effect modification
Does the effect of age on diabetes differ between males and females?
# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * sex + bmi_cat + phys_active,
data = brfss_subset_2023 ,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × Sex Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.031 | 0.009 | 3.178 | 0.001 | 1.012 | 1.051 |
| age_cont:sexMale | 1.015 | 0.014 | 1.084 | 0.278 | 0.988 | 1.044 |
Interpretation:
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "sex"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and Sex",
subtitle = "Testing for Age × Sex Interaction",
x = "Age (years)",
y = "Predicted Probability of Diabetes",
color = "Sex",
fill = "Sex"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
scale_fill_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)Age-Diabetes Relationship by Sex
Every regression model makes assumptions about the data. If assumptions are violated, results may be invalid.
Variance Inflation Factor (VIF): Measures how much the variance of a coefficient is inflated due to correlation with other predictors.
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.05 | Low (No concern) |
| current_smoker | current_smoker | 1.05 | Low (No concern) |
| phys_active | phys_active | 1.02 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| education | education | 1.01 | Low (No concern) |
| bmi_cat | bmi_cat | 1.01 | Low (No concern) |
Cook’s Distance: Measures how much the model would change if an observation were removed.
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)Cook’s Distance for Influential Observations
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Use Likelihood Ratio Test to compare nested models:
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
data = brfss_subset_2023,
family = binomial)
# Model 2: Age + Sex
model2 <- glm(diabetes ~ age_cont + sex,
data = brfss_subset_2023,
family = binomial)
# Model 3: Full model
model3 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + Sex",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + Sex | 1175.85 | 1191.32 | 1169.85 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |
Interpretation:
All statistical models include an error term (\(\epsilon\)) to account for:
\[Y = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon\]
Key points:
In this lab, you will:
# YOUR CODE HERE: Create a frequency table of hypertension status
htn_freq <- brfss_subset_2023 %>%
count(hypertension) %>%
mutate(
Percent = round(100 * n / sum(n), 1),
hypertension = ifelse(hypertension == 1,
"Hypertension",
"No Hypertension")
)
htn_freq %>%
kable(caption = "Frequency of Hypertension Status",
align = "lrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| hypertension | n | Percent |
|---|---|---|
| No Hypertension | 606 | 47.3 |
| Hypertension | 675 | 52.7 |
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
prev_age_table <- brfss_subset_2023 %>%
group_by(age_group) %>%
summarise(
N = n(),
Hypertension_Cases = sum(hypertension == 1, na.rm = TRUE),
Prevalence_Percent = round(100 * mean(hypertension == 1, na.rm = TRUE), 1)
)
prev_age_table %>%
kable(caption = "Prevalence of Hypertension by Age Group",
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| age_group | N | Hypertension_Cases | Prevalence_Percent |
|---|---|---|---|
| 18-24 | 12 | 1 | 8.3 |
| 25-34 | 77 | 15 | 19.5 |
| 35-44 | 138 | 42 | 30.4 |
| 45-54 | 161 | 61 | 37.9 |
| 55-64 | 266 | 137 | 51.5 |
| 65+ | 627 | 419 | 66.8 |
Questions:
What is the overall prevalence of hypertension in the dataset? The overall prevalence of hypertension in the dataset is 52.7%
How does hypertension prevalence vary by age group? Hypertension prevalence increases with age, starting from 8.3% among adults aged 18–24 to 66.8% among those aged 65 years and older. This pattern suggests a strong positive association between age and hypertension in this data.
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_simple <- glm(hypertension ~ age_cont,
data = brfss_subset_2023,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
Questions:
What is the odds ratio for age? Interpret this value. The odds ratio for age is 1.055, meaning that each 1 year of age is associated with a 5.5% increase in the odds of hypertension.
Is the association statistically significant? he association is statistically significant (p < 0.001).
What is the 95% confidence interval for the odds ratio? The 95% confidence interval (1.045–1.065).
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
# YOUR CODE HERE: Display the results
model_logistic_multiple <- glm(hypertension ~ age_cont + sex + bmi_cat +
phys_active + current_smoker + education,
data = brfss_subset_2023,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Hypertenstion ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.012 | 0.691 | -6.435 | 0.000 | 0.003 | 0.043 |
| age_cont | 1.062 | 0.005 | 11.299 | 0.000 | 1.051 | 1.073 |
| sexMale | 1.245 | 0.124 | 1.772 | 0.076 | 0.977 | 1.586 |
| bmi_catNormal | 2.110 | 0.548 | 1.362 | 0.173 | 0.761 | 6.819 |
| bmi_catOverweight | 3.301 | 0.545 | 2.191 | 0.028 | 1.198 | 10.613 |
| bmi_catObese | 6.674 | 0.547 | 3.470 | 0.001 | 2.415 | 21.535 |
| phys_active | 0.930 | 0.132 | -0.552 | 0.581 | 0.717 | 1.204 |
| current_smoker | 1.054 | 0.140 | 0.376 | 0.707 | 0.802 | 1.388 |
| educationHigh school graduate | 0.713 | 0.265 | -1.275 | 0.202 | 0.421 | 1.194 |
| educationSome college | 0.554 | 0.267 | -2.212 | 0.027 | 0.326 | 0.930 |
| educationCollege graduate | 0.651 | 0.274 | -1.566 | 0.117 | 0.378 | 1.109 |
Questions:
How did the odds ratio for age change after adjusting for other variables? The odds ratio is 1.062 which means after adjusting for sex, BMI, physical activity, smoking, and education, each 1-year increase in age is associated with a 6.2 increase in the odds of hypertension.
What does this suggest about confounding? This suggests confounding may not convey the true relationship between the exposure and outcome.
Which variables are the strongest predictors of hypertension? Overall, the variables that are the strongest predictors of hypertension is age and higher BMI (either in the overweight or obese categories).
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
dummy_table <- data.frame(
bmi__cat = c("Underweight", "Normal", "Overweight", "Obese"),
`Dummy 1 (Normal)` = c(0, 1, 0, 0),
`Dummy 2 (Overweight)` = c(0, 0, 1, 0),
`Dummy 3 (Obese)` = c(0, 0, 0, 1),
check.names = FALSE
)
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
dummy_table %>%
kable(caption = "Dummy Variable Coding for BMI",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| bmi__cat | Dummy 1 (Normal) | Dummy 2 (Overweight) | Dummy 3 (Obese) |
|---|---|---|---|
| Underweight | 0 | 0 | 0 |
| Normal | 1 | 0 | 0 |
| Overweight | 0 | 1 | 0 |
| Obese | 0 | 0 | 1 |
Questions:
What is the reference category for BMI? The reference category is underweight.
Interpret the odds ratio for “Obese” compared to the reference category. The odds ratio for Obese is 6.674. This means that individuals classified as obese have approximately 6.67 times the odds of hypertension compared to individuals in the reference BMI category (Underweight).
How would you explain this to a non-statistician? People who are classified as obese are approximately 6 times more likely to have hypertension compared to people in the lowest weight category.
# YOUR CODE HERE: Fit a model with Age × BMI interaction
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
data = brfss_subset_2023 ,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × BMI Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.004 | 0.042 | 0.102 | 0.918 | 0.929 | 1.108 |
| age_cont:bmi_catNormal | 1.058 | 0.043 | 1.306 | 0.192 | 0.957 | 1.147 |
| age_cont:bmi_catOverweight | 1.063 | 0.043 | 1.423 | 0.155 | 0.962 | 1.151 |
| age_cont:bmi_catObese | 1.054 | 0.042 | 1.232 | 0.218 | 0.954 | 1.140 |
# Fit the model without the interaction
model_no_interaction <- glm(
hypertension ~ age_cont + bmi_cat,
data = brfss_subset_2023,
family = binomial(link = "logit")
)
# Generate predicted probabilities by BMI
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "bmi_cat"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.15, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and BMI",
subtitle = "Testing for Age × BMI Interaction",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "BMI",
fill = "BMI"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c(
"Underweight" = "#E64B35",
"Normal" = "#4DBBD5",
"Overweight" = "#1B9E77",
"Obese" = "#E7298A")) +
scale_fill_manual(values = c(
"Underweight" = "#E64B35",
"Normal" = "#4DBBD5",
"Overweight" = "#1B9E77",
"Obese" = "#E7298A")) +
guides(fill = "none") +
theme_minimal(base_size = 10) +
theme(legend.position = "bottom")
ggplotly(p4)# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
# Likelihood ratio test
lrt_result <- anova(model_interaction, model_no_interaction, test = "LRT")
lrt_result## Analysis of Deviance Table
##
## Model 1: hypertension ~ age_cont * bmi_cat
## Model 2: hypertension ~ age_cont + bmi_cat
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1273 1566.1
## 2 1276 1568.1 -3 -1.9645 0.5798
Questions:
Is the interaction term statistically significant? The p-value is 0.5798, meaning it is not statistically significant
What does this mean in epidemiologic terms (effect modification)? Since the interaction is not significant, we can interpret that the effect of age on hypertension is roughly the same across all BMI categories.
Create a visualization showing predicted probabilities by age and BMI category
# YOUR CODE HERE: Calculate VIF for your multiple regression model
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.07 | Low (No concern) |
| current_smoker | current_smoker | 1.04 | Low (No concern) |
| phys_active | phys_active | 1.02 | Low (No concern) |
| bmi_cat | bmi_cat | 1.02 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| education | education | 1.01 | Low (No concern) |
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Questions:
Are there any concerns about multicollinearity? There are no concerns about multicollinearity.
Are there any influential observations that might affect your results? There are no influential observations that affect my results.
What would you do if you found serious violations? If I found serious violations, I would either adjust the model by removing problematic predictor or remove an observation.
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
model1 <- glm(hypertension ~ age_cont,
data = brfss_subset_2023,
family = binomial)
# Model B: Age + sex + bmi_cat
model2 <- glm(hypertension ~ age_cont + sex + bmi_cat,
data = brfss_subset_2023,
family = binomial)
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
model3 <- glm(hypertension ~ age_cont + sex +bmi_cat + current_smoker,
data = brfss_subset_2023,
family = binomial)
# YOUR CODE HERE: Create a comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + Sex + BMI",
"Model 3: Age + Sex + BMI + Physical Activity + Current Smoker"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1636.61 | 1646.92 | 1632.61 |
| Model 2: Age + Sex + BMI | 1576.49 | 1607.42 | 1564.49 |
| Model 3: Age + Sex + BMI + Physical Activity + Current Smoker | 1578.15 | 1614.24 | 1564.15 |
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
lrt_1_2## Analysis of Deviance Table
##
## Model 1: hypertension ~ age_cont
## Model 2: hypertension ~ age_cont + sex + bmi_cat
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 1279 1632.6
## 2 1275 1564.5 4 68.126 5.643e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Questions:
Which model has the best fit based on AIC? Model 2 has the best fit based on AIC.
Is the added complexity of the full model justified? No, the added complexity is not justified. Model 3 does not show a significant increase from Model 1.
Which model would you choose for your final analysis? Why? I would choose Model 2 because it has the lowest AIC, and by adding physical activity and current smoker, the AIC increases and deviance barely improves meaning these extra variables don’t meaningfully improve the model’s predictive ability.
Write a brief report (1-2 pages) summarizing your findings:
Submission: Submit your completed R Markdown file and knitted HTML report.
Research Question: How do different factors such as age, sex, BMI, physical activity, smoking status, and education affect hypertension (high blood pressure) in adults?
Hypertension (high blood pressure) is a public health concern and the goal of this analysis is to examine the associations between factors such as age, sex, BMI, physical activity, smoking status, and education and the likelihood of hypertension (high blood pressure) in adults using logistic regression and multiple logistic regression. Furthermore, we investigated the increase in the odds of hypertension and whether BMI modifies the effect of age on hypertension risk. I used a subset from the BRFSS 2023 dataset.
First, I calculated the overall prevalence of hypertension in the dataset and the prevalence of hypertension by age group. Second, I built both a simple logistic regression model and a multiple regression model to find the odd ratio and the adjusted odds ratio. Third, I created and interpreted dummy variables for BMI. Fourth, I tested for interaction between age and BMI and performing a likelihood ratio test to compare the models with and without interaction. Then I generated the predicted probabilities by BMI. These steps were for analyzing the effect modification of BMI modifies the effect of age on hypertension risk. Fifth, I ran model diagnostics - calculating VIF for your multiple regression model and creating a Cook’s distance plot to identify any influential observations that may affect my results. Finally, I did a model comparison, comparing three models using AIC and BIC - the models being age only, age + sex + BMI, and age + sex + BMI + physical activity + current smoker.
From my analysis, the following results are below:
Task 1 (Descriptive Statistics): Hypertension prevalence increased steadily across age groups, rising from 8.3% among adults aged 18-24 to 66.8% among those aged 65 years and older. This pattern indicates a strong positive association between age and hypertension. The overall prevalence of hypertension in the dataset is 52.7%.
Task 2 (Logistic Regression Results): In the unadjusted logistic regression model, the odds ratio (OR) for age was 1.055 (95% CI: 1.045-1.065, p < 0.001). This indicates that each additional year of age is associated with a 5.5% increase in the odds of hypertension, and this association is statistically significant.
Task 3 (Multiple Logistic Regression Model): After adjusting for sex, BMI, physical activity, smoking status, and education, the odds ratio for age increased slightly to 1.062, indicating that each additional year of age is associated with a 6.2% increase in the odds of hypertension. The small change in the age coefficient after adjustment suggests minimal confounding by the other variables included in the model or that confounding may not convey the true relationship between the exposure and outcome. The strongest predictors of hypertension were age and BMI category particularly in the overweight and obese categories.
Task 4 (Dummy Variables): Individuals classified as obese had an odds ratio of 6.674, meaning they had approximately 6.7 times the odds of hypertension compared to individuals in the reference category (underweight).
Task 5 (Interaction): The interaction between age and BMI was not statistically significant, the p-value = 0.5798. This indicates that the effect of age on hypertension does not significantly differ across BMI categories.
Task 6 (Model Diagnostics): Multicollinearity: VIF values ranged from 1.01 to 1.07, indicating no concerns. No influential data points were identified that substantially affected my results.
Task 7: Model fit was evaluated using AIC, BIC, and deviance: Model 1 (Age only): AIC = 1636.61, Model 2 (Age + Sex + BMI): AIC = 1576.49, and Model 3 (Full model): AIC = 1578.15. Model 2 had the lowest AIC, indicating the best fit model complexity. Although Model 3 included additional variables (physical activity and smoking status), its AIC increased slightly and deviance improved only minimally. Therefore, Model 2 was selected as the final model.
The results indicate that age and BMI are the primary determinants of hypertension risk in this dataset. Age demonstrated a strong and statistically significant association with hypertension. The steady increase in prevalence across age groups and the consistent odds ratio in both unadjusted and adjusted models suggest that aging is an independent and important risk factor. BMI, particularly obesity, showed a strong association with hypertension. Individuals classified as obese had substantially higher odds of hypertension compared to those in the reference category, highlighting excess body weight as a major modifiable risk factor. The absence of a significant age-by-BMI interaction suggests that the effect of age on hypertension is consistent across BMI categories. In epidemiologic terms, BMI increases overall risk but does not modify how age influences hypertension. Model comparison supported the selection of Model 2. Although additional behavioral variables were considered, they did not meaningfully improve model performance. Overall, these findings reinforce established evidence that aging and higher BMI (also body fat which BMI does not account for) are key contributors to hypertension risk and should remain central targets for prevention and public health interventions.
There are some limitations to account for in my analysis. One, the cross-sectional design prevents establishing causality between risk factors and hypertension. Second, many variables are self-reported, which may introduce recall or reporting bias. Residual confounding is also possible, as not all relevant factors (such as diet or family history) were included in the model. Finally, small sample sizes in certain BMI categories may reduce the precision of some estimates. Despite these limitations, the findings provide useful key predictors of hypertension in this dataset.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.12.0 gtsummary_2.5.0 ggeffects_2.3.2 car_3.1-3
## [5] carData_3.0-5 broom_1.0.11 plotly_4.12.0 kableExtra_1.4.0
## [9] knitr_1.51 haven_2.5.5 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1 readr_2.1.5
## [17] tidyr_1.3.2 tibble_3.3.0 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.10.0
## [4] htmlwidgets_1.6.4 insight_1.4.5 lattice_0.22-7
## [7] tzdb_0.5.0 crosstalk_1.2.2 vctrs_0.7.1
## [10] tools_4.5.1 generics_0.1.4 datawizard_1.3.0
## [13] pkgconfig_2.0.3 Matrix_1.7-3 data.table_1.18.0
## [16] RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.5
## [19] compiler_4.5.1 farver_2.1.2 textshaping_1.0.4
## [22] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12
## [25] lazyeval_0.2.2 Formula_1.2-5 pillar_1.11.0
## [28] jquerylib_0.1.4 broom.helpers_1.22.0 cachem_1.1.0
## [31] abind_1.4-8 nlme_3.1-168 tidyselect_1.2.1
## [34] digest_0.6.39 stringi_1.8.7 labeling_0.4.3
## [37] splines_4.5.1 labelled_2.16.0 fastmap_1.2.0
## [40] grid_4.5.1 cli_3.6.5 magrittr_2.0.3
## [43] cards_0.7.1 withr_3.0.2 scales_1.4.0
## [46] backports_1.5.0 timechange_0.3.0 rmarkdown_2.30
## [49] httr_1.4.7 otel_0.2.0 hms_1.1.3
## [52] evaluate_1.0.5 viridisLite_0.4.2 mgcv_1.9-3
## [55] rlang_1.1.7 glue_1.8.0 xml2_1.5.2
## [58] svglite_2.2.2 rstudioapi_0.18.0 jsonlite_2.0.0
## [61] R6_2.6.1 systemfonts_1.3.1