Statistical modeling is a fundamental tool in epidemiology that allows us to:
This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.
# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)
# Load brfss_subset_2023.rds
eval=TRUE
brfss_clean <- read_rds('~/Desktop/EPI553/data/brfss_subset_2023.rds') eval=TRUE brfss_clean <- read_rds(‘~/Desktop/EPI553/data/brfss_subset_2023.rds’) ## Descriptive Statistics
# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |
A statistical model is a mathematical representation of the relationship between:
\[f(Y) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon\]
Where:
The choice of regression model depends on the type of outcome variable:
| Outcome Type | Regression Type | Link Function | Example |
|---|---|---|---|
| Continuous | Linear | Identity: Y | Blood pressure, BMI |
| Binary | Logistic | Logit: log(p/(1-p)) | Disease status, mortality |
| Count | Poisson/Negative Binomial | Log: log(Y) | Number of infections |
| Time-to-event | Cox Proportional Hazards | Log: log(h(t)) | Survival time |
Let’s model the relationship between age and diabetes prevalence.
# Simple linear regression: diabetes ~ age
model_linear_simple <- lm(diabetes ~ age_cont, data = brfss_clean)
# Display results
tidy(model_linear_simple, conf.int = TRUE) %>%
kable(caption = "Simple Linear Regression: Diabetes ~ Age",
digits = 4,
col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | -0.0632 | 0.0481 | -1.3125 | 0.1896 | -0.1576 | 0.0312 |
| age_cont | 0.0041 | 0.0008 | 5.1368 | 0.0000 | 0.0025 | 0.0056 |
Interpretation:
# Create scatter plot with regression line
p1 <- ggplot(brfss_clean, aes(x = age_cont, y = diabetes)) +
geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
labs(
title = "Relationship Between Age and Diabetes",
subtitle = "Simple Linear Regression",
x = "Age (years)",
y = "Probability of Diabetes"
) +
theme_minimal(base_size = 12)
ggplotly(p1) %>%
layout(hovermode = "closest")Diabetes Prevalence by Age
Problem with linear regression for binary outcomes:
Solution: Logistic Regression
Uses the logit link function to ensure predicted probabilities stay between 0 and 1:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
# Simple logistic regression: diabetes ~ age
model_logistic_simple <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.029 | 0.423 | -8.390 | 0 | 0.012 | 0.064 |
| age_cont | 1.034 | 0.007 | 4.978 | 0 | 1.021 | 1.048 |
Interpretation:
Predicted Diabetes Probability by Age
# Generate predicted probabilities
pred_data <- data.frame(age_cont = seq(18, 80, by = 1))
pred_data$predicted_prob <- predict(model_logistic_simple,
newdata = pred_data,
type = "response")
# Plot
p2 <- ggplot(pred_data, aes(x = age_cont, y = predicted_prob)) +
geom_line(color = "darkred", linewidth = 1.5) +
geom_ribbon(aes(ymin = predicted_prob - 0.02,
ymax = predicted_prob + 0.02),
alpha = 0.2, fill = "darkred") +
labs(
title = "Predicted Probability of Diabetes by Age",
subtitle = "Simple Logistic Regression",
x = "Age (years)",
y = "Predicted Probability of Diabetes"
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.6)) +
theme_minimal(base_size = 12)
ggplotly(p2)Predicted Diabetes Probability by Age
A confounder is a variable that:
Example: The relationship between age and diabetes may be confounded by BMI, physical activity, and other factors.
# Multiple logistic regression with potential confounders
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
phys_active + current_smoker + education,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.009 | 1.177 | -4.001 | 0.000 | 0.000 | 0.065 |
| age_cont | 1.041 | 0.007 | 5.515 | 0.000 | 1.027 | 1.057 |
| sexMale | 1.191 | 0.154 | 1.133 | 0.257 | 0.880 | 1.613 |
| bmi_catNormal | 1.971 | 1.052 | 0.645 | 0.519 | 0.378 | 36.309 |
| bmi_catOverweight | 3.155 | 1.044 | 1.101 | 0.271 | 0.621 | 57.679 |
| bmi_catObese | 6.834 | 1.041 | 1.845 | 0.065 | 1.354 | 124.675 |
| phys_active | 0.589 | 0.157 | -3.373 | 0.001 | 0.433 | 0.802 |
| current_smoker | 1.213 | 0.178 | 1.085 | 0.278 | 0.852 | 1.716 |
| educationHigh school graduate | 0.634 | 0.288 | -1.579 | 0.114 | 0.364 | 1.131 |
| educationSome college | 0.542 | 0.294 | -2.081 | 0.037 | 0.307 | 0.977 |
| educationCollege graduate | 0.584 | 0.305 | -1.763 | 0.078 | 0.324 | 1.074 |
Interpretation:
Categorical variables with \(k\) levels are represented using \(k-1\) dummy variables (indicator variables).
Education has 4 levels: 1. < High school (reference category) 2. High school graduate 3. Some college 4. College graduate
R automatically creates 3 dummy variables:
# Extract dummy variable coding
dummy_table <- data.frame(
Education = c("< High school", "High school graduate", "Some college", "College graduate"),
`Dummy 1 (HS grad)` = c(0, 1, 0, 0),
`Dummy 2 (Some college)` = c(0, 0, 1, 0),
`Dummy 3 (College grad)` = c(0, 0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for Education",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| Education | Dummy 1 (HS grad) | Dummy 2 (Some college) | Dummy 3 (College grad) |
|---|---|---|---|
| < High school | 0 | 0 | 0 |
| High school graduate | 1 | 0 | 0 |
| Some college | 0 | 1 | 0 |
| College graduate | 0 | 0 | 1 |
Reference Category: The category with all zeros (< High school) is the reference group. All other categories are compared to this reference.
# Extract education coefficients
educ_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "education")) %>%
mutate(
education_level = str_remove(term, "education"),
education_level = factor(education_level,
levels = c("High school graduate",
"Some college",
"College graduate"))
)
# Add reference category
ref_row <- data.frame(
term = "education< High school",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
education_level = factor("< High school (Ref)",
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate"))
)
educ_coefs_full <- bind_rows(ref_row, educ_coefs) %>%
mutate(education_level = factor(education_level,
levels = c("< High school (Ref)",
"High school graduate",
"Some college",
"College graduate")))
# Plot
p3 <- ggplot(educ_coefs_full, aes(x = education_level, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between Education and Diabetes",
subtitle = "Adjusted Odds Ratios (reference: < High school)",
x = "Education Level",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)Odds Ratios for Education Levels
# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
include = c("education"),
variable_labels = c(
education = "Education"),
facet_labeller = ggplot2::label_wrap_gen(10)
)An interaction exists when the effect of one variable on the outcome differs across levels of another variable.
Epidemiologic term: Effect modification
Does the effect of age on diabetes differ between males and females?
# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * sex + bmi_cat + phys_active,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × Sex Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.031 | 0.009 | 3.178 | 0.001 | 1.012 | 1.051 |
| age_cont:sexMale | 1.015 | 0.014 | 1.084 | 0.278 | 0.988 | 1.044 |
Interpretation:
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "sex"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and Sex",
subtitle = "Testing for Age × Sex Interaction",
x = "Age (years)",
y = "Predicted Probability of Diabetes",
color = "Sex",
fill = "Sex"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
scale_fill_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)Age-Diabetes Relationship by Sex
Every regression model makes assumptions about the data. If assumptions are violated, results may be invalid.
Variance Inflation Factor (VIF): Measures how much the variance of a coefficient is inflated due to correlation with other predictors.
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.05 | Low (No concern) |
| current_smoker | current_smoker | 1.05 | Low (No concern) |
| phys_active | phys_active | 1.02 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| education | education | 1.01 | Low (No concern) |
| bmi_cat | bmi_cat | 1.01 | Low (No concern) |
Cook’s Distance: Measures how much the model would change if an observation were removed.
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)Cook’s Distance for Influential Observations
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Use Likelihood Ratio Test to compare nested models:
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial)
# Model 2: Age + Sex
model2 <- glm(diabetes ~ age_cont + sex,
data = brfss_clean,
family = binomial)
# Model 3: Full model
model3 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + Sex",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + Sex | 1175.85 | 1191.32 | 1169.85 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |
Interpretation:
All statistical models include an error term (\(\epsilon\)) to account for:
\[Y = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon\]
Key points:
In this lab, you will:
# Load the full BRFSS 2023 dataset
brfss_clean <- read_rds('~/Desktop/EPI553/data/brfss_subset_2023.rds') %>%
janitor::clean_names()
# Display dataset dimensions
names(brfss_clean)## [1] "diabetes" "age_group" "age_cont" "sex"
## [5] "race" "education" "income" "bmi_cat"
## [9] "phys_active" "current_smoker" "gen_health" "hypertension"
## [13] "high_chol"
## # A tibble: 6 × 13
## diabetes age_group age_cont sex race education income bmi_cat phys_active
## <dbl> <fct> <dbl> <fct> <fct> <fct> <fct> <fct> <dbl>
## 1 0 65+ 70 Female White Some coll… $75,0… Obese 1
## 2 0 35-44 39.5 Male Black Some coll… Unkno… Obese 1
## 3 0 65+ 70 Male White College g… Unkno… Normal 1
## 4 0 65+ 70 Female White High scho… $50,0… Normal 1
## 5 0 65+ 70 Female White High scho… $50,0… Overwe… 1
## 6 0 65+ 70 Male White College g… $75,0… Normal 1
## # ℹ 4 more variables: current_smoker <dbl>, gen_health <fct>,
## # hypertension <dbl>, high_chol <dbl>
# Create a frequency table of hypertension status
brfss_clean %>%
count(hypertension) %>%
mutate(
percent = n/ sum(n)* 100
)## # A tibble: 2 × 3
## hypertension n percent
## <dbl> <int> <dbl>
## 1 0 606 47.3
## 2 1 675 52.7
# Calculate the prevalence of hypertension by age group
brfss_clean %>%
group_by(age_group) %>%
summarise(
n=n(),
prevalence= mean(hypertension==1, na.rm=TRUE)
)## # A tibble: 6 × 3
## age_group n prevalence
## <fct> <int> <dbl>
## 1 18-24 12 0.0833
## 2 25-34 77 0.195
## 3 35-44 138 0.304
## 4 45-54 161 0.379
## 5 55-64 266 0.515
## 6 65+ 627 0.668
# Overall prevalence
brfss_clean %>%
summarise(
n=n(),
prevalence= mean(hypertension==1, na.rm=TRUE)
) %>%
kable(caption= "Overall Prevalence of Hypertension")| n | prevalence |
|---|---|
| 1281 | 0.5269321 |
Questions:
# Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_slr <- glm(hypertension ~ age_cont,
data=brfss_clean,
family= binomial(link= "logit"))
# Display the results with odds ratios
tidy(model_slr, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
Questions:
# Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple <- glm(hypertension~ age_cont + sex + bmi_cat +
phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit"))
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Hypertension ~ Age + Predictors (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.008 | 0.653 | -7.355 | 0.000 | 0.002 | 0.028 |
| age_cont | 1.061 | 0.005 | 11.234 | 0.000 | 1.050 | 1.073 |
| sexMale | 1.270 | 0.123 | 1.950 | 0.051 | 0.999 | 1.616 |
| bmi_catNormal | 2.097 | 0.546 | 1.356 | 0.175 | 0.759 | 6.756 |
| bmi_catOverweight | 3.241 | 0.543 | 2.166 | 0.030 | 1.183 | 10.385 |
| bmi_catObese | 6.585 | 0.545 | 3.459 | 0.001 | 2.394 | 21.176 |
| phys_active | 0.900 | 0.130 | -0.808 | 0.419 | 0.697 | 1.162 |
| current_smoker | 1.071 | 0.139 | 0.495 | 0.621 | 0.817 | 1.407 |
Questions:
#check bmi variable names tidy(model_logistic_multiple)
# Create a table showing the dummy variable coding for bmi_cat
dummy_table <- data.frame(
BMI = c("normal", "overweight", "obese"),
`Dummy 1 (normal)` = c(1, 0, 0),
`Dummy 2 (overweight)` = c(0, 1, 0),
`Dummy 3 (obese)` = c(0, 0, 1),
check.names = FALSE
)
dummy_table %>%
kable(caption = "Dummy Variable Coding for BMI",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| BMI | Dummy 1 (normal) | Dummy 2 (overweight) | Dummy 3 (obese) |
|---|---|---|---|
| normal | 1 | 0 | 0 |
| overweight | 0 | 1 | 0 |
| obese | 0 | 0 | 1 |
# Extract
bmi_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "bmi")) %>%
mutate(
bmi_cat = str_remove(term, "bmi"),
bmi_cat = factor(bmi_cat,
levels = c("bmi_catNormal",
"bmi_catOverweight",
"bmi_catObese"))
)
# Add reference category
ref_row <- data.frame(
term = "Normal BMI",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
bmi_cat = factor("Normal (Ref)",
levels = c("bmi_catNormal (Ref)",
"bmi_catOverweight",
"bmi_catObese"))
)
bmi_coefs_full <- bind_rows(ref_row, bmi_coefs) %>%
mutate(bmi_cat = factor(bmi_cat,
levels = c("bmi_catNormal (Ref)",
"bmi_catOverweight",
"bmi_catObese")))
# Display
p3 <- ggplot(bmi_coefs_full, aes(x = bmi_cat, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between BMI and Hypertension",
subtitle = "Adjusted Odds Ratios (reference: Normal BMI)",
x = "BMI Category",
y = "Odds Ratio (95% CI)"
) +
theme_minimal(base_size = 12)
ggplotly(p3)Questions:
# Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "BMI × Age Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.004 | 0.042 | 0.102 | 0.918 | 0.929 | 1.108 |
| age_cont:bmi_catNormal | 1.058 | 0.043 | 1.306 | 0.192 | 0.957 | 1.147 |
| age_cont:bmi_catOverweight | 1.063 | 0.043 | 1.423 | 0.155 | 0.962 | 1.151 |
| age_cont:bmi_catObese | 1.054 | 0.042 | 1.232 | 0.218 | 0.954 | 1.140 |
library(ggeffects)
library(plotly)
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]","bmi_cat"))
# Visualize
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Hypertension by Age and BMI",
subtitle = "Testing for Age × BMI Interaction",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "BMI",
fill = "BMI"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Normal" = "#E64B35", "Overweight" = "#4DBBD5", "Obese"= "#ABB087")) +
scale_fill_manual(values = c("Normal" = "#E64B35", "Overweight" = "#4DBBD5", "Obese"= "#ABB087")) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)# Perform a likelihood ratio test comparing models with and without interaction
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")Questions:
# Calculate VIF for your multiple regression model
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.06 | Low (No concern) |
| current_smoker | current_smoker | 1.04 | Low (No concern) |
| bmi_cat | bmi_cat | 1.02 | Low (No concern) |
| phys_active | phys_active | 1.01 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Questions:
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
model1 <- glm(hypertension ~ age_cont,
data = brfss_clean,
family = binomial)
# Model B: Age + sex + bmi_cat
model2 <- glm(hypertension ~ age_cont + bmi_cat,
data = brfss_clean,
family = binomial)
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
modelC <- model_logistic_multiple
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + BMI",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1636.61 | 1646.92 | 1632.61 |
| Model 2: Age + BMI | 1578.06 | 1603.84 | 1568.06 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |
Questions:
Write a brief report (1-2 pages) summarizing your findings:
My research was to explore the relationship between hypertension and age. Additionaly, relationships between confounders, such as BMI, sex, physical activity, and smoking were examined to see if they had any influence on my primary relationship (hypertension and age). I conducted a cross-sectional analysis. First, I calculated the prevalence of hypertension for each age group. Then, I built a simple logisitic model with age as the predictor and hypertension status as the the outcome to calculate the odds ratio for age. A multiple regression model was created to see how the odds ratio changed in the presence of confounders. After, I focused on BMI by creating a dummy variable table and testing for an interaction between age and BMI to see if the effect of age on hypertension varies by BMI status. Multicollinearity was investigated using VIF from my multiple regression model. Influential observations were identified from a Cook’s distance plot. Finally, nested models were compared. The prevalence of hypertension increased with age. The simple logistic model showed there was a statistically significant association between hypertension and age (OR=1.055, p<0.001).After adjusting for confounders, the odds ratio slightly increased indicating that other factors including BMI, sex, physical acitivty, and smoking status were masking the relationship between age and hypertension. In other words, the relationship between age and hypertension is stronger (OR= 1.061 without confounders). There was a relationship between BMI status and hypertension found when investigating their relationship, those with a normal bmi had lower odds of having hypertension. However, BMI was not an effect modifier of age and hypertension though. There were no concerns of multicolinearity found (VIF values all < 2) and there were no influential observations that might affect the results. These findings suggest a strong, postive association between age and hypertension so as age increases so does the likelihood of hypertension. Adjusting for confounders revealed that the effect of age is slightly stronger than originally observed. Individuals with a normal BMI were less likely to have hypertension, although BMI did not modify the effect of age. Since this was a cross-sectional study, causation cannot be assumed we can only see that there is a relationship between both variables (age doesn’t cause hypertension and hypertension doesn’t cause age). Also, not all possible confounders were evaluated.
Submission: Submit your completed R Markdown file and knitted HTML report.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.12.0 gtsummary_2.5.0 ggeffects_2.3.2 car_3.1-3
## [5] carData_3.0-5 broom_1.0.11 plotly_4.12.0 kableExtra_1.4.0
## [9] knitr_1.51 haven_2.5.5 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.6.0 dplyr_1.2.0 purrr_1.2.1 readr_2.1.5
## [17] tidyr_1.3.2 tibble_3.3.0 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.9.0
## [4] htmlwidgets_1.6.4 insight_1.4.5 lattice_0.22-7
## [7] tzdb_0.5.0 crosstalk_1.2.2 vctrs_0.7.1
## [10] tools_4.5.2 generics_0.1.4 datawizard_1.3.0
## [13] pkgconfig_2.0.3 Matrix_1.7-4 data.table_1.18.0
## [16] RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.5
## [19] compiler_4.5.2 farver_2.1.2 textshaping_1.0.4
## [22] janitor_2.2.1 codetools_0.2-20 snakecase_0.11.1
## [25] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12
## [28] lazyeval_0.2.2 Formula_1.2-5 pillar_1.11.0
## [31] jquerylib_0.1.4 broom.helpers_1.22.0 cachem_1.1.0
## [34] abind_1.4-8 nlme_3.1-168 tidyselect_1.2.1
## [37] digest_0.6.39 stringi_1.8.7 labeling_0.4.3
## [40] splines_4.5.2 labelled_2.16.0 fastmap_1.2.0
## [43] grid_4.5.2 cli_3.6.5 magrittr_2.0.3
## [46] cards_0.7.1 utf8_1.2.6 withr_3.0.2
## [49] scales_1.4.0 backports_1.5.0 timechange_0.3.0
## [52] rmarkdown_2.30 httr_1.4.7 otel_0.2.0
## [55] hms_1.1.3 evaluate_1.0.5 viridisLite_0.4.2
## [58] mgcv_1.9-3 rlang_1.1.7 glue_1.8.0
## [61] xml2_1.5.2 svglite_2.2.2 rstudioapi_0.18.0
## [64] jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1