# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)# Summary table by diabetes status
desc_table <- brfss_clean %>%
group_by(diabetes) %>%
summarise(
N = n(),
`Mean Age` = round(mean(age_cont), 1),
`% Male` = round(100 * mean(sex == "Male"), 1),
`% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
`% Physically Active` = round(100 * mean(phys_active), 1),
`% Current Smoker` = round(100 * mean(current_smoker), 1),
`% Hypertension` = round(100 * mean(hypertension), 1),
`% High Cholesterol` = round(100 * mean(high_chol), 1)
) %>%
mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))
desc_table %>%
kable(caption = "Descriptive Statistics by Diabetes Status",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |
In this lab, you will:
##
## 0 1
## 606 675
## # A tibble: 1 × 1
## `% hypertension`
## <dbl>
## 1 0.527
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
hyper_table <- brfss_clean %>%
group_by(age_group) %>%
summarise(
N = n(),
`% Hypertension` = round(100 * mean(hypertension), 1),
)
hyper_table %>%
kable(caption = "Hypertension Prevalence by Age",
align = "lrrrrrrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| age_group | N | % Hypertension |
|---|---|---|
| 18-24 | 12 | 8.3 |
| 25-34 | 77 | 19.5 |
| 35-44 | 138 | 30.4 |
| 45-54 | 161 | 37.9 |
| 55-64 | 266 | 51.5 |
| 65+ | 627 | 66.8 |
Questions:
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_simple <- glm(hypertension ~ age_cont,
data = brfss_clean,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
Questions:
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
phys_active + current_smoker,
data = brfss_clean,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results
# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.006 | 1.153 | -4.477 | 0.000 | 0.000 | 0.039 |
| age_cont | 1.040 | 0.007 | 5.445 | 0.000 | 1.026 | 1.056 |
| sexMale | 1.214 | 0.154 | 1.264 | 0.206 | 0.899 | 1.642 |
| bmi_catNormal | 2.012 | 1.051 | 0.665 | 0.506 | 0.386 | 37.062 |
| bmi_catOverweight | 3.208 | 1.043 | 1.118 | 0.264 | 0.632 | 58.630 |
| bmi_catObese | 6.993 | 1.041 | 1.869 | 0.062 | 1.388 | 127.528 |
| phys_active | 0.566 | 0.155 | -3.668 | 0.000 | 0.418 | 0.768 |
| current_smoker | 1.239 | 0.177 | 1.212 | 0.226 | 0.873 | 1.747 |
Questions:
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
summary(brfss_clean$bmi_cat)## Underweight Normal Overweight Obese
## 20 320 447 494
dummy_table <- data.frame(
BMI_cat = c("< Underweight", "Normal", "Overweight", "Obese"),
`Dummy 1 (normal)` = c(0, 1, 0, 0),
`Dummy 2 (over)` = c(0, 0, 1, 0),
`Dummy 3 (obese)` = c(0, 0, 0, 1),
check.names = FALSE
)
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
dummy_table %>%
kable(caption = "Dummy Variable Coding for BMI",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| BMI_cat | Dummy 1 (normal) | Dummy 2 (over) | Dummy 3 (obese) |
|---|---|---|---|
| < Underweight | 0 | 0 | 0 |
| Normal | 1 | 0 | 0 |
| Overweight | 0 | 1 | 0 |
| Obese | 0 | 0 | 1 |
Questions:
# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * bmi_cat,
data = brfss_clean,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × BMI Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 3.530 | 40.609 | 0.031 | 0.975 | 853638056 | 84628851 |
| age_cont:bmi_catNormal | 0.291 | 40.609 | -0.030 | 0.976 | 0 | 0 |
| age_cont:bmi_catOverweight | 0.296 | 40.609 | -0.030 | 0.976 | 0 | 0 |
| age_cont:bmi_catObese | 0.294 | 40.609 | -0.030 | 0.976 | 0 | 0 |
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial)
# Model 2: Interaction model
model2 <- model_interaction
# Model 3: Full model
model3 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# Create comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age and BMI Interaction",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age and BMI Interaction | 1135.64 | 1176.88 | 1119.64 |
| Model 3: Full model | 1120.93 | 1162.17 | 1104.93 |
Questions:
# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont", "bmi_cat"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Diabetes by Age and BMI",
subtitle = "Testing for Age × BMI Interaction",
x = "Age (years)",
y = "Predicted Probability of Diabetes",
color = "bmi_cat",
fill = "bmi_cat"
) +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
ggplotly(p4)# YOUR CODE HERE: Calculate VIF for your multiple regression model
# Calculate VIF
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.05 | Low (No concern) |
| current_smoker | current_smoker | 1.04 | Low (No concern) |
| phys_active | phys_active | 1.01 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| bmi_cat | bmi_cat | 1.01 | Low (No concern) |
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)Questions:
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
# Model B: Age + sex + bmi_cat
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
# Model 1: Age only
model4 <- glm(diabetes ~ age_cont,
data = brfss_clean,
family = binomial)
# Model 2: Age + BMI
model5 <- glm(diabetes ~ age_cont + bmi_cat,
data = brfss_clean,
family = binomial)
# Model 3: Full model
model6 <- model_logistic_multiple
# Likelihood ratio test
lrt_1_2 <- anova(model4, model5, test = "LRT")
lrt_2_3 <- anova(model5, model6, test = "LRT")
# YOUR CODE HERE: Create a comparison table
model_comp_2 <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + BMI",
"Model 3: Full model"),
AIC = c(AIC(model4), AIC(model5), AIC(model6)),
BIC = c(BIC(model4), BIC(model5), BIC(model6)),
`Deviance` = c(deviance(model4), deviance(model5), deviance(model6)),
check.names = FALSE
)
model_comp_2 %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + BMI | 1131.45 | 1157.22 | 1121.45 |
| Model 3: Full model | 1120.93 | 1162.17 | 1104.93 |
Questions:
Introduction: State your research question In this lab we analyze the association between age and hypertension. The research question concerns whether hypertension risk worsens with age. Hypertension, or high blood pressure, risk will likely increase with age as this disease is more common in older populations. In addition, this lab will determine the impact of covariates such as BMI, physical activity, sex, and smoking. These factors are all likely involved in the development of hypertension. It will be important to determine whether these counfounders have effect modifications on the association between age and hypertension.
Methods: Describe your analytic approach In this lab, we will use a multiple logistic regression model to understand the association between age and hypertension, while controlling for confounding variables. Post hoc testing will be done to ensure that these confounders are not causing effect modification. In addition, multicollinearity and likelihood ratio tests will be performed.
Results: Present key findings with tables and figures Hypertension was analyzed by age group to elucidate trends.
|
age_group |
N |
% Hypertension |
|---|---|---|
|
18-24 |
12 |
8.3 |
|
25-34 |
77 |
19.5 |
|
35-44 |
138 |
30.4 |
|
45-54 |
161 |
37.9 |
|
55-64 |
266 |
51.5 |
|
65+ |
627 |
66.8 |
| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.006 | 1.153 | -4.477 | 0.000 | 0.000 | 0.039 |
| age_cont | 1.040 | 0.007 | 5.445 | 0.000 | 1.026 | 1.056 |
| sexMale | 1.214 | 0.154 | 1.264 | 0.206 | 0.899 | 1.642 |
| bmi_catNormal | 2.012 | 1.051 | 0.665 | 0.506 | 0.386 | 37.062 |
| bmi_catOverweight | 3.208 | 1.043 | 1.118 | 0.264 | 0.632 | 58.630 |
| bmi_catObese | 6.993 | 1.041 | 1.869 | 0.062 | 1.388 | 127.528 |
| phys_active | 0.566 | 0.155 | -3.668 | 0.000 | 0.418 | 0.768 |
| current_smoker | 1.239 | 0.177 | 1.212 | 0.226 | 0.873 | 1.747 |
| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.05 | Low (No concern) |
| current_smoker | current_smoker | 1.04 | Low (No concern) |
| phys_active | phys_active | 1.01 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
| bmi_cat | bmi_cat | 1.01 | Low (No concern) |
| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + BMI | 1131.45 | 1157.22 | 1121.45 |
| Model 3: Full model | 1120.93 | 1162.17 | 1104.93 |
Model analytics showed that BMI does not impart effect modification on age and hypertension. This means that the association between age and hypertension does not differ across BMI groups. In addition, all VIF values showed low concern of multicollinearity. The full model including all confounders was found to have the lowest AIC, meaning the best fit model.