# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.
# Load the BRFSS 2023 subset
brfss_subset_2023 <- read_rds("C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/lab05/brfss_subset_2023.rds")
# Display dataset dimensions
names(brfss_subset_2023)## [1] "diabetes" "age_group" "age_cont" "sex"
## [5] "race" "education" "income" "bmi_cat"
## [9] "phys_active" "current_smoker" "gen_health" "hypertension"
## [13] "high_chol"
## # A tibble: 15 × 13
## diabetes age_group age_cont sex race education income bmi_cat phys_active
## <dbl> <fct> <dbl> <fct> <fct> <fct> <fct> <fct> <dbl>
## 1 0 65+ 70 Female White Some col… $75,0… Obese 1
## 2 0 35-44 39.5 Male Black Some col… Unkno… Obese 1
## 3 0 65+ 70 Male White College … Unkno… Normal 1
## 4 0 65+ 70 Female White High sch… $50,0… Normal 1
## 5 0 65+ 70 Female White High sch… $50,0… Overwe… 1
## 6 0 65+ 70 Male White College … $75,0… Normal 1
## 7 0 65+ 70 Male White College … $50,0… Normal 0
## 8 0 65+ 70 Male Black College … Unkno… Overwe… 0
## 9 0 65+ 70 Female White College … Unkno… Normal 1
## 10 0 65+ 70 Male White High sch… $50,0… Normal 0
## 11 0 45-54 49.5 Female Hisp… < High s… $50,0… Obese 1
## 12 0 65+ 70 Male White Some col… $75,0… Normal 1
## 13 1 65+ 70 Male Hisp… High sch… < $25… Overwe… 1
## 14 0 25-34 29.5 Male White College … $50,0… Normal 1
## 15 0 65+ 70 Male White High sch… $75,0… Obese 1
## # ℹ 4 more variables: current_smoker <dbl>, gen_health <fct>,
## # hypertension <dbl>, high_chol <dbl>
In this lab, you will:
# YOUR CODE HERE: Create a frequency table of hypertension status
# Summary table by diabetes status
table(brfss_subset_2023$hypertension)##
## 0 1
## 606 675
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
desc_table <- brfss_subset_2023 %>%
group_by(age_group) %>%
summarise(
N = n(),
`% hypertension` = round(100 * mean(hypertension), 1)
)
desc_table## # A tibble: 6 × 3
## age_group N `% hypertension`
## <fct> <int> <dbl>
## 1 18-24 12 8.3
## 2 25-34 77 19.5
## 3 35-44 138 30.4
## 4 45-54 161 37.9
## 5 55-64 266 51.5
## 6 65+ 627 66.8
kable(desc_table, caption = "Descriptive Statistics of Hypertension by age group")%>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| age_group | N | % hypertension |
|---|---|---|
| 18-24 | 12 | 8.3 |
| 25-34 | 77 | 19.5 |
| 35-44 | 138 | 30.4 |
| 45-54 | 161 | 37.9 |
| 55-64 | 266 | 51.5 |
| 65+ | 627 | 66.8 |
Questions:
What is the overall prevalence of hypertension in the dataset? Overall prevalence of hypertension in the dataset is approximately 52.7%, indicating that about half of the participants have hypertension.
How does hypertension prevalence vary by age group? Hypertension prevalence increases with age group. Older age groups have higher hypertension prevalence compared to younger age groups
# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_simple <- glm(hypertension ~ age_cont,
data = brfss_subset_2023,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.048 | 0.296 | -10.293 | 0 | 0.026 | 0.084 |
| age_cont | 1.055 | 0.005 | 10.996 | 0 | 1.045 | 1.065 |
Questions:
# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple <- glm(hypertension ~ age_cont + sex + bmi_cat +
phys_active + current_smoker,
data = brfss_subset_2023,
family = binomial(link = "logit"))
# YOUR CODE HERE: Display the results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
kable(caption = "Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
scroll_box(height = "400px")| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.008 | 0.653 | -7.355 | 0.000 | 0.002 | 0.028 |
| age_cont | 1.061 | 0.005 | 11.234 | 0.000 | 1.050 | 1.073 |
| sexMale | 1.270 | 0.123 | 1.950 | 0.051 | 0.999 | 1.616 |
| bmi_catNormal | 2.097 | 0.546 | 1.356 | 0.175 | 0.759 | 6.756 |
| bmi_catOverweight | 3.241 | 0.543 | 2.166 | 0.030 | 1.183 | 10.385 |
| bmi_catObese | 6.585 | 0.545 | 3.459 | 0.001 | 2.394 | 21.176 |
| phys_active | 0.900 | 0.130 | -0.808 | 0.419 | 0.697 | 1.162 |
| current_smoker | 1.071 | 0.139 | 0.495 | 0.621 | 0.817 | 1.407 |
Questions:
# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
levels(brfss_subset_2023$bmi_cat)## [1] "Underweight" "Normal" "Overweight" "Obese"
#dummy variables for BMI_cat
dummy_table <- data.frame(
bmi_cat = c("Underweight","Normal", "Overweight", "Obese"),
`Dummy 1 (Normal)` = c(0,1,0,0),
`Dummy 2 (Overweight)` = c(0,0,1,0),
`Dummy 3 (Obese)` = c(0,0,0,1),
check.names = FALSE
)
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
dummy_table %>%
kable(caption = "Dummy Variable Coding for BMI categories",
align = "lccc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(1, bold = TRUE, background = "#ffe6e6") # Highlight reference category| bmi_cat | Dummy 1 (Normal) | Dummy 2 (Overweight) | Dummy 3 (Obese) |
|---|---|---|---|
| Underweight | 0 | 0 | 0 |
| Normal | 1 | 0 | 0 |
| Overweight | 0 | 1 | 0 |
| Obese | 0 | 0 | 1 |
# Extract education coefficients
bmi_cat <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "bmi_cat")) %>%
mutate(
bmi_cat = str_remove(term, "bmi_cat"),
bmi_cat = factor(bmi_cat,
levels = c("Underweight",
"Normal",
"Overweight",
"Obese"))
)
# Add reference category
ref_row <- data.frame(
term = "bmi_catNormal",
estimate = 1.0,
std.error = 0,
statistic = NA,
p.value = NA,
conf.low = 1.0,
conf.high = 1.0,
bmi_cat = factor("Underweight (Ref)",
levels = c("Underweight (Ref)",
"Normal",
"Overweight",
"Obese"))
)
bmi_cat_full <- bind_rows(ref_row, bmi_cat) %>%
mutate(bmi_cat = factor(bmi_cat,
levels = c("Underweight (Ref)",
"Normal",
"Overweight",
"Obese")))
bmi_cat_full <- bmi_cat_full %>%
filter(!is.na(bmi_cat))
# Plot
p3 <- ggplot(bmi_cat_full, aes(x = bmi_cat, y = estimate)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "darkred") +
geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
size = 0.8, color = "darkblue") +
coord_flip() +
labs(
title = "Association Between BMI categories and Hypertension",
subtitle = "Adjusted Odds Ratios (reference: Normal)",
x = "BMI category",
y = "Odds Ratio (95% CI)"
) +
theme_minimal()
ggplotly(p3)Odds Ratios for BMI categories
Questions:
# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
data = brfss_subset_2023,
family = binomial(link = "logit"))
# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
filter(str_detect(term, "age_cont")) %>%
kable(caption = "Age × BMI category Interaction Model (Odds Ratios)",
digits = 3,
col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.004 | 0.042 | 0.102 | 0.918 | 0.929 | 1.108 |
| age_cont:bmi_catNormal | 1.058 | 0.043 | 1.306 | 0.192 | 0.957 | 1.147 |
| age_cont:bmi_catOverweight | 1.063 | 0.043 | 1.423 | 0.155 | 0.962 | 1.151 |
| age_cont:bmi_catObese | 1.054 | 0.042 | 1.232 | 0.218 | 0.954 | 1.140 |
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "bmi_cat"))
# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
geom_line(linewidth = 1.2) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
labs(
title = "Predicted Probability of Hypertension by Age and BMI category",
subtitle = "Testing for Age × BMI category Interaction",
x = "Age (years)",
y = "Predicted Probability of Hypertension",
color = "BMI category",
fill = "BMI category"
) +
scale_y_continuous(labels = scales::percent_format()) +
scale_color_manual(values = c("Normal" = "#50C878", "Overweight" = "#FFFF00", "Obese" = "#E64B35")) +
scale_fill_manual(values = c("Normal" = "#50C878", "Overweight" = "#FFFF00", "Obese" = "#E64B35")) +
theme_minimal() +
theme(legend.position = "bottom")
ggplotly(p4)Questions:
# YOUR CODE HERE: Calculate VIF for your multiple regression model
vif_values <- vif(model_logistic_multiple)
# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
# If matrix (categorical variables present), extract GVIF^(1/(2*Df))
vif_df <- data.frame(
Variable = rownames(vif_values),
VIF = vif_values[, "GVIF^(1/(2*Df))"]
)
} else {
# If vector (only continuous variables)
vif_df <- data.frame(
Variable = names(vif_values),
VIF = as.numeric(vif_values)
)
}
# Add interpretation
vif_df <- vif_df %>%
arrange(desc(VIF)) %>%
mutate(
Interpretation = case_when(
VIF < 5 ~ "Low (No concern)",
VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
VIF >= 10 ~ "High (Problem)"
)
)
vif_df %>%
kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
digits = 2,
align = "lrc") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
row_spec(which(vif_df$VIF < 5), background = "#90EE90")| Variable | VIF | Interpretation | |
|---|---|---|---|
| age_cont | age_cont | 1.06 | Low (No concern) |
| current_smoker | current_smoker | 1.04 | Low (No concern) |
| bmi_cat | bmi_cat | 1.02 | Low (No concern) |
| phys_active | phys_active | 1.01 | Low (No concern) |
| sex | sex | 1.01 | Low (No concern) |
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model_logistic_multiple)
# Create data frame
influence_df <- data.frame(
observation = 1:length(cooks_d),
cooks_d = cooks_d
) %>%
mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))
# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
labs(
title = "Cook's Distance: Identifying Influential Observations",
subtitle = "Values > 1 indicate potentially influential observations",
x = "Observation Number",
y = "Cook's Distance",
color = "Influential?"
) +
scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
theme_minimal(base_size = 12)
ggplotly(p5)# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")## Number of potentially influential observations: 0
Questions:
# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
model1 <- glm(hypertension ~ age_cont,
data = brfss_subset_2023,
family = binomial)
# Model B: Age + sex + bmi_cat
model2 <- glm(hypertension ~ age_cont + sex + bmi_cat,
data = brfss_subset_2023,
family = binomial)
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
model3 <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
data = brfss_subset_2023,
family = binomial)
# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")
# YOUR CODE HERE: Create a comparison table
model_comp <- data.frame(
Model = c("Model 1: Age only",
"Model 2: Age + sex + bmi_cat + phys_active + current_smoker",
"Model 3: Full model"),
AIC = c(AIC(model1), AIC(model2), AIC(model3)),
BIC = c(BIC(model1), BIC(model2), BIC(model3)),
`Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
check.names = FALSE
)
model_comp %>%
kable(caption = "Model Comparison: AIC, BIC, and Deviance",
digits = 2,
align = "lrrr") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE) %>%
row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1636.61 | 1646.92 | 1632.61 |
| Model 2: Age + sex + bmi_cat + phys_active + current_smoker | 1576.49 | 1607.42 | 1564.49 |
| Model 3: Full model | 1579.50 | 1620.74 | 1563.50 |
Questions:
Write a brief report (1-2 pages) summarizing your findings:
Submission: Submit your completed R Markdown file and knitted HTML report.
Logistic Regression:
\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
Odds Ratio:
\[\text{OR} = e^{\beta_i}\]
Predicted Probability:
\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]
Session Info
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggstats_0.12.0 gtsummary_2.5.0 ggeffects_2.3.2 car_3.1-3
## [5] carData_3.0-5 broom_1.0.11 plotly_4.12.0 kableExtra_1.4.0
## [9] knitr_1.51 haven_2.5.5 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.6.0 dplyr_1.1.4 purrr_1.2.1 readr_2.1.6
## [17] tidyr_1.3.2 tibble_3.3.1 ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.56 bslib_0.9.0 htmlwidgets_1.6.4
## [5] insight_1.4.4 tzdb_0.5.0 crosstalk_1.2.2 vctrs_0.7.1
## [9] tools_4.5.2 generics_0.1.4 datawizard_1.3.0 pkgconfig_2.0.3
## [13] data.table_1.18.0 RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.5
## [17] compiler_4.5.2 farver_2.1.2 textshaping_1.0.4 codetools_0.2-20
## [21] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12 lazyeval_0.2.2
## [25] Formula_1.2-5 pillar_1.11.1 jquerylib_0.1.4 cachem_1.1.0
## [29] abind_1.4-8 tidyselect_1.2.1 digest_0.6.39 stringi_1.8.7
## [33] labeling_0.4.3 fastmap_1.2.0 grid_4.5.2 cli_3.6.5
## [37] magrittr_2.0.4 utf8_1.2.6 withr_3.0.2 scales_1.4.0
## [41] backports_1.5.0 timechange_0.3.0 rmarkdown_2.30 httr_1.4.7
## [45] otel_0.2.0 hms_1.1.4 evaluate_1.0.5 viridisLite_0.4.2
## [49] rlang_1.1.7 glue_1.8.0 xml2_1.5.2 svglite_2.2.2
## [53] rstudioapi_0.18.0 jsonlite_2.0.0 R6_2.6.1 systemfonts_1.3.1