Setup and Data Preparation

# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)

Loading BRFSS 2023 Data

The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.

# Load the BRFSS 2023 subset
brfss_subset_2023 <- read_rds("C:/Users/AA843241/OneDrive - University at Albany - SUNY/Desktop/EPI553/lab05/brfss_subset_2023.rds")
 

# Display dataset dimensions
names(brfss_subset_2023)
##  [1] "diabetes"       "age_group"      "age_cont"       "sex"           
##  [5] "race"           "education"      "income"         "bmi_cat"       
##  [9] "phys_active"    "current_smoker" "gen_health"     "hypertension"  
## [13] "high_chol"
head(brfss_subset_2023, 15)
## # A tibble: 15 × 13
##    diabetes age_group age_cont sex    race  education income bmi_cat phys_active
##       <dbl> <fct>        <dbl> <fct>  <fct> <fct>     <fct>  <fct>         <dbl>
##  1        0 65+           70   Female White Some col… $75,0… Obese             1
##  2        0 35-44         39.5 Male   Black Some col… Unkno… Obese             1
##  3        0 65+           70   Male   White College … Unkno… Normal            1
##  4        0 65+           70   Female White High sch… $50,0… Normal            1
##  5        0 65+           70   Female White High sch… $50,0… Overwe…           1
##  6        0 65+           70   Male   White College … $75,0… Normal            1
##  7        0 65+           70   Male   White College … $50,0… Normal            0
##  8        0 65+           70   Male   Black College … Unkno… Overwe…           0
##  9        0 65+           70   Female White College … Unkno… Normal            1
## 10        0 65+           70   Male   White High sch… $50,0… Normal            0
## 11        0 45-54         49.5 Female Hisp… < High s… $50,0… Obese             1
## 12        0 65+           70   Male   White Some col… $75,0… Normal            1
## 13        1 65+           70   Male   Hisp… High sch… < $25… Overwe…           1
## 14        0 25-34         29.5 Male   White College … $50,0… Normal            1
## 15        0 65+           70   Male   White High sch… $75,0… Obese             1
## # ℹ 4 more variables: current_smoker <dbl>, gen_health <fct>,
## #   hypertension <dbl>, high_chol <dbl>

Part 2: Student Lab Activity

Lab Overview

In this lab, you will:

  1. Build your own logistic regression model predicting hypertension (high blood pressure)
  2. Create dummy variables for categorical predictors
  3. Interpret regression coefficients
  4. Test for confounding and interaction
  5. Perform model diagnostics

Lab Instructions

Task 1: Explore the Outcome Variable

# YOUR CODE HERE: Create a frequency table of hypertension status
# Summary table by diabetes status
table(brfss_subset_2023$hypertension)
## 
##   0   1 
## 606 675
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
desc_table <- brfss_subset_2023 %>%
  group_by(age_group) %>%
  summarise(
    N = n(),
    `% hypertension` = round(100 * mean(hypertension), 1)
  )

desc_table
## # A tibble: 6 × 3
##   age_group     N `% hypertension`
##   <fct>     <int>            <dbl>
## 1 18-24        12              8.3
## 2 25-34        77             19.5
## 3 35-44       138             30.4
## 4 45-54       161             37.9
## 5 55-64       266             51.5
## 6 65+         627             66.8
  kable(desc_table, caption = "Descriptive Statistics of Hypertension by age group")%>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Descriptive Statistics of Hypertension by age group
age_group N % hypertension
18-24 12 8.3
25-34 77 19.5
35-44 138 30.4
45-54 161 37.9
55-64 266 51.5
65+ 627 66.8

Questions:

  1. What is the overall prevalence of hypertension in the dataset? Overall prevalence of hypertension in the dataset is approximately 52.7%, indicating that about half of the participants have hypertension.

  2. How does hypertension prevalence vary by age group? Hypertension prevalence increases with age group. Older age groups have higher hypertension prevalence compared to younger age groups


Task 2: Build a Simple Logistic Regression Model

# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model_logistic_simple <- glm(hypertension ~ age_cont,
                              data = brfss_subset_2023,
                              family = binomial(link = "logit"))

# YOUR CODE HERE: Display the results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)
Term Odds Ratio Std. Error z-statistic p-value 95% CI Lower 95% CI Upper
(Intercept) 0.048 0.296 -10.293 0 0.026 0.084
age_cont 1.055 0.005 10.996 0 1.045 1.065

Questions:

  1. What is the odds ratio for age? Interpret this value. Odds Ratio for age is 1.055. It means that with each increase in age odss of hypertension increases 5.5%
  2. Is the association statistically significant? Yes association is statistically significant p-value<0.05< CI doesn’t cross the null value
  3. What is the 95% confidence interval for the odds ratio? 95% CI (1.045-1.065) —

Task 3: Create a Multiple Regression Model

# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple <- glm(hypertension ~ age_cont + sex + bmi_cat +
                                phys_active + current_smoker,
                               data = brfss_subset_2023,
                               family = binomial(link = "logit"))

# YOUR CODE HERE: Display the results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  scroll_box(height = "400px")
Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)
Term Odds Ratio Std. Error z-statistic p-value 95% CI Lower 95% CI Upper
(Intercept) 0.008 0.653 -7.355 0.000 0.002 0.028
age_cont 1.061 0.005 11.234 0.000 1.050 1.073
sexMale 1.270 0.123 1.950 0.051 0.999 1.616
bmi_catNormal 2.097 0.546 1.356 0.175 0.759 6.756
bmi_catOverweight 3.241 0.543 2.166 0.030 1.183 10.385
bmi_catObese 6.585 0.545 3.459 0.001 2.394 21.176
phys_active 0.900 0.130 -0.808 0.419 0.697 1.162
current_smoker 1.071 0.139 0.495 0.621 0.817 1.407

Questions:

  1. How did the odds ratio for age change after adjusting for other variables? Odds ratio for ahe increases after adjusting for other variables (5.5% with +1 age to 6.1%), indicating small confounding effect.
  2. What does this suggest about confounding? Because the odds ratio for age changed only slightly after adjustment, this suggests minimal confounding by sex, BMI category, physical activity, and smoking status. Age appears to be an independent predictor of hypertension.
  3. Which variables are the strongest predictors of hypertension? Individuals with obesity had about 6.6 times higher odds of hypertension compared to the reference group. —

Task 4: Interpret Dummy Variables

# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
levels(brfss_subset_2023$bmi_cat)
## [1] "Underweight" "Normal"      "Overweight"  "Obese"
#dummy variables for BMI_cat
dummy_table <- data.frame(
  bmi_cat = c("Underweight","Normal", "Overweight", "Obese"),
  `Dummy 1 (Normal)` = c(0,1,0,0),
  `Dummy 2 (Overweight)` = c(0,0,1,0),
  `Dummy 3 (Obese)` = c(0,0,0,1),
  check.names = FALSE
)

# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
dummy_table %>%
  kable(caption = "Dummy Variable Coding for BMI categories",
        align = "lccc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#ffe6e6")  # Highlight reference category
Dummy Variable Coding for BMI categories
bmi_cat Dummy 1 (Normal) Dummy 2 (Overweight) Dummy 3 (Obese)
Underweight 0 0 0
Normal 1 0 0
Overweight 0 1 0
Obese 0 0 1
# Extract education coefficients
bmi_cat <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "bmi_cat")) %>%
  mutate(
    bmi_cat = str_remove(term, "bmi_cat"),
    bmi_cat = factor(bmi_cat,
                             levels = c("Underweight",
                                        "Normal",
                                       "Overweight",
                                       "Obese"))
  )

# Add reference category
ref_row <- data.frame(
  term = "bmi_catNormal",
  estimate = 1.0,
  std.error = 0,
  statistic = NA,
  p.value = NA,
  conf.low = 1.0,
  conf.high = 1.0,
  bmi_cat = factor("Underweight (Ref)",
                          levels = c("Underweight (Ref)",
                                     "Normal",
                                    "Overweight",
                                    "Obese"))
)

bmi_cat_full <- bind_rows(ref_row, bmi_cat) %>%
  mutate(bmi_cat = factor(bmi_cat,
                                 levels = c("Underweight (Ref)",
                                            "Normal",
                                           "Overweight",
                                           "Obese")))
bmi_cat_full <- bmi_cat_full %>%
  filter(!is.na(bmi_cat))

# Plot
p3 <- ggplot(bmi_cat_full, aes(x = bmi_cat, y = estimate)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "darkred") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                  size = 0.8, color = "darkblue") +
  coord_flip() +
  labs(
    title = "Association Between BMI categories and Hypertension",
    subtitle = "Adjusted Odds Ratios (reference: Normal)",
    x = "BMI category",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal()

ggplotly(p3)

Odds Ratios for BMI categories

Questions:

  1. What is the reference category for BMI? Reference category for BMI is “Underweight”
  2. Interpret the odds ratio for “Obese” compared to the reference category. Odds of hypertension among individuals with obesity are about 6.59 times the odds among individuals who are underweight, adjusting for other covariates in the model.
  3. How would you explain this to a non-statistician? Individuals with obesity are much more likely to develop hypertension compared to individuals who are underweight. Specifically, people with obesity have almost 7 times higher likelihood of hypertension after accounting for other factors in the model.

Task 5: Test for Interaction

# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
                         data = brfss_subset_2023,
                         family = binomial(link = "logit"))

# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "age_cont")) %>%
  kable(caption = "Age × BMI category Interaction Model (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Age × BMI category Interaction Model (Odds Ratios)
Term Odds Ratio Std. Error z-statistic p-value 95% CI Lower 95% CI Upper
age_cont 1.004 0.042 0.102 0.918 0.929 1.108
age_cont:bmi_catNormal 1.058 0.043 1.306 0.192 0.957 1.147
age_cont:bmi_catOverweight 1.063 0.043 1.423 0.155 0.962 1.151
age_cont:bmi_catObese 1.054 0.042 1.232 0.218 0.954 1.140
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "bmi_cat"))

# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(linewidth = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
  labs(
    title = "Predicted Probability of Hypertension by Age and BMI category",
    subtitle = "Testing for Age × BMI category Interaction",
    x = "Age (years)",
    y = "Predicted Probability of Hypertension",
    color = "BMI category",
    fill = "BMI category"
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = c("Normal" = "#50C878", "Overweight" = "#FFFF00", "Obese" = "#E64B35")) +
  scale_fill_manual(values = c("Normal" = "#50C878", "Overweight" = "#FFFF00", "Obese" = "#E64B35")) +
  theme_minimal() +
  theme(legend.position = "bottom")

ggplotly(p4)

Questions:

  1. Is the interaction term statistically significant? Interaction term is not statistically significant p-value>0.05 and 95% Confidence intervals include null value.
  2. What does this mean in epidemiologic terms (effect modification)? This means that the association between age and hypertension isn’t significantly different across all BMI categories.
  3. Create a visualization showing predicted probabilities by age and BMI category

Task 6: Model Diagnostics

# YOUR CODE HERE: Calculate VIF for your multiple regression model
vif_values <- vif(model_logistic_multiple)

# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
  # If matrix (categorical variables present), extract GVIF^(1/(2*Df))
  vif_df <- data.frame(
    Variable = rownames(vif_values),
    VIF = vif_values[, "GVIF^(1/(2*Df))"]
  )
} else {
  # If vector (only continuous variables)
  vif_df <- data.frame(
    Variable = names(vif_values),
    VIF = as.numeric(vif_values)
  )
}

# Add interpretation
vif_df <- vif_df %>%
  arrange(desc(VIF)) %>%
  mutate(
    Interpretation = case_when(
      VIF < 5 ~ "Low (No concern)",
      VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
      VIF >= 10 ~ "High (Problem)"
    )
  )

vif_df %>%
  kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
        digits = 2,
        align = "lrc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
  row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
  row_spec(which(vif_df$VIF < 5), background = "#90EE90")
Variance Inflation Factors (VIF) for Multiple Regression Model
Variable VIF Interpretation
age_cont age_cont 1.06 Low (No concern)
current_smoker current_smoker 1.04 Low (No concern)
bmi_cat bmi_cat 1.02 Low (No concern)
phys_active phys_active 1.01 Low (No concern)
sex sex 1.01 Low (No concern)
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model_logistic_multiple)

# Create data frame
influence_df <- data.frame(
  observation = 1:length(cooks_d),
  cooks_d = cooks_d
) %>%
  mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))

# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Cook's Distance: Identifying Influential Observations",
    subtitle = "Values > 1 indicate potentially influential observations",
    x = "Observation Number",
    y = "Cook's Distance",
    color = "Influential?"
  ) +
  scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
  theme_minimal(base_size = 12)

ggplotly(p5)
# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")
## Number of potentially influential observations: 0

Questions:

  1. Are there any concerns about multicollinearity? No serious concerns were detected, because VIF < 5: Generally acceptable
  2. Are there any influential observations that might affect your results? There are no influential observations that might affect my result according to Cook’s distance
  3. What would you do if you found serious violations? For multicollinearity- removing or combining highly correlated variables. For influential observations- examine whether the observations are data entry errors, outliers, or legitimate extreme values.

Task 7: Model Comparison

# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
model1 <- glm(hypertension ~ age_cont,
              data = brfss_subset_2023,
              family = binomial)
# Model B: Age + sex + bmi_cat
model2 <- glm(hypertension ~ age_cont + sex + bmi_cat,
              data = brfss_subset_2023,
              family = binomial)
# Model C: Age + sex + bmi_cat + phys_active + current_smoker
model3 <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
              data = brfss_subset_2023,
              family = binomial)

# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")

# YOUR CODE HERE: Create a comparison table
model_comp <- data.frame(
  Model = c("Model 1: Age only",
            "Model 2: Age + sex + bmi_cat + phys_active + current_smoker",
            "Model 3: Full model"),
  AIC = c(AIC(model1), AIC(model2), AIC(model3)),
  BIC = c(BIC(model1), BIC(model2), BIC(model3)),
  `Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
  check.names = FALSE
)

model_comp %>%
  kable(caption = "Model Comparison: AIC, BIC, and Deviance",
        digits = 2,
        align = "lrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")
Model Comparison: AIC, BIC, and Deviance
Model AIC BIC Deviance
Model 1: Age only 1636.61 1646.92 1632.61
Model 2: Age + sex + bmi_cat + phys_active + current_smoker 1576.49 1607.42 1564.49
Model 3: Full model 1579.50 1620.74 1563.50

Questions:

  1. Which model has the best fit based on AIC? Model 2 has the best fit based on AIC
  2. Is the added complexity of the full model justified? It is not justified because it does not meaningfully improve model fit (AIC is slightly higher)
  3. Which model would you choose for your final analysis? Why? Model 2, because it is model with the lowest AIC and a substantially lower BIC compared to the full model. —

Lab Report Guidelines

Write a brief report (1-2 pages) summarizing your findings:

  1. Introduction: State your research question
  2. Methods: Describe your analytic approach
  3. Results: Present key findings with tables and figures
  4. Interpretation: Explain what your results mean
  5. Limitations: Discuss potential issues with your analysis

Submission: Submit your completed R Markdown file and knitted HTML report.


Summary

Key Concepts Covered

  1. Statistical modeling describes relationships between variables
  2. Regression types depend on the outcome variable type
  3. Logistic regression is appropriate for binary outcomes
  4. Multiple regression controls for confounding
  5. Dummy variables represent categorical predictors
  6. Interactions test for effect modification
  7. Model diagnostics check assumptions and identify problems
  8. Model comparison helps select the best model

Important Formulas

Logistic Regression:

\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]

Odds Ratio:

\[\text{OR} = e^{\beta_i}\]

Predicted Probability:

\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]


References

  • Agresti, A. (2018). An Introduction to Categorical Data Analysis (3rd ed.). Wiley.
  • Hosmer, D. W., Lemeshow, S., & Sturdivant, R. X. (2013). Applied Logistic Regression (3rd ed.). Wiley.
  • Vittinghoff, E., Glidden, D. V., Shiboski, S. C., & McCulloch, C. E. (2012). Regression Methods in Biostatistics (2nd ed.). Springer.
  • Centers for Disease Control and Prevention. (2023). Behavioral Risk Factor Surveillance System.

Session Info

sessionInfo()
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggstats_0.12.0   gtsummary_2.5.0  ggeffects_2.3.2  car_3.1-3       
##  [5] carData_3.0-5    broom_1.0.11     plotly_4.12.0    kableExtra_1.4.0
##  [9] knitr_1.51       haven_2.5.5      lubridate_1.9.4  forcats_1.0.1   
## [13] stringr_1.6.0    dplyr_1.1.4      purrr_1.2.1      readr_2.1.6     
## [17] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.1    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.56          bslib_0.9.0        htmlwidgets_1.6.4 
##  [5] insight_1.4.4      tzdb_0.5.0         crosstalk_1.2.2    vctrs_0.7.1       
##  [9] tools_4.5.2        generics_0.1.4     datawizard_1.3.0   pkgconfig_2.0.3   
## [13] data.table_1.18.0  RColorBrewer_1.1-3 S7_0.2.1           lifecycle_1.0.5   
## [17] compiler_4.5.2     farver_2.1.2       textshaping_1.0.4  codetools_0.2-20  
## [21] htmltools_0.5.9    sass_0.4.10        yaml_2.3.12        lazyeval_0.2.2    
## [25] Formula_1.2-5      pillar_1.11.1      jquerylib_0.1.4    cachem_1.1.0      
## [29] abind_1.4-8        tidyselect_1.2.1   digest_0.6.39      stringi_1.8.7     
## [33] labeling_0.4.3     fastmap_1.2.0      grid_4.5.2         cli_3.6.5         
## [37] magrittr_2.0.4     utf8_1.2.6         withr_3.0.2        scales_1.4.0      
## [41] backports_1.5.0    timechange_0.3.0   rmarkdown_2.30     httr_1.4.7        
## [45] otel_0.2.0         hms_1.1.4          evaluate_1.0.5     viridisLite_0.4.2 
## [49] rlang_1.1.7        glue_1.8.0         xml2_1.5.2         svglite_2.2.2     
## [53] rstudioapi_0.18.0  jsonlite_2.0.0     R6_2.6.1           systemfonts_1.3.1