Setup and Data Preparation

# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)

Loading BRFSS 2023 Data

The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.

# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("C:/Users/abbym/OneDrive/Desktop/STATS553/R Materials/epi553/scripts/LLCP2023XPT/LLCP2023.XPT") %>%
  janitor::clean_names()

# Display dataset dimensions
names(brfss_full)

##   [1] "state"    "fmonth"   "idate"    "imonth"   "iday"     "iyear"   
##   [7] "dispcode" "seqno"    "psu"      "ctelenm1" "pvtresd1" "colghous"
##  [13] "statere1" "celphon1" "ladult1"  "numadult" "respslc1" "landsex2"
##  [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1"  "cellsex2"
##  [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1"  "landline" "hhadult" 
##  [31] "sexvar"   "genhlth"  "physhlth" "menthlth" "poorhlth" "primins1"
##  [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
##  [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6" 
##  [49] "bpmeds1"  "cholchk3" "toldhi3"  "cholmed3" "cvdinfr4" "cvdcrhd4"
##  [55] "cvdstrk3" "asthma3"  "asthnow"  "chcscnc1" "chcocnc1" "chccopd3"
##  [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital" 
##  [67] "educa"    "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
##  [73] "employ1"  "children" "income3"  "pregnant" "weight2"  "height3" 
##  [79] "deaf"     "blind"    "decide"   "diffwalk" "diffdres" "diffalon"
##  [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3"  "ecignow2"
##  [91] "alcday4"  "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
##  [97] "pneuvac4" "shingle2" "hivtst7"  "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu"  "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast"  "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"  
## [127] "howlong"  "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage"  "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum" 
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry"  "cddiscu1" "cdhous1"  "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload"  "loadulk2" "hasymp1"  "hasymp2" 
## [193] "hasymp3"  "hasymp4"  "hasymp5"  "hasymp6"  "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin" 
## [205] "birthsex" "somale"   "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat"  "marjvape" "marjdab"  "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat"  "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver"   "qstlang" 
## [259] "metstat"  "urbstat"  "mscode"   "ststr"    "strwt"    "rawrake" 
## [265] "wt2rake"  "imprace"  "chispnc"  "crace1"   "cageg"    "cllcpwt" 
## [271] "dualuse"  "dualcor"  "llcpwt2"  "llcpwt"   "rfhlth"   "phys14d" 
## [277] "ment14d"  "hlthpl1"  "hcvu653"  "totinda"  "metvl12"  "metvl22" 
## [283] "maxvo21"  "fc601"    "actin13"  "actin23"  "padur1"   "padur2"  
## [289] "pafreq1"  "pafreq2"  "minac12"  "minac22"  "strfreq"  "pamiss3" 
## [295] "pamin13"  "pamin23"  "pa3min"   "pavig13"  "pavig23"  "pa3vigm" 
## [301] "pacat3"   "paindx3"  "pa150r4"  "pa300r4"  "pa30023"  "pastrng" 
## [307] "parec3"   "pastae3"  "rfhype6"  "cholch3"  "rfchol3"  "michd"   
## [313] "ltasth1"  "casthm1"  "asthms1"  "drdxar2"  "mrace1"   "hispanc" 
## [319] "race"     "raceg21"  "racegr3"  "raceprv"  "sex"      "ageg5yr" 
## [325] "age65yr"  "age80"    "age_g"    "htin4"    "htm4"     "wtkg3"   
## [331] "bmi5"     "bmi5cat"  "rfbmi5"   "chldcnt"  "educag"   "incomg1" 
## [337] "smoker3"  "rfsmok3"  "cureci2"  "drnkany6" "drocdy4"  "rfbing6" 
## [343] "drnkwk2"  "rfdrhv8"  "flshot7"  "pneumo3"  "aidtst4"  "rfseat2" 
## [349] "rfseat3"  "drnkdrv"

Creating a Working Subset

For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.

# Select variables of interest and create analytic dataset
set.seed(553)  # For reproducibility

brfss_subset <- brfss_full %>%
  select(
    # Outcome: Diabetes status
    diabete4,
    # Demographics
    age_g,      # Age category
    sex,         # Sex
    race,       # Race/ethnicity
    educag,     # Education level
    incomg1,    # Income category
    # Health behaviors
    bmi5cat,    # BMI category
    exerany2,     # Physical activity
    smokday2,     # Smoking frequency
    # Health status
    genhlth,      # General health
    rfhype6,    # High blood pressure
    rfchol3     # High cholesterol
  ) %>%
  # Filter to complete cases only
  drop_na() %>%
  # Sample 2000 observations for manageable analysis
  slice_sample(n = 2000)

# Display subset dimensions
cat("Working subset dimensions:",
    nrow(brfss_subset), "observations,",
    ncol(brfss_subset), "variables\n")

## Working subset dimensions: 2000 observations, 12 variables

Data Recoding and Cleaning

# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
  mutate(
    # Outcome: Diabetes (binary)
    diabetes = case_when(
      diabete4 == 1 ~ 1,  # Yes
      diabete4 %in% c(2, 3, 4) ~ 0,  # No, pre-diabetes, or gestational only
      TRUE ~ NA_real_
    ),

    # Age groups
    age_group = factor(case_when(
      age_g == 1 ~ "18-24",
      age_g == 2 ~ "25-34",
      age_g == 3 ~ "35-44",
      age_g == 4 ~ "45-54",
      age_g == 5 ~ "55-64",
      age_g == 6 ~ "65+"
    ), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),

    # Age continuous (midpoint of category)
    age_cont = case_when(
      age_g == 1 ~ 21,
      age_g == 2 ~ 29.5,
      age_g == 3 ~ 39.5,
      age_g == 4 ~ 49.5,
      age_g == 5 ~ 59.5,
      age_g == 6 ~ 70
    ),

    # Sex
    sex = factor(ifelse(sex == 1, "Male", "Female")),

    # Race/ethnicity
    race = factor(case_when(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Native American",
      race == 4 ~ "Asian",
      race == 5 ~ "Native Hawaiian/PI",
      race == 6 ~ "Other",
      race == 7 ~ "Multiracial",
      race == 8 ~ "Hispanic"
    )),

    # Education (simplified)
    education = factor(case_when(
      educag == 1 ~ "< High school",
      educag == 2 ~ "High school graduate",
      educag == 3 ~ "Some college",
      educag == 4 ~ "College graduate"
    ), levels = c("< High school", "High school graduate", "Some college", "College graduate")),

    # Income (simplified)
    income = factor(case_when(
      incomg1 == 1 ~ "< $25,000",
      incomg1 == 2 ~ "$25,000-$49,999",
      incomg1 == 3 ~ "$50,000-$74,999",
      incomg1 == 4 ~ "$75,000+",
      incomg1 == 5 ~ "Unknown"
    ), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),

    # BMI category
    bmi_cat = factor(case_when(
      bmi5cat == 1 ~ "Underweight",
      bmi5cat == 2 ~ "Normal",
      bmi5cat == 3 ~ "Overweight",
      bmi5cat == 4 ~ "Obese"
    ), levels = c("Underweight", "Normal", "Overweight", "Obese")),

    # Physical activity (binary)
    phys_active = ifelse(exerany2 == 1, 1, 0),

    # Current smoking
    current_smoker = case_when(
      smokday2 == 1 ~ 1,  # Every day
      smokday2 == 2 ~ 1,  # Some days
      smokday2 == 3 ~ 0,  # Not at all
      TRUE ~ 0
    ),

    # General health (simplified)
    gen_health = factor(case_when(
      genhlth %in% c(1, 2) ~ "Excellent/Very good",
      genhlth == 3 ~ "Good",
      genhlth %in% c(4, 5) ~ "Fair/Poor"
    ), levels = c("Excellent/Very good", "Good", "Fair/Poor")),

    # Hypertension
    hypertension = ifelse(rfhype6 == 2, 1, 0),

    # High cholesterol
    high_chol = ifelse(rfchol3 == 2, 1, 0)
  ) %>%
  # Select only the clean variables
  select(diabetes, age_group, age_cont, sex, race, education, income,
         bmi_cat, phys_active, current_smoker, gen_health,
         hypertension, high_chol) %>%
  # Remove any remaining missing values
  drop_na()

# Save the cleaned subset for future use
write_rds(brfss_clean,
          "C:/Users/abbym/OneDrive/Desktop/STATS553/R Materials/epi553/scripts/brfss_clean.rds")

cat("Clean dataset saved with", nrow(brfss_clean), "complete observations\n")

## Clean dataset saved with 1281 complete observations

Part 2: Student Lab Activity

Lab Overview

In this lab, you will:

Build your own logistic regression model predicting hypertension (high blood pressure)
Create dummy variables for categorical predictors
Interpret regression coefficients
Test for confounding and interaction
Perform model diagnostics

Lab Instructions

Task 1: Explore the Outcome Variable

# YOUR CODE HERE: Create a frequency table of hypertension status
hypertension_freq <- table(brfss_clean$hypertension)
print(hypertension_freq)

## 
##   0   1 
## 606 675

# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
hypertension_age_table <- brfss_clean %>%
  group_by(age_group) %>%
  summarise(
    N = n(),
    `% Hypertension` = round(100 * mean(hypertension), 1)
  )
 
hypertension_age_table %>%
  kable(caption = "Descriptive Statistics by Hypertension",
        align = "lrrrrrrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Descriptive Statistics by Hypertension
age_group	N	% Hypertension
18-24	12	8.3
25-34	77	19.5
35-44	138	30.4
45-54	161	37.9
55-64	266	51.5
65+	627	66.8

Questions:

What is the overall prevalence of hypertension in the dataset? The overall prevalence of hypertension in the dataset is 52.7%.
How does hypertension prevalence vary by age group? As age increases, the prevalence of hypertension increases.

Task 2: Build a Simple Logistic Regression Model

# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
# Simple linear regression: hypertension ~ age
model_logistic_simple <- glm(hypertension ~ age_cont, data = brfss_clean)


# YOUR CODE HERE: Display the results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.828	0.060	-3.150	0.002	0.736	0.931
age_cont	1.012	0.001	12.224	0.000	1.010	1.014

Questions:

What is the odds ratio for age? Interpret this value. The odds ratio for age is 1.012. For each 1 year increase in age, the odds of hypertension increases by 1.2%.
Is the association statistically significant? Yes, the association is statistically significant.
What is the 95% confidence interval for the odds ratio? The 95% confidence interval for the odds ratio is [1.010,1.014]. —

Task 3: Create a Multiple Regression Model

# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model_logistic_multiple <- glm(hypertension ~ age_cont + sex + bmi_cat +
                                phys_active + current_smoker,
                               data = brfss_clean,
                               family = binomial(link = "logit"))

# YOUR CODE HERE: Display the results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  scroll_box(height = "400px")

Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
(Intercept)	0.008	0.653	-7.355	0.000	0.002	0.028
age_cont	1.061	0.005	11.234	0.000	1.050	1.073
sexMale	1.270	0.123	1.950	0.051	0.999	1.616
bmi_catNormal	2.097	0.546	1.356	0.175	0.759	6.756
bmi_catOverweight	3.241	0.543	2.166	0.030	1.183	10.385
bmi_catObese	6.585	0.545	3.459	0.001	2.394	21.176
phys_active	0.900	0.130	-0.808	0.419	0.697	1.162
current_smoker	1.071	0.139	0.495	0.621	0.817	1.407

Questions:

How did the odds ratio for age change after adjusting for other variables? The odds ratio for age increased from 1.012 to 1.061 after adjusting for other variables.
What does this suggest about confounding? This suggests that confounding diminishes the relationship between age and risk of hypertension.
Which variables are the strongest predictors of hypertension? BMI and sex are the strongest predictors of hypertension. —

Task 4: Interpret Dummy Variables

# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
dummy_table <- data.frame(
  bmi_cat = c("Underweight", "Normal", "Overweight", "Obese"),
  `Dummy 1 (Normal)` = c(0, 1, 0, 0),
  `Dummy 2 (Overweight)` = c(0, 0, 1, 0),
  `Dummy 3 (Obese)` = c(0, 0, 0, 1),
  check.names = FALSE
)

dummy_table %>%
  kable(caption = "Dummy Variable Coding for BMI",
        align = "lccc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#ffe6e6")  # Highlight reference category

Dummy Variable Coding for BMI
bmi_cat	Dummy 1 (Normal)	Dummy 2 (Overweight)	Dummy 3 (Obese)
Underweight	0	0	0
Normal	1	0	0
Overweight	0	1	0
Obese	0	0	1

# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
bmi_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "bmi_cat")) %>%
  mutate(
    bmi_cat = str_remove(term, "bmi_cat"),
    bmi_cat = factor(bmi_cat,
                             levels = c("Normal",
                                       "Overweight",
                                       "Obese"))
  )

# Add reference category
ref_row <- data.frame(
  term = "bmi Underweight",
  estimate = 1.0,
  std.error = 0,
  statistic = NA,
  p.value = NA,
  conf.low = 1.0,
  conf.high = 1.0,
  bmi_cat = factor("Underweight (Ref)",
                          levels = c("Underweight (Ref)",
                                    "Normal",
                                    "Overweight",
                                    "Obese"))
)

bmi_coefs_full <- bind_rows(ref_row, bmi_coefs) %>%
  mutate(bmi_cat = factor(bmi_cat,
                                 levels = c("Underweight (Ref)",
                                           "Normal",
                                           "Overweight",
                                           "Obese")))
#Plot
p3 <- ggplot(bmi_coefs_full, aes(x = bmi_cat, y = estimate)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                  size = 0.8, color = "darkblue") +
  coord_flip() +
  labs(
    title = "Association Between BMI Category and Hypertension",
    subtitle = "Adjusted Odds Ratios (reference: Underweight)",
    x = "BMI Category",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal(base_size = 12)

ggplotly(p3)

Questions:

What is the reference category for BMI? Underweight is the reference category for BMI.
Interpret the odds ratio for “Obese” compared to the reference category. Those who are obese have 6.6 times the odds of having hypertension than someone who is underweight.
How would you explain this to a non-statistician? I would explain this to a non-statistician as that if you are obese, you have a much higher chance of having hypertension that someone who is underweight. —

Task 5: Test for Interaction

# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
# Model with interaction term
model_interaction <- glm(hypertension ~ age_cont * bmi_cat,
                         data = brfss_clean,
                         family = binomial(link = "logit"))

# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "age_cont")) %>%
  kable(caption = "Age × BMI Interaction Model (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Age × BMI Interaction Model (Odds Ratios)
Term	Odds Ratio	Std. Error	z-statistic	p-value	95% CI Lower	95% CI Upper
age_cont	1.004	0.042	0.102	0.918	0.929	1.108
age_cont:bmi_catNormal	1.058	0.043	1.306	0.192	0.957	1.147
age_cont:bmi_catOverweight	1.063	0.043	1.423	0.155	0.962	1.151
age_cont:bmi_catObese	1.054	0.042	1.232	0.218	0.954	1.140

# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
# Model 1: Age only
model1 <- glm(hypertension ~ age_cont,
              data = brfss_clean,
              family = binomial)

# Model 2: Age + BMI
model2 <- glm(hypertension ~ age_cont + bmi_cat,
              data = brfss_clean,
              family = binomial)

# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")

print(lrt_1_2)

## Analysis of Deviance Table
## 
## Model 1: hypertension ~ age_cont
## Model 2: hypertension ~ age_cont + bmi_cat
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      1279     1632.6                          
## 2      1276     1568.1  3   64.554 6.249e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#Predict probabilities by BMI
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "bmi_cat"))
#Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(linewidth = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
  labs(
    title = "Predicted Probability of Hypertension by Age and BMI",
    subtitle = "Testing for Age × BMI Interaction",
    x = "Age (years)",
    y = "Predicted Probability of Hypertension",
    color = "BMI",
    fill = "BMI"
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

ggplotly(p4)

Questions:

Is the interaction term statistically significant? The interaction term is not statistically significant.
What does this mean in epidemiologic terms (effect modification)? This means that BMI is not an effect modifier of age and hypertension risk.
Create a visualization showing predicted probabilities by age and BMI category

Task 6: Model Diagnostics

# YOUR CODE HERE: Calculate VIF for your multiple regression model
# Calculate VIF
vif_values <- vif(model_logistic_multiple)

# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
  # If matrix (categorical variables present), extract GVIF^(1/(2*Df))
  vif_df <- data.frame(
    Variable = rownames(vif_values),
    VIF = vif_values[, "GVIF^(1/(2*Df))"]
  )
} else {
  # If vector (only continuous variables)
  vif_df <- data.frame(
    Variable = names(vif_values),
    VIF = as.numeric(vif_values)
  )
}

# Add interpretation
vif_df <- vif_df %>%
  arrange(desc(VIF)) %>%
  mutate(
    Interpretation = case_when(
      VIF < 5 ~ "Low (No concern)",
      VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
      VIF >= 10 ~ "High (Problem)"
    )
  )

vif_df %>%
  kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
        digits = 2,
        align = "lrc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
  row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
  row_spec(which(vif_df$VIF < 5), background = "#90EE90")

Variance Inflation Factors (VIF) for Multiple Regression Model
	Variable	VIF	Interpretation
age_cont	age_cont	1.06	Low (No concern)
current_smoker	current_smoker	1.04	Low (No concern)
bmi_cat	bmi_cat	1.02	Low (No concern)
phys_active	phys_active	1.01	Low (No concern)
sex	sex	1.01	Low (No concern)

# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)

# Create data frame
influence_df <- data.frame(
  observation = 1:length(cooks_d),
  cooks_d = cooks_d
) %>%
  mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))

# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Cook's Distance: Identifying Influential Observations",
    subtitle = "Values > 1 indicate potentially influential observations",
    x = "Observation Number",
    y = "Cook's Distance",
    color = "Influential?"
  ) +
  scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
  theme_minimal(base_size = 12)

ggplotly(p5)

# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")

## Number of potentially influential observations: 0

Questions:

Are there any concerns about multicollinearity? No, there are no concerns about multicollinearity.
Are there any influential observations that might affect your results? No, there are no influential observations that might affect my results.
What would you do if you found serious violations? I would remove the predictors that are highly correlated and the influential observations. —

Task 7: Model Comparison

# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
# Model B: Age + sex + bmi_cat
# Model C: Age + sex + bmi_cat + phys_active + current_smoker

# Model 1: Age only
model1 <- glm(hypertension ~ age_cont,
              data = brfss_clean,
              family = binomial)

# Model 2: Age + Sex + bmi_cat
model2 <- glm(hypertension ~ age_cont + sex + bmi_cat,
              data = brfss_clean,
              family = binomial)

# Model 3: Age + sex + bmi_cat + phys_active + current_smoker
model3 <- glm(hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker,
              data = brfss_clean,
              family = binomial)


# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")

# Create comparison table
model_comp <- data.frame(
  Model = c("Model 1: Age only",
            "Model 2: Age + Sex + BMI",
            "Model 3: Age + Sex + BMI + Physical Activity + Current Smoker"),
  AIC = c(AIC(model1), AIC(model2), AIC(model3)),
  BIC = c(BIC(model1), BIC(model2), BIC(model3)),
  `Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
  check.names = FALSE
)

model_comp %>%
  kable(caption = "Model Comparison: AIC, BIC, and Deviance",
        digits = 2,
        align = "lrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")

Model Comparison: AIC, BIC, and Deviance
Model	AIC	BIC	Deviance
Model 1: Age only	1636.61	1646.92	1632.61
Model 2: Age + Sex + BMI	1576.49	1607.42	1564.49
Model 3: Age + Sex + BMI + Physical Activity + Current Smoker	1579.50	1620.74	1563.50

Questions:

Which model has the best fit based on AIC? The age + sex + BMI model was the best fit.
Is the added complexity of the full model justified? No, because the AIC is better with the more simple model with age and sex and BMI.
Which model would you choose for your final analysis? Why? I would choose the age + sex + BMI model for the final analysis, because it has the best model fit (lowest AIC/BIC). —

Lab Report Guidelines

Write a brief report (1-2 pages) summarizing your findings:

Introduction: State your research question
Methods: Describe your analytic approach
Results: Present key findings with tables and figures
Interpretation: Explain what your results mean
Limitations: Discuss potential issues with your analysis

Submission: Submit your completed R Markdown file and knitted HTML report. Introduction: How does age impact the risk of hypertension? Age as a continuous variable and hypertension status as a binary (yes/no) variable will be compared, as well as covariates of sex, BMI, physical activity, and smoking status in order to understand the association between these variables. Methods: 1. Descriptive Statistics: Looked at descriptive statistics to figure out how hypertension risk and prevalence is distributed across the different age categories. 2. Simple Logistic Regression: A simple logistic regression was run to explore the relationship between age as a continuous variable and hypertension risk. The odds ratio, p-value, and 95% confidence interval was calculated. 3. Multiple Logistic Regression: A multiple logistic regression was run to explore the relationship between hypertension and age, as well as sex, BMI, physical activity, and smoking status as covariates. 4. Dummy Variables: Dummy variables for BMI categories were used in order to establish “underweight” as the reference category when looking at BMI’s effect on age and hypertension. 5. Interaction: The interaction between BMI and age was tested to see if BMI is an effect modifier of age and hypertension risk. 6. VIF and Cook’s Distance: VIF and Cook’s Distance was calculated to make sure there were no multicollinearity/influential observation issues. 7. Model Comparison: Models of comparing hypertension with age only, then hypertension with age, sex, and BMI, and hypertension with age, sex, BMI, physical activity, and smoking status were compared to see which model was the best fit. Results: The descriptive statistics showed that as age increases, the prevalence of hypertension also increases. The odds ratio calculated from the simple logistic regression (“Simple Logistic Regression: Hypertension ~ Age (Odds Ratios)”) table shows that for each 1 year increase in age, the odds of hypertension increases by 1.2%., which was a statistically significant result. When other variables were included in the multiple logistic regression (like sex, BMI, and more), the odds ratio for age (found in the “Multiple Logistic Regression: Hypertension ~ Age + Covariates (Odds Ratios)” table) shows that for each 1 year increase in age, the odds of hypertension increases by 6.1%. This means that confounding factors diminishes the relationship between age and hypertension. When coding the dummy variables, it was found that as BMI category increases, the odds ratio of developing hypertension increases, which can be found in the “Association Between BMI Category and Hypertension” visualization. The interaction between age and BMI was found to be insignificant, with p-values all >0.05 (“Age × BMI Interaction Model (Odds Ratios)” table). The likelihood ratio test completed during this step had a significant outcome, (p<0.05), meaning the model including BMI is a better fit. The VIF results were all less than 5, meaning multicollinearity is not a concern, and the Cook’s Distance plot showed that there were no influential observations. Finally, the LRT was completed again for 3 different models, which showed that the model including age, sex, and BMI was the best fit (had the lowest AIC/BIC). Interpretation: Based on the simple logistic regression, for every 1 year increase in age, the risk of hypertension increases by 1.2%. Based on the multiple logistic regression, for every 1 year increase in age, the risk of hypertension increases by 6.1%, holding sex, physical activity, BMI, and smoking status constant. This can be due to confounding by the aforementioned variables. As BMI increases, the risk of hypertension increases, but BMI is not an effect modifier of age and hypertension. There were no multicollinearity issues or influential observations found. The best fit model for exploring the relationship between age and hypertension includes the variables of sex and BMI. Limitations: Some limitations to this study and the results found include that it is a cross-sectional study, so it is hard to establish causality, as you are not sure which variable came first, especially with variables such as physical activity, smoking, and more. In addition, the sample size was decreased when the data was cleaned, so the results may not be as precise as they could be with the whole dataset.

Summary

Key Concepts Covered

Statistical modeling describes relationships between variables
Regression types depend on the outcome variable type
Logistic regression is appropriate for binary outcomes
Multiple regression controls for confounding
Dummy variables represent categorical predictors
Interactions test for effect modification
Model diagnostics check assumptions and identify problems
Model comparison helps select the best model

Important Formulas

Logistic Regression:

\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]

Odds Ratio:

\[\text{OR} = e^{\beta_i}\]

Predicted Probability:

\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]

References

Agresti, A. (2018). An Introduction to Categorical Data Analysis (3rd ed.). Wiley.
Hosmer, D. W., Lemeshow, S., & Sturdivant, R. X. (2013). Applied Logistic Regression (3rd ed.). Wiley.
Vittinghoff, E., Glidden, D. V., Shiboski, S. C., & McCulloch, C. E. (2012). Regression Methods in Biostatistics (2nd ed.). Springer.
Centers for Disease Control and Prevention. (2023). Behavioral Risk Factor Surveillance System.

Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggstats_0.12.0   gtsummary_2.5.0  ggeffects_2.3.2  car_3.1-3       
##  [5] carData_3.0-5    broom_1.0.11     plotly_4.12.0    kableExtra_1.4.0
##  [9] knitr_1.51       haven_2.5.5      lubridate_1.9.4  forcats_1.0.1   
## [13] stringr_1.6.0    dplyr_1.1.4      purrr_1.2.1      readr_2.1.6     
## [17] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.0    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.56          bslib_0.9.0        htmlwidgets_1.6.4 
##  [5] insight_1.4.4      tzdb_0.5.0         vctrs_0.6.5        tools_4.5.1       
##  [9] crosstalk_1.2.2    generics_0.1.4     datawizard_1.3.0   pkgconfig_2.0.3   
## [13] data.table_1.18.0  RColorBrewer_1.1-3 S7_0.2.0           lifecycle_1.0.5   
## [17] compiler_4.5.1     farver_2.1.2       textshaping_1.0.4  htmltools_0.5.9   
## [21] sass_0.4.10        yaml_2.3.12        lazyeval_0.2.2     Formula_1.2-5     
## [25] pillar_1.11.1      jquerylib_0.1.4    cachem_1.1.0       abind_1.4-8       
## [29] tidyselect_1.2.1   digest_0.6.39      stringi_1.8.7      labeling_0.4.3    
## [33] fastmap_1.2.0      grid_4.5.1         cli_3.6.5          magrittr_2.0.4    
## [37] withr_3.0.2        scales_1.4.0       backports_1.5.0    timechange_0.3.0  
## [41] rmarkdown_2.30     httr_1.4.7         otel_0.2.0         hms_1.1.4         
## [45] evaluate_1.0.5     viridisLite_0.4.2  rlang_1.1.6        glue_1.8.0        
## [49] xml2_1.5.1         svglite_2.2.2      rstudioapi_0.18.0  jsonlite_2.0.0    
## [53] R6_2.6.1           systemfonts_1.3.1

Lab05_Modeling_AbigailMiller

Abigail Miller

2026-02-24

Setup and Data Preparation

Loading BRFSS 2023 Data

Creating a Working Subset

Data Recoding and Cleaning

Part 2: Student Lab Activity

Lab Overview

Lab Instructions

Task 1: Explore the Outcome Variable

Task 2: Build a Simple Logistic Regression Model

Task 3: Create a Multiple Regression Model

Task 4: Interpret Dummy Variables

Task 5: Test for Interaction

Task 6: Model Diagnostics

Task 7: Model Comparison

Lab Report Guidelines

Summary

Key Concepts Covered

Important Formulas

References