Introduction

Statistical modeling is a fundamental tool in epidemiology that allows us to:

  • Describe relationships between variables
  • Predict outcomes based on risk factors
  • Estimate associations while controlling for confounding

This lecture introduces key concepts in regression modeling using real-world data from the Behavioral Risk Factor Surveillance System (BRFSS) 2023.


Setup and Data Preparation

# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)

Loading BRFSS 2023 Data

The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.

# Load the full BRFSS 2023 dataset
brfss_full <- read_xpt("C:/Users/userp/OneDrive/Рабочий стол/HSTA553/LLCP2023.XPT") %>%
  janitor::clean_names()

# Display variable names
names(brfss_full)
##   [1] "state"    "fmonth"   "idate"    "imonth"   "iday"     "iyear"   
##   [7] "dispcode" "seqno"    "psu"      "ctelenm1" "pvtresd1" "colghous"
##  [13] "statere1" "celphon1" "ladult1"  "numadult" "respslc1" "landsex2"
##  [19] "lndsxbrt" "safetime" "ctelnum1" "cellfon5" "cadult1"  "cellsex2"
##  [25] "celsxbrt" "pvtresd3" "cclghous" "cstate1"  "landline" "hhadult" 
##  [31] "sexvar"   "genhlth"  "physhlth" "menthlth" "poorhlth" "primins1"
##  [37] "persdoc3" "medcost1" "checkup1" "exerany2" "exract12" "exeroft1"
##  [43] "exerhmm1" "exract22" "exeroft2" "exerhmm2" "strength" "bphigh6" 
##  [49] "bpmeds1"  "cholchk3" "toldhi3"  "cholmed3" "cvdinfr4" "cvdcrhd4"
##  [55] "cvdstrk3" "asthma3"  "asthnow"  "chcscnc1" "chcocnc1" "chccopd3"
##  [61] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital" 
##  [67] "educa"    "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
##  [73] "employ1"  "children" "income3"  "pregnant" "weight2"  "height3" 
##  [79] "deaf"     "blind"    "decide"   "diffwalk" "diffdres" "diffalon"
##  [85] "fall12mn" "fallinj5" "smoke100" "smokday2" "usenow3"  "ecignow2"
##  [91] "alcday4"  "avedrnk3" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3"
##  [97] "pneuvac4" "shingle2" "hivtst7"  "hivtstd3" "seatbelt" "drnkdri2"
## [103] "covidpo1" "covidsm1" "covidact" "pdiabts1" "prediab2" "diabtype"
## [109] "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1" "feetsore"
## [115] "arthexer" "arthedu"  "lmtjoin3" "arthdis2" "joinpai2" "lcsfirst"
## [121] "lcslast"  "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "hadmam"  
## [127] "howlong"  "cervscrn" "crvclcnc" "crvclpap" "crvclhpv" "hadhyst2"
## [133] "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2" "hadsigm4"
## [139] "colnsigm" "colntes1" "sigmtes1" "lastsig4" "colncncr" "vircolo1"
## [145] "vclntes2" "smalstol" "stoltest" "stooldn2" "bldstfit" "sdnates1"
## [151] "cncrdiff" "cncrage"  "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum" 
## [157] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [163] "csrvctl2" "indortan" "numburn3" "sunprtct" "wkdayout" "wkendout"
## [169] "cimemlo1" "cdworry"  "cddiscu1" "cdhous1"  "cdsocia1" "caregiv1"
## [175] "crgvrel4" "crgvlng1" "crgvhrs1" "crgvprb3" "crgvalzd" "crgvper1"
## [181] "crgvhou1" "crgvexpt" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [187] "heattbco" "firearm5" "gunload"  "loadulk2" "hasymp1"  "hasymp2" 
## [193] "hasymp3"  "hasymp4"  "hasymp5"  "hasymp6"  "strsymp1" "strsymp2"
## [199] "strsymp3" "strsymp4" "strsymp5" "strsymp6" "firstaid" "aspirin" 
## [205] "birthsex" "somale"   "sofemale" "trnsgndr" "marijan1" "marjsmok"
## [211] "marjeat"  "marjvape" "marjdab"  "marjothr" "usemrjn4" "acedeprs"
## [217] "acedrink" "acedrugs" "aceprisn" "acedivrc" "acepunch" "acehurt1"
## [223] "aceswear" "acetouch" "acetthem" "acehvsex" "aceadsaf" "aceadned"
## [229] "imfvpla4" "hpvadvc4" "hpvadsht" "tetanus1" "covidva1" "covacge1"
## [235] "covidnu2" "lsatisfy" "emtsuprt" "sdlonely" "sdhemply" "foodstmp"
## [241] "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp" "sdhstre1" "rrclass3"
## [247] "rrcognt2" "rrtreat"  "rratwrk2" "rrhcare4" "rrphysm2" "rcsgend1"
## [253] "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "qstver"   "qstlang" 
## [259] "metstat"  "urbstat"  "mscode"   "ststr"    "strwt"    "rawrake" 
## [265] "wt2rake"  "imprace"  "chispnc"  "crace1"   "cageg"    "cllcpwt" 
## [271] "dualuse"  "dualcor"  "llcpwt2"  "llcpwt"   "rfhlth"   "phys14d" 
## [277] "ment14d"  "hlthpl1"  "hcvu653"  "totinda"  "metvl12"  "metvl22" 
## [283] "maxvo21"  "fc601"    "actin13"  "actin23"  "padur1"   "padur2"  
## [289] "pafreq1"  "pafreq2"  "minac12"  "minac22"  "strfreq"  "pamiss3" 
## [295] "pamin13"  "pamin23"  "pa3min"   "pavig13"  "pavig23"  "pa3vigm" 
## [301] "pacat3"   "paindx3"  "pa150r4"  "pa300r4"  "pa30023"  "pastrng" 
## [307] "parec3"   "pastae3"  "rfhype6"  "cholch3"  "rfchol3"  "michd"   
## [313] "ltasth1"  "casthm1"  "asthms1"  "drdxar2"  "mrace1"   "hispanc" 
## [319] "race"     "raceg21"  "racegr3"  "raceprv"  "sex"      "ageg5yr" 
## [325] "age65yr"  "age80"    "age_g"    "htin4"    "htm4"     "wtkg3"   
## [331] "bmi5"     "bmi5cat"  "rfbmi5"   "chldcnt"  "educag"   "incomg1" 
## [337] "smoker3"  "rfsmok3"  "cureci2"  "drnkany6" "drocdy4"  "rfbing6" 
## [343] "drnkwk2"  "rfdrhv8"  "flshot7"  "pneumo3"  "aidtst4"  "rfseat2" 
## [349] "rfseat3"  "drnkdrv"

Creating a Working Subset

For computational efficiency and teaching purposes, we’ll create a subset with relevant variables and complete cases.

# Select variables of interest and create analytic dataset
set.seed(553)  # For reproducibility

brfss_subset <- brfss_full %>%
  select(
    # Outcome: Diabetes status
    diabete4,
    # Demographics
    age_g,      # Age category
    sex,         # Sex
    race,       # Race/ethnicity
    educag,     # Education level
    incomg1,    # Income category
    # Health behaviors
    bmi5cat,    # BMI category
    exerany2,     # Physical activity
    smokday2,     # Smoking frequency
    # Health status
    genhlth,      # General health
    rfhype6,    # High blood pressure
    rfchol3     # High cholesterol
  ) %>%
  # Filter to complete cases only
  drop_na() %>%
  # Sample 2000 observations for manageable analysis
  slice_sample(n = 2000)

# Display subset dimensions
cat("Working subset dimensions:",
    nrow(brfss_subset), "observations,",
    ncol(brfss_subset), "variables\n")
## Working subset dimensions: 2000 observations, 12 variables

Data Recoding and Cleaning

# Create clean dataset with recoded variables
brfss_clean <- brfss_subset %>%
  mutate(
    # Outcome: Diabetes (binary)
    diabetes = case_when(
      diabete4 == 1 ~ 1,  # Yes
      diabete4 %in% c(2, 3, 4) ~ 0,  # Gestational only, no, or pre-diabetes
      TRUE ~ NA_real_
    ),

    # Age groups
    age_group = factor(case_when(
      age_g == 1 ~ "18-24",
      age_g == 2 ~ "25-34",
      age_g == 3 ~ "35-44",
      age_g == 4 ~ "45-54",
      age_g == 5 ~ "55-64",
      age_g == 6 ~ "65+"
    ), levels = c("18-24", "25-34", "35-44", "45-54", "55-64", "65+")),

    # Age continuous (midpoint of category)
    age_cont = case_when(
      age_g == 1 ~ 21,
      age_g == 2 ~ 29.5,
      age_g == 3 ~ 39.5,
      age_g == 4 ~ 49.5,
      age_g == 5 ~ 59.5,
      age_g == 6 ~ 70
    ),

    # Sex
    sex = factor(ifelse(sex == 1, "Male", "Female")),

    # Race/ethnicity
    race = factor(case_when(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "Native American",
      race == 4 ~ "Asian",
      race == 5 ~ "Native Hawaiian/PI",
      race == 6 ~ "Other",
      race == 7 ~ "Multiracial",
      race == 8 ~ "Hispanic"
    )),

    # Education (simplified)
    education = factor(case_when(
      educag == 1 ~ "< High school",
      educag == 2 ~ "High school graduate",
      educag == 3 ~ "Some college",
      educag == 4 ~ "College graduate"
    ), levels = c("< High school", "High school graduate", "Some college", "College graduate")),

    # Income (simplified)
    income = factor(case_when(
      incomg1 == 1 ~ "< $25,000",
      incomg1 == 2 ~ "$25,000-$49,999",
      incomg1 == 3 ~ "$50,000-$74,999",
      incomg1 == 4 ~ "$75,000+",
      incomg1 == 5 ~ "Unknown"
    ), levels = c("< $25,000", "$25,000-$49,999", "$50,000-$74,999", "$75,000+", "Unknown")),

    # BMI category
    bmi_cat = factor(case_when(
      bmi5cat == 1 ~ "Underweight",
      bmi5cat == 2 ~ "Normal",
      bmi5cat == 3 ~ "Overweight",
      bmi5cat == 4 ~ "Obese"
    ), levels = c("Underweight", "Normal", "Overweight", "Obese")),

    # Physical activity (binary)
    phys_active = ifelse(exerany2 == 1, 1, 0),

    # Current smoking
    current_smoker = case_when(
      smokday2 == 1 ~ 1,  # Every day
      smokday2 == 2 ~ 1,  # Some days
      smokday2 == 3 ~ 0,  # Not at all
      TRUE ~ 0
    ),

    # General health (simplified)
    gen_health = factor(case_when(
      genhlth %in% c(1, 2) ~ "Excellent/Very good",
      genhlth == 3 ~ "Good",
      genhlth %in% c(4, 5) ~ "Fair/Poor"
    ), levels = c("Excellent/Very good", "Good", "Fair/Poor")),

    # Hypertension
    hypertension = ifelse(rfhype6 == 2, 1, 0),

    # High cholesterol
    high_chol = ifelse(rfchol3 == 2, 1, 0)
  ) %>%
  # Select only the clean variables
  select(diabetes, age_group, age_cont, sex, race, education, income,
         bmi_cat, phys_active, current_smoker, gen_health,
         hypertension, high_chol) %>%
  # Remove any remaining missing values
  drop_na()

# Save the cleaned subset for future use
write_rds(brfss_clean,
          "C:/Users/userp/OneDrive/Рабочий стол/HSTA553/brfss_subset_2023.rds")

cat("Clean dataset saved with", nrow(brfss_clean), "complete observations\n")
## Clean dataset saved with 1281 complete observations

Descriptive Statistics

# Summary table by diabetes status
desc_table <- brfss_clean %>%
  group_by(diabetes) %>%
  summarise(
    N = n(),
    `Mean Age` = round(mean(age_cont), 1),
    `% Male` = round(100 * mean(sex == "Male"), 1),
    `% Obese` = round(100 * mean(bmi_cat == "Obese", na.rm = TRUE), 1),
    `% Physically Active` = round(100 * mean(phys_active), 1),
    `% Current Smoker` = round(100 * mean(current_smoker), 1),
    `% Hypertension` = round(100 * mean(hypertension), 1),
    `% High Cholesterol` = round(100 * mean(high_chol), 1)
  ) %>%
  mutate(diabetes = ifelse(diabetes == 1, "Diabetes", "No Diabetes"))

desc_table %>%
  kable(caption = "Descriptive Statistics by Diabetes Status",
        align = "lrrrrrrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Descriptive Statistics by Diabetes Status

| diabetes | N | Mean Age | % Male | % Obese | % Physically Active | % Current Smoker | % Hypertension | % High Cholesterol |
|---|---|---|---|---|---|---|---|---|
| No Diabetes | 1053 | 58.2 | 49.0 | 34.8 | 69.4 | 29.3 | 47.5 | 42.5 |
| Diabetes | 228 | 63.1 | 53.9 | 56.1 | 53.5 | 27.6 | 76.8 | 67.1 |

Part 1: Statistical Modeling Concepts

1. What is Statistical Modeling?

A statistical model is a mathematical representation of the relationship between:

  • An outcome variable (dependent variable, response)
  • One or more predictor variables (independent variables, exposures, covariates)

General Form of a Statistical Model

\[f(Y) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p + \epsilon\]

Where:

  • \(f(Y)\) is a function of the outcome (identity, log, logit, etc.)
  • \(\beta_0\) is the intercept (baseline value)
  • \(\beta_1, \beta_2, \ldots, \beta_p\) are coefficients (effect sizes)
  • \(X_1, X_2, \ldots, X_p\) are predictor variables
  • \(\epsilon\) is the error term (random variation)

2. Types of Regression Models

The choice of regression model depends on the type of outcome variable:

Common Regression Models in Epidemiology

| Outcome Type | Regression Type | Link Function | Example |
|---|---|---|---|
| Continuous | Linear | Identity: Y | Blood pressure, BMI |
| Binary | Logistic | Logit: log(p/(1-p)) | Disease status, mortality |
| Count | Poisson/Negative Binomial | Log: log(Y) | Number of infections |
| Time-to-event | Cox Proportional Hazards | Log: log(h(t)) | Survival time |

Simple vs. Multiple Regression

  • Simple regression: One predictor variable
  • Multiple regression: Two or more predictor variables (controls for confounding)
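To make the mapping concrete, here is a minimal sketch of how the outcome type determines the model call in R, reusing the brfss_clean data frame created above (the object names fit_linear and fit_logit are purely illustrative; the Poisson and Cox lines are syntax reminders only, since this subset has no count or time-to-event outcome):

# Continuous outcome -> linear regression (age used here purely for illustration)
fit_linear <- lm(age_cont ~ sex + bmi_cat, data = brfss_clean)

# Binary outcome -> logistic regression
fit_logit <- glm(diabetes ~ age_cont, data = brfss_clean,
                 family = binomial(link = "logit"))

# Count outcome -> glm(..., family = poisson(link = "log"))
# Time-to-event outcome -> survival::coxph(Surv(time, event) ~ ...)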

3. Linear Regression Example

Let’s model the relationship between age and the probability of diabetes, starting with a simple linear regression (a linear probability model for a binary outcome).

Simple Linear Regression

# Simple linear regression: diabetes ~ age
model_linear_simple <- lm(diabetes ~ age_cont, data = brfss_clean)

# Display results
tidy(model_linear_simple, conf.int = TRUE) %>%
  kable(caption = "Simple Linear Regression: Diabetes ~ Age",
        digits = 4,
        col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Simple Linear Regression: Diabetes ~ Age

| Term | Estimate | Std. Error | t-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | -0.0632 | 0.0481 | -1.3125 | 0.1896 | -0.1576 | 0.0312 |
| age_cont | 0.0041 | 0.0008 | 5.1368 | 0.0000 | 0.0025 | 0.0056 |

Interpretation:

  • Intercept (\(\beta_0\)): -0.0632 - Expected probability of diabetes at age 0 (not meaningful in this context)
  • Slope (\(\beta_1\)): 0.0041 - For each 1-year increase in age, the expected probability of diabetes increases by 0.0041 (about 0.41 percentage points)
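As a quick hand check using the rounded coefficients from the table above, the fitted line at age 60 gives a predicted probability of roughly 0.18:

# Hand check of the fitted line at age 60: beta0 + beta1 * 60
-0.0632 + 0.0041 * 60
# approximately 0.18, i.e., an 18% predicted probability of diabetes

# The same value (up to rounding) from predict()
predict(model_linear_simple, newdata = data.frame(age_cont = 60))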

Visualization

With continuous age

# Create scatter plot with regression line
p1 <- ggplot(brfss_clean, aes(x = age_cont, y = diabetes)) +
  geom_jitter(alpha = 0.2, width = 0.5, height = 0.02, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1.2) +
  labs(
    title = "Relationship Between Age and Diabetes",
    subtitle = "Simple Linear Regression",
    x = "Age (years)",
    y = "Probability of Diabetes"
  ) +
  theme_minimal(base_size = 12)

ggplotly(p1) %>%
  layout(hovermode = "closest")

Diabetes Prevalence by Age


4. Logistic Regression: The Preferred Model for Binary Outcomes

Problem with linear regression for binary outcomes:

  • Predicted probabilities can fall outside [0, 1]
  • Assumes constant variance (violated for binary data)

Solution: Logistic Regression

Uses the logit link function to ensure predicted probabilities stay between 0 and 1:

\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]
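The inverse of the logit (plogis() in R) maps any value of the linear predictor back into (0, 1), which is why the predicted probabilities always stay in range. A minimal illustration with a few arbitrary log-odds values:

log_odds <- c(-3, 0, 3)
plogis(log_odds)                      # 0.047, 0.500, 0.953
exp(log_odds) / (1 + exp(log_odds))   # the same values, written out explicitly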

Simple Logistic Regression

# Simple logistic regression: diabetes ~ age
model_logistic_simple <- glm(diabetes ~ age_cont,
                              data = brfss_clean,
                              family = binomial(link = "logit"))

# Display results with odds ratios
tidy(model_logistic_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Simple Logistic Regression: Diabetes ~ Age (Odds Ratios)

| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.029 | 0.423 | -8.390 | 0 | 0.012 | 0.064 |
| age_cont | 1.034 | 0.007 | 4.978 | 0 | 1.021 | 1.048 |

Interpretation:

  • Odds Ratio (OR): 1.034
  • For each 1-year increase in age, the odds of diabetes increase by 3.4%
  • The relationship is highly statistically significant (p < 0.001)
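Because coefficients add on the log-odds scale, odds ratios multiply. Using the rounded per-year OR from the table above, the OR for a 10-year age difference is therefore:

# Per-year OR raised to the 10th power
1.034^10   # approximately 1.40, i.e., about 40% higher odds per decade of age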

Predicted Probabilities

# From ggeffects package
pp <- predict_response(model_logistic_simple, terms = "age_cont")
plot(pp)
Predicted Diabetes Probability by Age

# Generate predicted probabilities
pred_data <- data.frame(age_cont = seq(18, 80, by = 1))
pred_data$predicted_prob <- predict(model_logistic_simple,
                                    newdata = pred_data,
                                    type = "response")

# Plot
p2 <- ggplot(pred_data, aes(x = age_cont, y = predicted_prob)) +
  geom_line(color = "darkred", linewidth = 1.5) +
  # Fixed ±0.02 band added for visual emphasis (not a model-based confidence interval)
  geom_ribbon(aes(ymin = predicted_prob - 0.02,
                  ymax = predicted_prob + 0.02),
              alpha = 0.2, fill = "darkred") +
  labs(
    title = "Predicted Probability of Diabetes by Age",
    subtitle = "Simple Logistic Regression",
    x = "Age (years)",
    y = "Predicted Probability of Diabetes"
  ) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0, 0.6)) +
  theme_minimal(base_size = 12)

ggplotly(p2)

Predicted Diabetes Probability by Age


5. Multiple Regression: Controlling for Confounding

What is Confounding?

A confounder is a variable that:

  1. Is associated with both the exposure and the outcome
  2. Is not on the causal pathway between exposure and outcome
  3. Distorts the true relationship between exposure and outcome

Example: The relationship between age and diabetes may be confounded by BMI, physical activity, and other factors.

Multiple Logistic Regression

# Multiple logistic regression with potential confounders
model_logistic_multiple <- glm(diabetes ~ age_cont + sex + bmi_cat +
                                phys_active + current_smoker + education,
                               data = brfss_clean,
                               family = binomial(link = "logit"))

# Display results
tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  kable(caption = "Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  scroll_box(height = "400px")
Multiple Logistic Regression: Diabetes ~ Age + Covariates (Odds Ratios)

| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| (Intercept) | 0.009 | 1.177 | -4.001 | 0.000 | 0.000 | 0.065 |
| age_cont | 1.041 | 0.007 | 5.515 | 0.000 | 1.027 | 1.057 |
| sexMale | 1.191 | 0.154 | 1.133 | 0.257 | 0.880 | 1.613 |
| bmi_catNormal | 1.971 | 1.052 | 0.645 | 0.519 | 0.378 | 36.309 |
| bmi_catOverweight | 3.155 | 1.044 | 1.101 | 0.271 | 0.621 | 57.679 |
| bmi_catObese | 6.834 | 1.041 | 1.845 | 0.065 | 1.354 | 124.675 |
| phys_active | 0.589 | 0.157 | -3.373 | 0.001 | 0.433 | 0.802 |
| current_smoker | 1.213 | 0.178 | 1.085 | 0.278 | 0.852 | 1.716 |
| educationHigh school graduate | 0.634 | 0.288 | -1.579 | 0.114 | 0.364 | 1.131 |
| educationSome college | 0.542 | 0.294 | -2.081 | 0.037 | 0.307 | 0.977 |
| educationCollege graduate | 0.584 | 0.305 | -1.763 | 0.078 | 0.324 | 1.074 |

Interpretation:

  • Age (adjusted OR): 1.041
    • After adjusting for sex, BMI, physical activity, smoking, and education, each 1-year increase in age is associated with a 4.1% increase in the odds of diabetes
  • Sex (Male vs Female): OR = 1.191
    • Males have 19.1% higher odds of diabetes compared to females, adjusting for other variables
  • BMI (Obese vs Underweight, the reference): OR = 6.834
    • Obese individuals have 6.83 times the odds of diabetes compared with underweight individuals; the very wide confidence intervals for the BMI categories likely reflect the small underweight reference group.
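A common informal check for confounding is to compare the crude and adjusted estimates for the exposure of interest; a change of more than roughly 10% is often taken as evidence of meaningful confounding. A quick sketch using the two models fitted above (crude_or and adjusted_or are just illustrative names):

# Crude vs. adjusted OR for age (the informal "10% change" heuristic)
crude_or    <- exp(coef(model_logistic_simple)["age_cont"])
adjusted_or <- exp(coef(model_logistic_multiple)["age_cont"])
round(c(crude = unname(crude_or), adjusted = unname(adjusted_or)), 3)
# Here the change (1.034 vs. 1.041) is well under 10%, so these covariates do little to confound the age effect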

6. Dummy Variables: Coding Categorical Predictors

Categorical variables with \(k\) levels are represented using \(k-1\) dummy variables (indicator variables).

Example: Education Level

Education has 4 levels:

  1. < High school (reference category)
  2. High school graduate
  3. Some college
  4. College graduate

R automatically creates 3 dummy variables:

# Manually construct the dummy variable coding table for illustration
dummy_table <- data.frame(
  Education = c("< High school", "High school graduate", "Some college", "College graduate"),
  `Dummy 1 (HS grad)` = c(0, 1, 0, 0),
  `Dummy 2 (Some college)` = c(0, 0, 1, 0),
  `Dummy 3 (College grad)` = c(0, 0, 0, 1),
  check.names = FALSE
)

dummy_table %>%
  kable(caption = "Dummy Variable Coding for Education",
        align = "lccc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(1, bold = TRUE, background = "#ffe6e6")  # Highlight reference category
Dummy Variable Coding for Education

| Education | Dummy 1 (HS grad) | Dummy 2 (Some college) | Dummy 3 (College grad) |
|---|---|---|---|
| < High school | 0 | 0 | 0 |
| High school graduate | 1 | 0 | 0 |
| Some college | 0 | 1 | 0 |
| College graduate | 0 | 0 | 1 |

Reference Category: The category with all zeros (< High school) is the reference group. All other categories are compared to this reference.
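You can inspect the coding R uses directly; the output should match the table above (treatment coding with "< High school" as the reference):

# Treatment (dummy) coding used for the education factor
contrasts(brfss_clean$education)

# The corresponding design-matrix columns the model actually uses
head(model.matrix(~ education, data = brfss_clean))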

Visualizing Education Effects

# Extract education coefficients
educ_coefs <- tidy(model_logistic_multiple, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "education")) %>%
  mutate(
    education_level = str_remove(term, "education"),
    education_level = factor(education_level,
                             levels = c("High school graduate",
                                       "Some college",
                                       "College graduate"))
  )

# Add reference category
ref_row <- data.frame(
  term = "education< High school",
  estimate = 1.0,
  std.error = 0,
  statistic = NA,
  p.value = NA,
  conf.low = 1.0,
  conf.high = 1.0,
  education_level = factor("< High school (Ref)",
                          levels = c("< High school (Ref)",
                                    "High school graduate",
                                    "Some college",
                                    "College graduate"))
)

educ_coefs_full <- bind_rows(ref_row, educ_coefs) %>%
  mutate(education_level = factor(education_level,
                                 levels = c("< High school (Ref)",
                                           "High school graduate",
                                           "Some college",
                                           "College graduate")))

# Plot
p3 <- ggplot(educ_coefs_full, aes(x = education_level, y = estimate)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high),
                  size = 0.8, color = "darkblue") +
  coord_flip() +
  labs(
    title = "Association Between Education and Diabetes",
    subtitle = "Adjusted Odds Ratios (reference: < High school)",
    x = "Education Level",
    y = "Odds Ratio (95% CI)"
  ) +
  theme_minimal(base_size = 12)

ggplotly(p3)

Odds Ratios for Education Levels

# Plot model coefficients with `ggcoef_model()`
ggcoef_model(model_logistic_multiple, exponentiate = TRUE,
  include = c("education"),
  variable_labels = c(
    education = "Education"),
  facet_labeller = ggplot2::label_wrap_gen(10)
)


7. Interactions (Effect Modification)

An interaction exists when the effect of one variable on the outcome differs across levels of another variable.

Epidemiologic term: Effect modification

Example: Age × Sex Interaction

Does the effect of age on diabetes differ between males and females?

# Model with interaction term
model_interaction <- glm(diabetes ~ age_cont * sex + bmi_cat + phys_active,
                         data = brfss_clean,
                         family = binomial(link = "logit"))

# Display interaction results
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE) %>%
  filter(str_detect(term, "age_cont")) %>%
  kable(caption = "Age × Sex Interaction Model (Odds Ratios)",
        digits = 3,
        col.names = c("Term", "Odds Ratio", "Std. Error", "z-statistic", "p-value", "95% CI Lower", "95% CI Upper")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Age × Sex Interaction Model (Odds Ratios)

| Term | Odds Ratio | Std. Error | z-statistic | p-value | 95% CI Lower | 95% CI Upper |
|---|---|---|---|---|---|---|
| age_cont | 1.031 | 0.009 | 3.178 | 0.001 | 1.012 | 1.051 |
| age_cont:sexMale | 1.015 | 0.014 | 1.084 | 0.278 | 0.988 | 1.044 |

Interpretation:

  • Main effect of age: OR among females (reference)
  • Interaction term (age:sexMale): Additional effect of age among males
  • If the interaction term is significant, the age-diabetes relationship differs by sex
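Concretely, the per-year age OR in each sex can be read off the table above: among females it is the main-effect OR (1.031), while among males it is the product of the main effect and the interaction term:

\[\text{OR}_{\text{age} \mid \text{males}} = e^{\beta_{\text{age}} + \beta_{\text{age} \times \text{male}}} = 1.031 \times 1.015 \approx 1.046\]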

Visualizing Interaction

# Generate predicted probabilities by sex
pred_interact <- ggpredict(model_interaction, terms = c("age_cont [18:80]", "sex"))

# Plot
p4 <- ggplot(pred_interact, aes(x = x, y = predicted, color = group, fill = group)) +
  geom_line(linewidth = 1.2) +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.2, color = NA) +
  labs(
    title = "Predicted Probability of Diabetes by Age and Sex",
    subtitle = "Testing for Age × Sex Interaction",
    x = "Age (years)",
    y = "Predicted Probability of Diabetes",
    color = "Sex",
    fill = "Sex"
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_color_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
  scale_fill_manual(values = c("Female" = "#E64B35", "Male" = "#4DBBD5")) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

ggplotly(p4)

Age-Diabetes Relationship by Sex


8. Model Diagnostics

Every regression model makes assumptions about the data. If assumptions are violated, results may be invalid.

Key Assumptions for Logistic Regression

  1. Linearity of log odds: Continuous predictors have a linear relationship with the log odds of the outcome (a quick check is sketched after this list)
  2. Independence of observations: Each observation is independent
  3. No perfect multicollinearity: Predictors are not perfectly correlated
  4. No influential outliers: Individual observations don’t overly influence the model
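Multicollinearity and influential observations are checked below. For assumption 1, one simple (if informal) check is to add a squared term for the continuous predictor and test whether it improves the fit; the sketch below reuses the model fitted above (model_age_sq is just an illustrative name):

# Informal check of linearity of the log odds for age: add a quadratic term
model_age_sq <- glm(diabetes ~ age_cont + I(age_cont^2) + sex + bmi_cat +
                      phys_active + current_smoker + education,
                    data = brfss_clean,
                    family = binomial(link = "logit"))

# A small, non-significant change in deviance suggests the linear term is adequate
anova(model_logistic_multiple, model_age_sq, test = "LRT")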

Checking for Multicollinearity

Variance Inflation Factor (VIF): Measures how much the variance of a coefficient is inflated due to correlation with other predictors.

  • VIF < 5: Generally acceptable
  • VIF > 10: Serious multicollinearity problem
# Calculate VIF
vif_values <- vif(model_logistic_multiple)

# Create VIF table
# For models with categorical variables, vif() returns GVIF (Generalized VIF)
if (is.matrix(vif_values)) {
  # If matrix (categorical variables present), extract GVIF^(1/(2*Df))
  vif_df <- data.frame(
    Variable = rownames(vif_values),
    VIF = vif_values[, "GVIF^(1/(2*Df))"]
  )
} else {
  # If vector (only continuous variables)
  vif_df <- data.frame(
    Variable = names(vif_values),
    VIF = as.numeric(vif_values)
  )
}

# Add interpretation
vif_df <- vif_df %>%
  arrange(desc(VIF)) %>%
  mutate(
    Interpretation = case_when(
      VIF < 5 ~ "Low (No concern)",
      VIF >= 5 & VIF < 10 ~ "Moderate (Monitor)",
      VIF >= 10 ~ "High (Problem)"
    )
  )

vif_df %>%
  kable(caption = "Variance Inflation Factors (VIF) for Multiple Regression Model",
        digits = 2,
        align = "lrc") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which(vif_df$VIF >= 10), bold = TRUE, color = "white", background = "#DC143C") %>%
  row_spec(which(vif_df$VIF >= 5 & vif_df$VIF < 10), background = "#FFA500") %>%
  row_spec(which(vif_df$VIF < 5), background = "#90EE90")
Variance Inflation Factors (VIF) for Multiple Regression Model

| Variable | VIF | Interpretation |
|---|---|---|
| age_cont | 1.05 | Low (No concern) |
| current_smoker | 1.05 | Low (No concern) |
| phys_active | 1.02 | Low (No concern) |
| sex | 1.01 | Low (No concern) |
| education | 1.01 | Low (No concern) |
| bmi_cat | 1.01 | Low (No concern) |

Influential Observations

Cook’s Distance: Measures how much the model would change if an observation were removed.

  • Cook’s D > 1: Potentially influential observation
# Calculate Cook's distance
cooks_d <- cooks.distance(model_logistic_multiple)

# Create data frame
influence_df <- data.frame(
  observation = 1:length(cooks_d),
  cooks_d = cooks_d
) %>%
  mutate(influential = ifelse(cooks_d > 1, "Yes", "No"))

# Plot
p5 <- ggplot(influence_df, aes(x = observation, y = cooks_d, color = influential)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "red") +
  labs(
    title = "Cook's Distance: Identifying Influential Observations",
    subtitle = "Values > 1 indicate potentially influential observations",
    x = "Observation Number",
    y = "Cook's Distance",
    color = "Influential?"
  ) +
  scale_color_manual(values = c("No" = "steelblue", "Yes" = "red")) +
  theme_minimal(base_size = 12)

ggplotly(p5)

Cook’s Distance for Influential Observations

# Count influential observations
n_influential <- sum(influence_df$influential == "Yes")
cat("Number of potentially influential observations:", n_influential, "\n")
## Number of potentially influential observations: 0

9. Model Comparison and Selection

Comparing Nested Models

Use Likelihood Ratio Test to compare nested models:

# Model 1: Age only
model1 <- glm(diabetes ~ age_cont,
              data = brfss_clean,
              family = binomial)

# Model 2: Age + Sex
model2 <- glm(diabetes ~ age_cont + sex,
              data = brfss_clean,
              family = binomial)

# Model 3: Full model
model3 <- model_logistic_multiple

# Likelihood ratio test
lrt_1_2 <- anova(model1, model2, test = "LRT")
lrt_2_3 <- anova(model2, model3, test = "LRT")

# Create comparison table
model_comp <- data.frame(
  Model = c("Model 1: Age only",
            "Model 2: Age + Sex",
            "Model 3: Full model"),
  AIC = c(AIC(model1), AIC(model2), AIC(model3)),
  BIC = c(BIC(model1), BIC(model2), BIC(model3)),
  `Deviance` = c(deviance(model1), deviance(model2), deviance(model3)),
  check.names = FALSE
)

model_comp %>%
  kable(caption = "Model Comparison: AIC, BIC, and Deviance",
        digits = 2,
        align = "lrrr") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE) %>%
  row_spec(which.min(model_comp$AIC), bold = TRUE, background = "#d4edda")
Model Comparison: AIC, BIC, and Deviance

| Model | AIC | BIC | Deviance |
|---|---|---|---|
| Model 1: Age only | 1175.08 | 1185.39 | 1171.08 |
| Model 2: Age + Sex | 1175.85 | 1191.32 | 1169.85 |
| Model 3: Full model | 1122.65 | 1179.36 | 1100.65 |

Interpretation:

  • Lower AIC/BIC indicates better model fit
  • Model 3 (full model) has the lowest AIC, suggesting it provides the best fit to the data
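For these binary (0/1) outcome models, the AIC works out to the residual deviance plus twice the number of estimated parameters, \(\text{AIC} = \text{Deviance} + 2k\). For example, Model 1 estimates \(k = 2\) parameters (intercept and age), so \(1171.08 + 2 \times 2 = 1175.08\), matching the table. BIC replaces \(2k\) with \(k \log(n)\), penalizing additional parameters more heavily in large samples.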

10. Error Term in Statistical Models

All statistical models include an error term (\(\epsilon\)) to account for:

  • Random variation in the outcome
  • Unmeasured variables not included in the model
  • Measurement error in variables

\[Y = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon\]

Key points:

  • The model cannot perfectly predict every outcome
  • The difference between observed and predicted values is the error (residual)
  • We assume errors are normally distributed with mean 0 (for linear regression)
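A small sketch of this idea, using the simple linear model fitted earlier: the residuals are simply the observed outcomes minus the fitted values.

# Residual = observed outcome - fitted (predicted) value
head(residuals(model_linear_simple))
head(brfss_clean$diabetes - fitted(model_linear_simple))   # the same numbers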

Part 2: Student Lab Activity

Lab Overview

In this lab, you will:

  1. Build your own logistic regression model predicting hypertension (high blood pressure)
  2. Create dummy variables for categorical predictors
  3. Interpret regression coefficients
  4. Test for confounding and interaction
  5. Perform model diagnostics

Lab Instructions

Task 1: Explore the Outcome Variable

# YOUR CODE HERE: Create a frequency table of hypertension status
brfss <- readRDS("brfss_subset_2023.rds")
names(brfss)
##  [1] "diabetes"       "age_group"      "age_cont"       "sex"           
##  [5] "race"           "education"      "income"         "bmi_cat"       
##  [9] "phys_active"    "current_smoker" "gen_health"     "hypertension"  
## [13] "high_chol"
table(brfss$hypertension, useNA = "ifany")
## 
##   0   1 
## 606 675
# YOUR CODE HERE: Calculate the prevalence of hypertension by age group
table(brfss$age_group)
## 
## 18-24 25-34 35-44 45-54 55-64   65+ 
##    12    77   138   161   266   627
prev <- prop.table(
  table(brfss$age_group, brfss$hypertension),
  margin = 1
) * 100

round(prev, 1)
##        
##            0    1
##   18-24 91.7  8.3
##   25-34 80.5 19.5
##   35-44 69.6 30.4
##   45-54 62.1 37.9
##   55-64 48.5 51.5
##   65+   33.2 66.8
library(dplyr)

brfss %>%
  group_by(age_group) %>%
  summarise(
    n = n(),
    hypertension_cases = sum(hypertension == 1, na.rm = TRUE),
    prevalence_percent = round(mean(hypertension == 1, na.rm = TRUE) * 100, 1)
  )
## # A tibble: 6 × 4
##   age_group     n hypertension_cases prevalence_percent
##   <fct>     <int>              <int>              <dbl>
## 1 18-24        12                  1                8.3
## 2 25-34        77                 15               19.5
## 3 35-44       138                 42               30.4
## 4 45-54       161                 61               37.9
## 5 55-64       266                137               51.5
## 6 65+         627                419               66.8

Questions:

  1. What is the overall prevalence of hypertension in the dataset?
    The overall prevalence of hypertension is 675/1281 = 52.7%.
  2. How does hypertension prevalence vary by age group?
    Hypertension prevalence increased markedly with age, rising from 8.3% among adults aged 18–24 to 66.8% among those aged 65 years and older, demonstrating a strong age-related gradient.

Task 2: Build a Simple Logistic Regression Model

# YOUR CODE HERE: Fit a simple logistic regression model
# Outcome: hypertension
# Predictor: age_cont
model1 <- glm(hypertension ~ age_cont,
              data = brfss,
              family = binomial(link = "logit"))
summary(model1)
## 
## Call:
## glm(formula = hypertension ~ age_cont, family = binomial(link = "logit"), 
##     data = brfss)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.042577   0.295584  -10.29   <2e-16 ***
## age_cont     0.053119   0.004831   11.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1772.1  on 1280  degrees of freedom
## Residual deviance: 1632.6  on 1279  degrees of freedom
## AIC: 1636.6
## 
## Number of Fisher Scoring iterations: 4
# YOUR CODE HERE: Display the results with odds ratios
exp(coef(model1))
## (Intercept)    age_cont 
##  0.04771176  1.05455475
exp(confint(model1))
##                  2.5 %     97.5 %
## (Intercept) 0.02644276 0.08431815
## age_cont    1.04476526 1.06475213
tidy(model1, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 2 × 7
##   term        estimate std.error statistic  p.value conf.low conf.high
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)   0.0477   0.296       -10.3 7.54e-25   0.0264    0.0843
## 2 age_cont      1.05     0.00483      11.0 4.02e-28   1.04      1.06

Questions:

  1. What is the odds ratio for age? Interpret this value. In this model, age was positively associated with hypertension (OR = 1.055). This indicates that each additional year of age is associated with a 5.5% increase in the odds of hypertension.
  2. Is the association statistically significant? Yes, the association is highly statistically significant (p < 0.001).
  3. What is the 95% confidence interval for the odds ratio? The 95% confidence interval for the age odds ratio is 1.045 to 1.065.

Task 3: Create a Multiple Regression Model

# YOUR CODE HERE: Fit a multiple logistic regression model
# Outcome: hypertension
# Predictors: age_cont, sex, bmi_cat, phys_active, current_smoker
model2 <- glm(hypertension ~ age_cont +
                                sex +
                                bmi_cat +
                                phys_active +
                                current_smoker,
              data = brfss,
              family = binomial(link = "logit"))

summary(model2)
## 
## Call:
## glm(formula = hypertension ~ age_cont + sex + bmi_cat + phys_active + 
##     current_smoker, family = binomial(link = "logit"), data = brfss)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -4.806068   0.653465  -7.355 1.91e-13 ***
## age_cont           0.059453   0.005292  11.234  < 2e-16 ***
## sexMale            0.239129   0.122612   1.950 0.051141 .  
## bmi_catNormal      0.740579   0.546292   1.356 0.175212    
## bmi_catOverweight  1.175933   0.542839   2.166 0.030291 *  
## bmi_catObese       1.884828   0.544866   3.459 0.000542 ***
## phys_active       -0.105371   0.130457  -0.808 0.419260    
## current_smoker     0.068533   0.138515   0.495 0.620763    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1772.1  on 1280  degrees of freedom
## Residual deviance: 1563.5  on 1273  degrees of freedom
## AIC: 1579.5
## 
## Number of Fisher Scoring iterations: 4
# YOUR CODE HERE: Display the results
exp(coef(model2))
##       (Intercept)          age_cont           sexMale     bmi_catNormal 
##       0.008179959       1.061255783       1.270142112       2.097150060 
## bmi_catOverweight      bmi_catObese       phys_active    current_smoker 
##       3.241164895       6.585220088       0.899990714       1.070935933
exp(confint(model2))
##                         2.5 %      97.5 %
## (Intercept)       0.002105268  0.02803472
## age_cont          1.050496837  1.07253490
## sexMale           0.998922794  1.61567286
## bmi_catNormal     0.759395421  6.75617644
## bmi_catOverweight 1.182648040 10.38462655
## bmi_catObese      2.394090483 21.17598499
## phys_active       0.696650987  1.16203458
## current_smoker    0.816955023  1.40654285
results <- tidy(model2, exponentiate = TRUE, conf.int = TRUE)
results
## # A tibble: 8 × 7
##   term              estimate std.error statistic  p.value conf.low conf.high
##   <chr>                <dbl>     <dbl>     <dbl>    <dbl>    <dbl>     <dbl>
## 1 (Intercept)        0.00818   0.653      -7.35  1.91e-13  0.00211    0.0280
## 2 age_cont           1.06      0.00529    11.2   2.79e-29  1.05       1.07  
## 3 sexMale            1.27      0.123       1.95  5.11e- 2  0.999      1.62  
## 4 bmi_catNormal      2.10      0.546       1.36  1.75e- 1  0.759      6.76  
## 5 bmi_catOverweight  3.24      0.543       2.17  3.03e- 2  1.18      10.4   
## 6 bmi_catObese       6.59      0.545       3.46  5.42e- 4  2.39      21.2   
## 7 phys_active        0.900     0.130      -0.808 4.19e- 1  0.697      1.16  
## 8 current_smoker     1.07      0.139       0.495 6.21e- 1  0.817      1.41

Questions:

  1. How did the odds ratio for age change after adjusting for other variables? The odds ratio for age increased slightly from 1.055 in the crude model to 1.061 in the adjusted model.
  2. What does this suggest about confounding? This small change (<10%) suggests there is minimal evidence of confounding by sex, BMI, physical activity, or smoking. Age appears to be independently associated with hypertension.
  3. Which variables are the strongest predictors of hypertension? The strongest predictors of hypertension were the BMI categories, particularly obesity (OR = 6.59) and overweight (OR = 3.24), followed by increasing age (OR = 1.06 per year). These variables showed statistically significant associations and the largest effect sizes.

Task 4: Interpret Dummy Variables

# YOUR CODE HERE: Create a table showing the dummy variable coding for bmi_cat
bmi_dummies <- model.matrix(~ bmi_cat, data = brfss)
head(bmi_dummies)
##   (Intercept) bmi_catNormal bmi_catOverweight bmi_catObese
## 1           1             0                 0            1
## 2           1             0                 0            1
## 3           1             1                 0            0
## 4           1             1                 0            0
## 5           1             0                 1            0
## 6           1             1                 0            0
dummy_table <- as.data.frame(bmi_dummies)
unique(data.frame(
  bmi_cat = brfss$bmi_cat,
  model.matrix(~ bmi_cat, data = brfss)[, -1]
))
##         bmi_cat bmi_catNormal bmi_catOverweight bmi_catObese
## 1         Obese             0                 0            1
## 3        Normal             1                 0            0
## 5    Overweight             0                 1            0
## 123 Underweight             0                 0            0
# YOUR CODE HERE: Extract and display the odds ratios for BMI categories
or <- exp(coef(model2))
ci <- exp(confint(model2))

bmi_table <- data.frame(
  Variable = names(or),
  OR = or,
  CI_lower = ci[,1],
  CI_upper = ci[,2]
)

bmi_table[grep("bmi_cat", bmi_table$Variable), ]
##                            Variable       OR  CI_lower  CI_upper
## bmi_catNormal         bmi_catNormal 2.097150 0.7593954  6.756176
## bmi_catOverweight bmi_catOverweight 3.241165 1.1826480 10.384627
## bmi_catObese           bmi_catObese 6.585220 2.3940905 21.175985

Questions:

  1. What is the reference category for BMI? The reference category is Underweight: it is the level whose dummy variables are all 0 in the model matrix above, so it does not appear as a separate term in the regression output.
  2. Interpret the odds ratio for “Obese” compared to the reference category. The odds ratio for obese individuals was 6.59 (95% CI: 2.39–21.18), meaning obese individuals had approximately 6.6 times higher odds of hypertension compared with the reference BMI group, adjusting for other variables.
  3. How would you explain this to a non-statistician? In simple terms, people with obesity are much more likely to have high blood pressure than people in the reference (underweight) category.

Task 5: Test for Interaction

# YOUR CODE HERE: Fit a model with Age × BMI interaction
# Test if the effect of age on hypertension differs by BMI category
model_interaction <- glm(hypertension ~ age_cont * bmi_cat +
                                        sex +
                                        phys_active +
                                        current_smoker,
                         data = brfss,
                         family = binomial(link = "logit"))

summary(model_interaction)
## 
## Call:
## glm(formula = hypertension ~ age_cont * bmi_cat + sex + phys_active + 
##     current_smoker, family = binomial(link = "logit"), data = brfss)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)                -1.449064   2.558038  -0.566   0.5711  
## age_cont                    0.004922   0.041980   0.117   0.9067  
## bmi_catNormal              -2.703080   2.650288  -1.020   0.3078  
## bmi_catOverweight          -2.623344   2.623875  -1.000   0.3174  
## bmi_catObese               -1.253018   2.590804  -0.484   0.6286  
## sexMale                     0.244929   0.123167   1.989   0.0467 *
## phys_active                -0.112236   0.130761  -0.858   0.3907  
## current_smoker              0.075878   0.138923   0.546   0.5849  
## age_cont:bmi_catNormal      0.055910   0.043458   1.287   0.1983  
## age_cont:bmi_catOverweight  0.061652   0.043089   1.431   0.1525  
## age_cont:bmi_catObese       0.050616   0.042695   1.186   0.2358  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1772.1  on 1280  degrees of freedom
## Residual deviance: 1561.3  on 1270  degrees of freedom
## AIC: 1583.3
## 
## Number of Fisher Scoring iterations: 4
tidy(model_interaction, exponentiate = TRUE, conf.int = TRUE)
## # A tibble: 11 × 7
##    term                  estimate std.error statistic p.value conf.low conf.high
##    <chr>                    <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 (Intercept)             0.235     2.56      -0.566  0.571  0.000432     23.3 
##  2 age_cont                1.00      0.0420     0.117  0.907  0.930         1.11
##  3 bmi_catNormal           0.0670    2.65      -1.02   0.308  0.000549     40.7 
##  4 bmi_catOverweight       0.0726    2.62      -1.000  0.317  0.000632     42.7 
##  5 bmi_catObese            0.286     2.59      -0.484  0.629  0.00268     162.  
##  6 sexMale                 1.28      0.123      1.99   0.0467 1.00          1.63
##  7 phys_active             0.894     0.131     -0.858  0.391  0.691         1.15
##  8 current_smoker          1.08      0.139      0.546  0.585  0.822         1.42
##  9 age_cont:bmi_catNorm…   1.06      0.0435     1.29   0.198  0.956         1.15
## 10 age_cont:bmi_catOver…   1.06      0.0431     1.43   0.152  0.962         1.15
## 11 age_cont:bmi_catObese   1.05      0.0427     1.19   0.236  0.952         1.14
# YOUR CODE HERE: Perform a likelihood ratio test comparing models with and without interaction
anova(model2, model_interaction, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: hypertension ~ age_cont + sex + bmi_cat + phys_active + current_smoker
## Model 2: hypertension ~ age_cont * bmi_cat + sex + phys_active + current_smoker
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      1273     1563.5                     
## 2      1270     1561.3  3   2.2363   0.5248
# Visualization: hypertension by age and BMI category (separate smoothed logistic fits per BMI group)
ggplot(brfss, aes(x = age_cont, y = hypertension, color = bmi_cat)) +
  geom_smooth(method = "glm", method.args = list(family = "binomial")) +
  labs(y = "Predicted probability of hypertension",
       x = "Age")

Questions:

  1. Is the interaction term statistically significant? No: none of the individual age × BMI terms reach significance, and the likelihood ratio test comparing the models with and without the interaction gives p = 0.525.
  2. What does this mean in epidemiologic terms (effect modification)? The interaction between age and BMI was not statistically significant (p = 0.525), indicating no evidence of effect modification. This suggests that the association between age and hypertension is consistent across BMI categories.
  3. Create a visualization showing predicted probabilities by age and BMI category. Done; see the plot above (and the model-based alternative sketched after the plotting code).

Predicted probability plots show that hypertension risk increases with age across all BMI categories. Individuals in higher BMI categories have consistently higher predicted probabilities, but the slopes are similar, supporting the absence of interaction between age and BMI.


Task 6: Model Diagnostics

# YOUR CODE HERE: Calculate VIF for your multiple regression model
vif(model2)
##                    GVIF Df GVIF^(1/(2*Df))
## age_cont       1.126628  1        1.061428
## sex            1.016509  1        1.008221
## bmi_cat        1.103045  3        1.016480
## phys_active    1.024820  1        1.012334
## current_smoker 1.073574  1        1.036134
# YOUR CODE HERE: Create a Cook's distance plot to identify influential observations
cooks_d <- cooks.distance(model2)
head(sort(cooks_d, decreasing = TRUE))
##         302         213         523         123         712         970 
## 0.033059311 0.025006460 0.023492319 0.017800470 0.016477887 0.007823496
plot(cooks_d,
     type = "h",
     main = "Cook's Distance Plot",
     xlab = "Observation Number",
     ylab = "Cook's Distance")

abline(h = 4/length(cooks_d), col = "red", lty = 2)

cutoff <- 4/length(cooks_d)

which(cooks_d > cutoff)
##  123  213  242  246  270  302  474  510  523  547  583  610  683  712  720  759 
##  123  213  242  246  270  302  474  510  523  547  583  610  683  712  720  759 
##  806  873  950  970  992 1080 
##  806  873  950  970  992 1080

Questions:

  1. Are there any concerns about multicollinearity? There were no concerns about multicollinearity because all VIF values were close to 1 and well below the threshold of 5.
  2. Are there any influential observations that might affect your results? Several observations exceeded the Cook’s distance cutoff, but all Cook’s distance values were small, indicating no highly influential observations that would substantially affect the model results.
  3. What would you do if you found serious violations? If serious violations were detected, I would examine the data for errors, consider removing or combining highly correlated variables, and perform sensitivity analyses to assess the impact of influential observations before deciding whether to exclude them.

Task 7: Model Comparison

# YOUR CODE HERE: Compare three models using AIC and BIC
# Model A: Age only
model_A <- glm(hypertension ~ age_cont,
               data = brfss,
               family = binomial)

# Model B: Age + sex + bmi_cat
model_B <- glm(hypertension ~ age_cont + sex + bmi_cat,
               data = brfss,
               family = binomial)

#  Model C: Age + sex + bmi_cat + phys_active + current_smoker
model_C <- glm(hypertension ~ age_cont + sex + bmi_cat +
                                 phys_active + current_smoker,
               data = brfss,
               family = binomial)


# YOUR CODE HERE: Create a comparison table
model_comparison <- data.frame(
  Model = c("Model A: Age only",
            "Model B: Age + sex + BMI",
            "Model C: Full model"),
  AIC = c(AIC(model_A), AIC(model_B), AIC(model_C)),
  BIC = c(BIC(model_A), BIC(model_B), BIC(model_C))
)

model_comparison
##                      Model      AIC      BIC
## 1        Model A: Age only 1636.613 1646.924
## 2 Model B: Age + sex + BMI 1576.487 1607.419
## 3      Model C: Full model 1579.496 1620.739

Questions:

  1. Which model has the best fit based on AIC? Model B (Age + sex + BMI) had the lowest AIC (1576.5), indicating the best model fit among the three models.
  2. Is the added complexity of the full model justified? The added complexity of Model C was not justified because it had higher AIC and BIC values, suggesting that including physical activity and smoking did not significantly improve model performance.
  3. Which model would you choose for your final analysis? Why? Model B would be chosen for the final analysis because it provides the best balance between model fit and simplicity, while including the key predictors associated with hypertension

Lab Report Guidelines

Write a brief report (1-2 pages) summarizing your findings:

  1. Introduction: State your research question. What demographic and lifestyle factors, particularly age and body mass index (BMI), are associated with hypertension among adults in the BRFSS 2023 dataset?
  2. Methods: Describe your analytic approach. Logistic regression was used to model hypertension (yes/no). Three models were compared: age only, age + sex + BMI, and a full model including physical activity and smoking. Odds ratios (OR) and 95% confidence intervals were calculated. Model fit was compared using AIC and BIC. Multicollinearity was assessed using VIF, and influential observations were evaluated using Cook’s distance.
  3. Results: Present key findings with tables and figures. Age was significantly associated with hypertension (OR ≈ 1.06 per year). Obesity was the strongest predictor (OR ≈ 6.6), and overweight individuals also had higher odds (OR ≈ 3.2). Sex, smoking, and physical activity were not statistically significant, and there was no significant interaction between age and BMI (p = 0.525).
  4. Interpretation: Explain what your results mean. Older age and higher BMI were strongly and independently associated with hypertension; obese individuals had much higher odds compared to the reference group. The best-fitting model included age, sex, and BMI.
  5. Limitations: Discuss potential issues with your analysis. This was a cross-sectional study, so causality cannot be determined, and some variables were self-reported, which may introduce bias.

Submission: Submit your completed R Markdown file and knitted HTML report.


Summary

Key Concepts Covered

  1. Statistical modeling describes relationships between variables
  2. Regression types depend on the outcome variable type
  3. Logistic regression is appropriate for binary outcomes
  4. Multiple regression controls for confounding
  5. Dummy variables represent categorical predictors
  6. Interactions test for effect modification
  7. Model diagnostics check assumptions and identify problems
  8. Model comparison helps select the best model

Important Formulas

Logistic Regression:

\[\text{logit}(p) = \log\left(\frac{p}{1-p}\right) = \beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p\]

Odds Ratio:

\[\text{OR} = e^{\beta_i}\]

Predicted Probability:

\[p = \frac{e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}{1 + e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}}\]


References

  • Agresti, A. (2018). An Introduction to Categorical Data Analysis (3rd ed.). Wiley.
  • Hosmer, D. W., Lemeshow, S., & Sturdivant, R. X. (2013). Applied Logistic Regression (3rd ed.). Wiley.
  • Vittinghoff, E., Glidden, D. V., Shiboski, S. C., & McCulloch, C. E. (2012). Regression Methods in Biostatistics (2nd ed.). Springer.
  • Centers for Disease Control and Prevention. (2023). Behavioral Risk Factor Surveillance System.

Session Info

sessionInfo()
## R version 4.5.0 (2025-04-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Russian_Kazakhstan.utf8  LC_CTYPE=Russian_Kazakhstan.utf8   
## [3] LC_MONETARY=Russian_Kazakhstan.utf8 LC_NUMERIC=C                       
## [5] LC_TIME=Russian_Kazakhstan.utf8    
## 
## time zone: Asia/Qyzylorda
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggstats_0.12.0   gtsummary_2.5.0  ggeffects_2.3.2  car_3.1-5       
##  [5] carData_3.0-6    broom_1.0.12     plotly_4.12.0    kableExtra_1.4.0
##  [9] knitr_1.51       haven_2.5.5      lubridate_1.9.4  forcats_1.0.1   
## [13] stringr_1.5.1    dplyr_1.1.4      purrr_1.2.1      readr_2.1.6     
## [17] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.2    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         xfun_0.56            bslib_0.10.0        
##  [4] htmlwidgets_1.6.4    insight_1.4.6        lattice_0.22-6      
##  [7] tzdb_0.5.0           crosstalk_1.2.2      vctrs_0.6.5         
## [10] tools_4.5.0          generics_0.1.4       datawizard_1.3.0    
## [13] pkgconfig_2.0.3      Matrix_1.7-3         data.table_1.18.2.1 
## [16] RColorBrewer_1.1-3   S7_0.2.1             lifecycle_1.0.5     
## [19] compiler_4.5.0       farver_2.1.2         textshaping_1.0.4   
## [22] janitor_2.2.1        codetools_0.2-20     snakecase_0.11.1    
## [25] htmltools_0.5.9      sass_0.4.10          yaml_2.3.10         
## [28] lazyeval_0.2.2       Formula_1.2-5        pillar_1.11.1       
## [31] jquerylib_0.1.4      broom.helpers_1.22.0 cachem_1.1.0        
## [34] abind_1.4-8          nlme_3.1-168         tidyselect_1.2.1    
## [37] digest_0.6.37        stringi_1.8.7        labeling_0.4.3      
## [40] splines_4.5.0        labelled_2.16.0      fastmap_1.2.0       
## [43] grid_4.5.0           cli_3.6.5            magrittr_2.0.3      
## [46] cards_0.7.1          utf8_1.2.6           withr_3.0.2         
## [49] scales_1.4.0         backports_1.5.0      timechange_0.4.0    
## [52] rmarkdown_2.30       httr_1.4.7           otel_0.2.0          
## [55] hms_1.1.4            evaluate_1.0.3       viridisLite_0.4.2   
## [58] mgcv_1.9-1           rlang_1.1.6          glue_1.8.0          
## [61] xml2_1.5.2           svglite_2.2.2        rstudioapi_0.18.0   
## [64] jsonlite_2.0.0       R6_2.6.1             systemfonts_1.3.1