Section 1: Analytical Sample Update

Setup and Data Preparation

# Load required packages
library(tidyr)
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)
library(janitor)
library(broom.helpers)
library(ggplot2)
library(survey)
library(purrr)
library(stringr)

Loading BRFSS 2024 Data

The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.

# Load the full BRFSS 2024 dataset
brfss_full <- read_xpt("LLCP2024.XTP") |> 
  janitor::clean_names()

# Display dataset dimensions
names(brfss_full)

##   [1] "state"    "fmonth"   "idate"    "imonth"   "iday"     "iyear"   
##   [7] "dispcode" "seqno"    "psu"      "ctelenm1" "pvtresd1" "colghous"
##  [13] "statere1" "celphon1" "ladult1"  "numadult" "respslc1" "landsex3"
##  [19] "safetime" "ctelnum1" "cellfon5" "cadult1"  "cellsex3" "pvtresd3"
##  [25] "cclghous" "cstate1"  "landline" "hhadult"  "sexvar"   "genhlth" 
##  [31] "physhlth" "menthlth" "poorhlth" "primins2" "persdoc3" "medcost1"
##  [37] "checkup1" "exerany2" "lastden4" "rmvteth4" "cvdinfr4" "cvdcrhd4"
##  [43] "cvdstrk3" "asthma3"  "asthnow"  "chcscnc1" "chcocnc1" "chccopd3"
##  [49] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital" 
##  [55] "educa"    "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
##  [61] "employ1"  "children" "income3"  "pregnant" "weight2"  "height3" 
##  [67] "deaf"     "blind"    "decide"   "diffwalk" "diffdres" "diffalon"
##  [73] "hadmam"   "howlong"  "cervscrn" "crvclcnc" "crvclpap" "crvclhpv"
##  [79] "hadhyst2" "hadsigm4" "colnsigm" "colntes1" "sigmtes1" "lastsig4"
##  [85] "colncncr" "vircolo1" "vclntes2" "smalstol" "stoltest" "stooldn2"
##  [91] "bldstfit" "sdnates1" "smoke100" "smokday2" "usenow3"  "ecignow3"
##  [97] "lcsfirst" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "alcday4" 
## [103] "avedrnk4" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3" "imfvpla5"
## [109] "pneuvac4" "hivtst7"  "hivtstd3" "hivrisk5" "pdiabts1" "prediab2"
## [115] "diabtype" "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1"
## [121] "feetsore" "arthexer" "shingle2" "hpvadvc4" "hpvadsh1" "tetanus1"
## [127] "cncrdiff" "cncrage"  "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum" 
## [133] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [139] "csrvctl2" "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2"
## [145] "cimemlo1" "cdworry"  "cddiscu1" "cdhous1"  "cdsocia1" "caregiv1"
## [151] "crgvrel5" "crgvprb4" "crgvalzd" "crgvnurs" "crgvper2" "crgvhou2"
## [157] "crgvhrs2" "crgvlng2" "acedeprs" "acedrink" "acedrugs" "aceprisn"
## [163] "acedivrc" "acepunch" "acehurt1" "aceswear" "acetouch" "acetthem"
## [169] "acehvsex" "aceadsaf" "aceadned" "lsatisfy" "emtsuprt" "sdlonely"
## [175] "sdhemply" "foodstmp" "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp"
## [181] "howsafe1" "marijan1" "marjsmok" "marjeat"  "marjvape" "marjdab" 
## [187] "marjothr" "usemrjn4" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [193] "heattbco" "ssbsugr2" "ssbfrut3" "firearm5" "gunload"  "loadulk2"
## [199] "rcsgend1" "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "somale"  
## [205] "sofemale" "hadsex"   "pfpprvn4" "typcntr9" "nobcuse8" "qstver"  
## [211] "qstlang"  "hpvdsht"  "icfqstvr" "metstat"  "urbstat"  "mscode"  
## [217] "ststr"    "strwt"    "rawrake"  "wt2rake"  "imprace"  "chispnc" 
## [223] "crace1"   "cageg"    "cllcpwt"  "dualuse"  "dualcor"  "llcpwt2" 
## [229] "llcpwt"   "rfhlth"   "phys14d"  "ment14d"  "hlthpl2"  "hcvu654" 
## [235] "totinda"  "exteth3"  "alteth3"  "denvst3"  "michd"    "ltasth1" 
## [241] "casthm1"  "asthms1"  "drdxar2"  "mrace1"   "hispanc"  "race"    
## [247] "raceg21"  "racegr3"  "raceprv"  "sex"      "ageg5yr"  "age65yr" 
## [253] "age80"    "age_g"    "htin4"    "htm4"     "wtkg3"    "bmi5"    
## [259] "bmi5cat"  "rfbmi5"   "chldcnt"  "educag"   "incomg1"  "rfmam23" 
## [265] "mam402y"  "crvscrn"  "rfpap37"  "hpv5yr1"  "paphpv1"  "hadcoln" 
## [271] "clnscp2"  "hadsigm"  "sgmscp2"  "sgms102"  "rfblds6"  "stoldn2" 
## [277] "vircol2"  "sbonti2"  "crcrec3"  "smoker3"  "rfsmok3"  "cureci3" 
## [283] "lcslast"  "lcsnumc"  "lcsage"   "lcsysmk"  "packday"  "packyrs" 
## [289] "lcsyqts"  "lcssmkg"  "lcselig"  "lcsctsn"  "lcspstf"  "drnkany6"
## [295] "drocdy4"  "rfbing6"  "drnkwk3"  "rfdrhv9"  "flshot7"  "pneumo3" 
## [301] "aidtst4"

# Display full dataset dimensions
cat("Full Data Dimensions:",
    nrow(brfss_full), "observations,",
    ncol(brfss_full), "variables\n")

## Full Data Dimensions: 457670 observations, 301 variables

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_full), ncol(brfss_full))) |> 
  kable(caption = "Full Dataset Dimensions") |> 
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Full Dataset Dimensions
Metric	Value
Observations	457670
Variables	301

Creating a Working Subset

# Select variables of interest and create analytic dataset
brfss_subset <- brfss_full |> 
  select(
    # Outcome: Have you ever had a cervical cancer screening test
    cervscrn,
    # Demographics
    ageg5yr,      # Age category, fourteen-level age category
    sexvar,         # Sex
    race,       # Race/ethnicity
    educag,     # Education level
    employ1,    # Employment status 
    # Health status
    genhlth,    # General health
    incomg1,    # Income category
    urbstat,    # Urban/Rural Status
    primins2,   # Primary source of health care coverage
    persdoc3,   # Personal health care provider
    medcost1,   # Delayed care due to cost
    checkup1,   # Time since last routine checkup
    llcpwt,
    ststr,
    psu
  )

# Display full dataset dimensions
cat("Full Subset Dimensions:",
    nrow(brfss_subset), "observations,",
    ncol(brfss_subset), "variables\n")

## Full Subset Dimensions: 457670 observations, 16 variables

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_subset), ncol(brfss_subset))) |> 
  kable(caption = "Full Subset Dataset Dimensions") |> 
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Full Subset Dataset Dimensions
Metric	Value
Observations	457670
Variables	16

# Filter for only females
brfss_female <- brfss_subset |> 
  filter(
    sexvar == 2)         # 2 = Female in BRFSS

# Display first exclusion dataset dimensions
names(brfss_female)

##  [1] "cervscrn" "ageg5yr"  "sexvar"   "race"     "educag"   "employ1" 
##  [7] "genhlth"  "incomg1"  "urbstat"  "primins2" "persdoc3" "medcost1"
## [13] "checkup1" "llcpwt"   "ststr"    "psu"

# Display first exclusion dataset dimensions
cat("First Exclusion Data dimensions:",
    nrow(brfss_female), "observations,",
    ncol(brfss_female), "variables\n")

## First Exclusion Data dimensions: 240183 observations, 16 variables

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_female), ncol(brfss_female))) |> 
  kable(caption = "First Exclusion Dataset Dimensions") |> 
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

First Exclusion Dataset Dimensions
Metric	Value
Observations	240183
Variables	16

# Filter for age (25-64)
brfss_filtered <- brfss_female |>  
  filter(
    ageg5yr %in% 2:9)     # 1–9 correspond to ages 25–64

# Display second exclusion dataset dimensions
names(brfss_filtered)

##  [1] "cervscrn" "ageg5yr"  "sexvar"   "race"     "educag"   "employ1" 
##  [7] "genhlth"  "incomg1"  "urbstat"  "primins2" "persdoc3" "medcost1"
## [13] "checkup1" "llcpwt"   "ststr"    "psu"

# Display second exclusion dataset dimensions
cat("Second Exclusion Data dimensions:",
    nrow(brfss_filtered), "observations,",
    ncol(brfss_filtered), "variables\n")

## Second Exclusion Data dimensions: 128221 observations, 16 variables

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_filtered), ncol(brfss_filtered))) |> 
  kable(caption = "Second Exclusion Dataset Dimensions") |> 
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Second Exclusion Dataset Dimensions
Metric	Value
Observations	128221
Variables	16

# Recode variables
brfss_recode <- brfss_filtered |>  
 mutate(
    cervicalscr_bin = ifelse(cervscrn == 1, 1, 0),
    cervicalscr_fac = factor(cervicalscr_bin,
                             levels = c(0, 1),
                             labels = c("Not screened", "Screened")), 
   age = factor(case_when(
      ageg5yr == 2 ~ "25-29",
      ageg5yr == 3 ~ "30-34",
      ageg5yr == 4 ~ "35-39",
      ageg5yr == 5 ~ "40-44",
      ageg5yr == 6 ~ "45-49",
      ageg5yr == 7 ~ "50-54",
      ageg5yr == 8 ~ "55-59",
      ageg5yr == 9 ~ "60-64"), 
      levels = c("25-29","30-34","35-39","40-44", "45-49","50-54","55-59","60-64")),
    sex = factor("Female", 
                 levels = "Female"),
    education = factor(case_when(
      educag == 1 ~ "Less than high school",
      educag == 2 ~ "High school graduate",
      educag == 3 ~ "Some college",
      educag == 4 ~ "College graduate"), 
      levels = c("Less than high school", "High school graduate",
                  "Some college", "College graduate")),
    race_cat = factor(case_when(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "American Indian or Alaska Native",
      race == 4 ~ "Asian",
      race == 5 ~ "Native Hawaiian or other Pacific Islander",
      race == 7 ~ "Multiracial",
      race == 8 ~ "Hispanic"), 
      levels = c("White",
      "Black",
      "American Indian or Alaska Native",
      "Asian",
      "Native Hawaiian or other Pacific Islander",
      "Multiracial",
      "Hispanic")),
    employ1_cat = factor(case_when(
      employ1 == 1 ~ "Employed for wages",
      employ1 == 2 ~ "Self-employed",
      employ1 == 3 ~ "Out of work for 1 year or more",
      employ1 == 4 ~ "Out of work for less than 1 year",
      employ1 == 5 ~ "A homemaker",
      employ1 == 6 ~ "A student",
      employ1 == 7 ~ "Retired",
      employ1 == 8 ~ "Unable to work"), 
      levels = c("Employed for wages", "Self-employed",
      "Out of work for 1 year or more", "Out of work for less than 1 year",
      "A homemaker", "A student", "Retired", "Unable to work")),
    gen_health = factor(case_when(
      genhlth == 1 ~ "Excellent",
      genhlth == 2 ~ "Very Good",
      genhlth == 3 ~ "Good", 
      genhlth == 4 ~ "Fair",
      genhlth == 5 ~ "Poor"), 
      levels = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
    income = factor(case_when(
      incomg1 == 1 ~ "Less than $15,000",
      incomg1 == 2 ~ "$15,000 to < $25,000",
      incomg1 == 3 ~ "$25,000 to < $35,000",
      incomg1 == 4 ~ "$35,000 to < $50,000",
      incomg1 == 5 ~ "$50,000 to < $100,000", 
      incomg1 == 6 ~ "$100,000 to < $200,000", 
      incomg1 == 7 ~ "More than $200,000"), 
      levels = c("Less than $15,000", "$15,000 to < $25,000", "$25,000 to < $35,000", "$35,000 to < $50,000", "$50,000 to < $100,000", "$100,000 to < $200,000", "More than $200,000")), 
    urbstat1 = factor(case_when(
      urbstat == 1 ~ "Urban County",
      urbstat == 2 ~ "Rural County"),
      levels = c("Urban County", "Rural County")),
    priminsur = factor(case_when(
      primins2 == 1 ~ "Employer/Union Insurance",
      primins2 == 2 ~ "Private Nongovernmental Insurance",
      primins2 == 3 ~ "Medicare",
      primins2 == 4 ~ "Medigap",
      primins2 == 5 ~ "Medicaid",
      primins2 == 6 ~ "Children's Health Insurance Program",
      primins2 == 7 ~ "Military Related Health Care",
      primins2 == 8 ~ "Indian Health Service",
      primins2 == 9 ~ "State Sponsored Health Plan",
      primins2 == 10 ~ "Other Governmental Program",
      primins2 == 88 ~ "No Coverage of Any Type"), 
      levels = c("Employer/Union Insurance", "Private Nongovernmental Insurance", "Medicare", "Medigap", "Medicaid", "Children's Health Insurance Program", "Military Related Health Care", "Indian Health Service", "State Sponsored Health Plan",  "Other Governmental Program", "No Coverage of Any Type")),
   persdoc = factor(case_when(
      persdoc3 == 3 ~ "No",
      persdoc3 == 1 ~ "Yes only one",
      persdoc3 == 2 ~ "More than one"), 
      levels = c("No", "Yes only one", "More than one")),
    medcost = factor(case_when(
      medcost1 == 2 ~ "No", 
      medcost1 == 1 ~ "Yes"),
      levels = c("No", "Yes")), 
    checkup = factor(case_when(
      checkup1 == 8 ~ "Never",
      checkup1 == 1 ~ "Within past year",
      checkup1 == 2 ~ "Within past 2 years",
      checkup1 == 3 ~ "Within past 5 years", 
      checkup1 == 4 ~ "5 or more years ago",
      checkup1 == 7 ~ "Don't Know/Not Sure"),
      levels = c("Never", "Within past year", "Within past 2 years", "Within Past 5 years", "5 or more years ago", "Don't Know/Not Sure")))

brfss_recode <- brfss_recode |>
  mutate(
    cervicalscr_fac = relevel(cervicalscr_fac, ref = "Not screened"),
    age = relevel(age, ref = "25-29"),
    education = relevel(education, ref = "Less than high school"),
    race_cat = relevel(race_cat, ref = "White"),
    employ1_cat = relevel(employ1_cat, ref = "Employed for wages"),
    gen_health = relevel(gen_health, ref = "Excellent"),
    income = relevel(income, ref = "Less than $15,000"),
    urbstat1 = relevel(urbstat1, ref = "Urban County"),
    priminsur = relevel(priminsur, ref = "Employer/Union Insurance"),
    persdoc = relevel(persdoc, ref = "No"),
    medcost = relevel(medcost, ref = "No"),
    checkup = relevel(checkup, ref = "Within past year"))
  
 # Select only the recoded variables
  brfss_recode2 <- brfss_recode |>  
    select(cervicalscr_bin, cervicalscr_fac, age, sex, education, race_cat, employ1_cat, gen_health, income, urbstat1, priminsur, persdoc, medcost, checkup, llcpwt, ststr, psu)

  # Count missing values in each column
missing_summary <- data.frame(
  Variable = names(brfss_recode2),
  Missing_Count = colSums(is.na(brfss_recode2)),
  Missing_Percent = round(colSums(is.na(brfss_recode2)) / nrow(brfss_recode2) * 100, 2)
)

# Show variables with the most missing data
print(missing_summary[order(-missing_summary$Missing_Count), ][1:14, ]) |> 
  kable(
    caption = "Variables by Missingness"
  ) |> 
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  )

##                        Variable Missing_Count Missing_Percent
## income                   income         19076           14.88
## cervicalscr_bin cervicalscr_bin          7644            5.96
## cervicalscr_fac cervicalscr_fac          7644            5.96
## checkup                 checkup          6352            4.95
## urbstat1               urbstat1          5044            3.93
## priminsur             priminsur          3494            2.72
## race_cat               race_cat          2605            2.03
## employ1_cat         employ1_cat          2230            1.74
## persdoc                 persdoc           924            0.72
## education             education           422            0.33
## medcost                 medcost           400            0.31
## gen_health           gen_health           259            0.20
## age                         age             0            0.00
## sex                         sex             0            0.00

Variables by Missingness
	Variable	Missing_Count	Missing_Percent
income	income	19076	14.88
cervicalscr_bin	cervicalscr_bin	7644	5.96
cervicalscr_fac	cervicalscr_fac	7644	5.96
checkup	checkup	6352	4.95
urbstat1	urbstat1	5044	3.93
priminsur	priminsur	3494	2.72
race_cat	race_cat	2605	2.03
employ1_cat	employ1_cat	2230	1.74
persdoc	persdoc	924	0.72
education	education	422	0.33
medcost	medcost	400	0.31
gen_health	gen_health	259	0.20
age	age	0	0.00
sex	sex	0	0.00

  brfss_recode3 <- brfss_recode2 |>  
  # Filter to complete cases only
  drop_na()
  
  set.seed(553)  # For reproducibility
  # Sample 50000 observations for manageable analysis
  brfss_analytic <- brfss_recode3 |>  
  slice_sample(n = 50000)
  
  # Save for later use
saveRDS(brfss_analytic,
  "brfss_final_project_2024.rds")

  # Read in the Analytic Dataset
brfss_analytic <- readRDS(
  "brfss_final_project_2024.rds"
)

  # Create survey design 
  options(survey.lonely.psu = "adjust")
  brfss_design <- svydesign(
    ids = ~psu,
    strata = ~ststr,
    weights = ~llcpwt,
    nest = TRUE,
    data = brfss_analytic
    )
  
# Display subset dimensions
cat("Working subset dimensions:",
    nrow(brfss_analytic), "observations,",
    ncol(brfss_analytic), "variables\n")

## Working subset dimensions: 50000 observations, 17 variables

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_analytic), ncol(brfss_analytic))) |> 
  kable(caption = "Analytic Dataset Dimensions") |> 
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Dimensions
Metric	Value
Observations	50000
Variables	17

table(brfss_analytic$race_cat, brfss_analytic$cervicalscr_fac)

##                                            
##                                             Not screened Screened
##   White                                             3196    32399
##   Black                                              875     4047
##   American Indian or Alaska Native                   149      708
##   Asian                                              329      915
##   Native Hawaiian or other Pacific Islander           42      151
##   Multiracial                                        127     1193
##   Hispanic                                          1157     4712

Descriptive Statistics

# Create Unweighted Table 1 Descriptive Statistics
brfss_analytic |>  
  select(cervicalscr_fac, race_cat, age, education, employ1_cat, gen_health, income, urbstat1, priminsur, persdoc, medcost, checkup) |>  
  tbl_summary(
    by = race_cat,   # Stratify Table by Race/Ethnicity
    type = list(medcost ~ "categorical"),
    label = list(
      cervicalscr_fac ~ "Ever Had a Cervical Cancer Screening Test",
      age ~ "Age Category",
      education ~ "Level of Education Completed",
      employ1_cat ~ "Current Employment Status",
      gen_health ~ "General Health Status",
      income ~ "Income Category",
      urbstat1 ~ "Urban/Rural Status",
      priminsur ~ "Current Health Care Coverage",
      persdoc ~ "Personal Health Care Provider",
      medcost ~ "Could not afford to see Doctor (past 12 months)",
      checkup ~ "Time since last routine checkup"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Table 1. Unweighted Descriptive Statistics — BRFSS 2024 Analytic Sample (n = 50,000)") |>  
 modify_footnote(
    all_stat_cols() ~ 
      "Analytic sample includes 50,000 complete-case observations. Missing values were excluded listwise for all variables included in this table."
  ) |> 
  as_flex_table()

Table 1. Unweighted Descriptive Statistics — BRFSS 2024 Analytic Sample (n = 50,000)
Characteristic	N	White N = 35,5951	Black N = 4,9221	American Indian or Alaska Native N = 8571	Asian N = 1,2441	Native Hawaiian or other Pacific Islander N = 1931	Multiracial N = 1,3201	Hispanic N = 5,8691
Ever Had a Cervical Cancer Screening Test	50,000
Not screened		3,196 (9.0%)	875 (18%)	149 (17%)	329 (26%)	42 (22%)	127 (9.6%)	1,157 (20%)
Screened		32,399 (91%)	4,047 (82%)	708 (83%)	915 (74%)	151 (78%)	1,193 (90%)	4,712 (80%)
Age Category	50,000
25-29		2,409 (6.8%)	385 (7.8%)	63 (7.4%)	193 (16%)	16 (8.3%)	158 (12%)	797 (14%)
30-34		2,926 (8.2%)	510 (10%)	67 (7.8%)	214 (17%)	29 (15%)	192 (15%)	904 (15%)
35-39		3,760 (11%)	589 (12%)	115 (13%)	190 (15%)	21 (11%)	190 (14%)	891 (15%)
40-44		4,308 (12%)	649 (13%)	107 (12%)	162 (13%)	31 (16%)	192 (15%)	839 (14%)
45-49		4,269 (12%)	640 (13%)	120 (14%)	143 (11%)	20 (10%)	166 (13%)	764 (13%)
50-54		4,844 (14%)	730 (15%)	129 (15%)	139 (11%)	27 (14%)	133 (10%)	660 (11%)
55-59		5,796 (16%)	685 (14%)	132 (15%)	100 (8.0%)	23 (12%)	139 (11%)	543 (9.3%)
60-64		7,283 (20%)	734 (15%)	124 (14%)	103 (8.3%)	26 (13%)	150 (11%)	471 (8.0%)
Level of Education Completed	50,000
Less than high school		885 (2.5%)	182 (3.7%)	57 (6.7%)	15 (1.2%)	8 (4.1%)	46 (3.5%)	1,183 (20%)
High school graduate		6,255 (18%)	1,090 (22%)	208 (24%)	125 (10%)	70 (36%)	237 (18%)	1,468 (25%)
Some college		9,400 (26%)	1,499 (30%)	304 (35%)	186 (15%)	63 (33%)	388 (29%)	1,321 (23%)
College graduate		19,055 (54%)	2,151 (44%)	288 (34%)	918 (74%)	52 (27%)	649 (49%)	1,897 (32%)
Current Employment Status	50,000
Employed for wages		22,378 (63%)	3,169 (64%)	517 (60%)	862 (69%)	110 (57%)	776 (59%)	3,168 (54%)
Self-employed		3,052 (8.6%)	287 (5.8%)	60 (7.0%)	106 (8.5%)	17 (8.8%)	125 (9.5%)	549 (9.4%)
Out of work for 1 year or more		677 (1.9%)	183 (3.7%)	43 (5.0%)	28 (2.3%)	7 (3.6%)	43 (3.3%)	239 (4.1%)
Out of work for less than 1 year		802 (2.3%)	198 (4.0%)	40 (4.7%)	40 (3.2%)	11 (5.7%)	52 (3.9%)	302 (5.1%)
A homemaker		2,650 (7.4%)	144 (2.9%)	49 (5.7%)	87 (7.0%)	19 (9.8%)	106 (8.0%)	968 (16%)
A student		359 (1.0%)	121 (2.5%)	15 (1.8%)	56 (4.5%)	3 (1.6%)	32 (2.4%)	89 (1.5%)
Retired		2,650 (7.4%)	259 (5.3%)	32 (3.7%)	37 (3.0%)	9 (4.7%)	40 (3.0%)	131 (2.2%)
Unable to work		3,027 (8.5%)	561 (11%)	101 (12%)	28 (2.3%)	17 (8.8%)	146 (11%)	423 (7.2%)
General Health Status	50,000
Excellent		5,322 (15%)	580 (12%)	92 (11%)	238 (19%)	32 (17%)	157 (12%)	763 (13%)
Very Good		13,212 (37%)	1,349 (27%)	221 (26%)	423 (34%)	47 (24%)	416 (32%)	1,374 (23%)
Good		11,046 (31%)	1,904 (39%)	311 (36%)	452 (36%)	64 (33%)	443 (34%)	2,273 (39%)
Fair		4,452 (13%)	874 (18%)	163 (19%)	107 (8.6%)	38 (20%)	222 (17%)	1,213 (21%)
Poor		1,563 (4.4%)	215 (4.4%)	70 (8.2%)	24 (1.9%)	12 (6.2%)	82 (6.2%)	246 (4.2%)
Income Category	50,000
Less than $15,000		1,646 (4.6%)	463 (9.4%)	85 (9.9%)	54 (4.3%)	18 (9.3%)	116 (8.8%)	709 (12%)
$15,000 to < $25,000		2,181 (6.1%)	531 (11%)	109 (13%)	63 (5.1%)	25 (13%)	115 (8.7%)	990 (17%)
$25,000 to < $35,000		2,600 (7.3%)	660 (13%)	132 (15%)	105 (8.4%)	25 (13%)	124 (9.4%)	1,008 (17%)
$35,000 to < $50,000		3,565 (10%)	773 (16%)	148 (17%)	123 (9.9%)	26 (13%)	147 (11%)	864 (15%)
$50,000 to < $100,000		10,959 (31%)	1,400 (28%)	231 (27%)	325 (26%)	52 (27%)	376 (28%)	1,194 (20%)
$100,000 to < $200,000		10,822 (30%)	828 (17%)	128 (15%)	320 (26%)	42 (22%)	322 (24%)	821 (14%)
More than $200,000		3,822 (11%)	267 (5.4%)	24 (2.8%)	254 (20%)	5 (2.6%)	120 (9.1%)	283 (4.8%)
Urban/Rural Status	50,000
Urban County		30,228 (85%)	4,646 (94%)	531 (62%)	1,216 (98%)	187 (97%)	1,168 (88%)	5,593 (95%)
Rural County		5,367 (15%)	276 (5.6%)	326 (38%)	28 (2.3%)	6 (3.1%)	152 (12%)	276 (4.7%)
Current Health Care Coverage	50,000
Employer/Union Insurance		21,297 (60%)	2,319 (47%)	276 (32%)	805 (65%)	91 (47%)	661 (50%)	2,062 (35%)
Private Nongovernmental Insurance		4,230 (12%)	418 (8.5%)	56 (6.5%)	134 (11%)	10 (5.2%)	119 (9.0%)	504 (8.6%)
Medicare		2,189 (6.1%)	445 (9.0%)	62 (7.2%)	46 (3.7%)	18 (9.3%)	86 (6.5%)	367 (6.3%)
Medigap		24 (<0.1%)	2 (<0.1%)	1 (0.1%)	0 (0%)	2 (1.0%)	0 (0%)	2 (<0.1%)
Medicaid		3,365 (9.5%)	972 (20%)	160 (19%)	95 (7.6%)	25 (13%)	220 (17%)	984 (17%)
Children's Health Insurance Program		13 (<0.1%)	3 (<0.1%)	1 (0.1%)	0 (0%)	0 (0%)	0 (0%)	3 (<0.1%)
Military Related Health Care		893 (2.5%)	140 (2.8%)	21 (2.5%)	23 (1.8%)	9 (4.7%)	40 (3.0%)	100 (1.7%)
Indian Health Service		11 (<0.1%)	1 (<0.1%)	172 (20%)	1 (<0.1%)	0 (0%)	37 (2.8%)	8 (0.1%)
State Sponsored Health Plan		1,265 (3.6%)	201 (4.1%)	27 (3.2%)	52 (4.2%)	23 (12%)	60 (4.5%)	438 (7.5%)
Other Governmental Program		920 (2.6%)	174 (3.5%)	27 (3.2%)	38 (3.1%)	8 (4.1%)	32 (2.4%)	230 (3.9%)
No Coverage of Any Type		1,388 (3.9%)	247 (5.0%)	54 (6.3%)	50 (4.0%)	7 (3.6%)	65 (4.9%)	1,171 (20%)
Personal Health Care Provider	50,000
No		2,323 (6.5%)	380 (7.7%)	113 (13%)	166 (13%)	22 (11%)	127 (9.6%)	1,310 (22%)
Yes only one		18,861 (53%)	2,497 (51%)	451 (53%)	689 (55%)	105 (54%)	639 (48%)	2,876 (49%)
More than one		14,411 (40%)	2,045 (42%)	293 (34%)	389 (31%)	66 (34%)	554 (42%)	1,683 (29%)
Could not afford to see Doctor (past 12 months)	50,000
No		31,548 (89%)	4,170 (85%)	717 (84%)	1,117 (90%)	159 (82%)	1,086 (82%)	4,539 (77%)
Yes		4,047 (11%)	752 (15%)	140 (16%)	127 (10%)	34 (18%)	234 (18%)	1,330 (23%)
Time since last routine checkup	50,000
Within past year		30,309 (85%)	4,411 (90%)	719 (84%)	1,030 (83%)	164 (85%)	1,074 (81%)	4,736 (81%)
Never		81 (0.2%)	4 (<0.1%)	1 (0.1%)	10 (0.8%)	1 (0.5%)	1 (<0.1%)	78 (1.3%)
Within past 2 years		3,708 (10%)	422 (8.6%)	103 (12%)	169 (14%)	21 (11%)	175 (13%)	836 (14%)
Within Past 5 years		0 (0%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)
5 or more years ago		1,295 (3.6%)	54 (1.1%)	30 (3.5%)	28 (2.3%)	6 (3.1%)	61 (4.6%)	170 (2.9%)
Don't Know/Not Sure		202 (0.6%)	31 (0.6%)	4 (0.5%)	7 (0.6%)	1 (0.5%)	9 (0.7%)	49 (0.8%)
1Analytic sample includes 50,000 complete-case observations. Missing values were excluded listwise for all variables included in this table.

Cervical Screening Distribution and Percentages

# Table of Cervical Screening Distribution
brfss_analytic |>  
  select(cervicalscr_fac) |>  
  tbl_summary(
    label = list(
      cervicalscr_fac ~ "Every Had a Cervical Cancer Screening"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Cervical Cancer Screening Distribution and Percentages")

Unweighted Cervical Cancer Screening Distribution and Percentages
Characteristic	N	N = 50,000¹
Every Had a Cervical Cancer Screening	50,000
Not screened		5,875 (12%)
Screened		44,125 (88%)
¹ n (%)

Race Distribution and Percentages

# Table of Race Distribution
brfss_analytic |>  
  select(race_cat) |>  
  tbl_summary(
    label = list(
      race_cat ~ "Race Category"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Race Category Distribution and Percentages")

Unweighted Race Category Distribution and Percentages
Characteristic	N	N = 50,000¹
Race Category	50,000
White		35,595 (71%)
Black		4,922 (9.8%)
American Indian or Alaska Native		857 (1.7%)
Asian		1,244 (2.5%)
Native Hawaiian or other Pacific Islander		193 (0.4%)
Multiracial		1,320 (2.6%)
Hispanic		5,869 (12%)
¹ n (%)

Age Distribution and Percentages

# Table of Age Distribution
brfss_analytic |>  
  select(age) |>  
  tbl_summary(
    label = list(
      age ~ "Age Category"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Age Distribution and Percentages")

Unweighted Age Distribution and Percentages
Characteristic	N	N = 50,000¹
Age Category	50,000
25-29		4,021 (8.0%)
30-34		4,842 (9.7%)
35-39		5,756 (12%)
40-44		6,288 (13%)
45-49		6,122 (12%)
50-54		6,662 (13%)
55-59		7,418 (15%)
60-64		8,891 (18%)
¹ n (%)

Level of Education Completed Distribution and Percentages

# Table of Level of Education Completed Distribution
brfss_analytic |>  
  select(education) |>  
  tbl_summary(
    label = list(
      education ~ "Level of Education Completed"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Level of Education Completed Distribution and Percentages")

Unweighted Level of Education Completed Distribution and Percentages
Characteristic	N	N = 50,000¹
Level of Education Completed	50,000
Less than high school		2,376 (4.8%)
High school graduate		9,453 (19%)
Some college		13,161 (26%)
College graduate		25,010 (50%)
¹ n (%)

Current Employment Status Distribution and Percentages

# Table of Current Employment Status Distribution
brfss_analytic |>  
  select(employ1_cat) |>  
  tbl_summary(
    label = list(
      employ1_cat ~ "Current Employment Status" 
    ), 
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Current Employment Status Distribution and Percentages")

Unweighted Current Employment Status Distribution and Percentages
Characteristic	N	N = 50,000¹
Current Employment Status	50,000
Employed for wages		30,980 (62%)
Self-employed		4,196 (8.4%)
Out of work for 1 year or more		1,220 (2.4%)
Out of work for less than 1 year		1,445 (2.9%)
A homemaker		4,023 (8.0%)
A student		675 (1.4%)
Retired		3,158 (6.3%)
Unable to work		4,303 (8.6%)
¹ n (%)

General Health Status Distribution and Percentages

# Table of General Health Status Distribution
brfss_analytic |>  
  select(gen_health) |>  
  tbl_summary(
    label = list(
      gen_health ~ "General Health Status"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted General Health Status Distribution and Percentages")

Unweighted General Health Status Distribution and Percentages
Characteristic	N	N = 50,000¹
General Health Status	50,000
Excellent		7,184 (14%)
Very Good		17,042 (34%)
Good		16,493 (33%)
Fair		7,069 (14%)
Poor		2,212 (4.4%)
¹ n (%)

Income Category Distribution and Percentages

# Tabke of Income Category Distribution
brfss_analytic |>  
  select(income) |>  
  tbl_summary(
    label = list(
      income ~ "Income Category"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Income Category Distribution and Percentages")

Unweighted Income Category Distribution and Percentages
Characteristic	N	N = 50,000¹
Income Category	50,000
Less than $15,000		3,091 (6.2%)
$15,000 to < $25,000		4,014 (8.0%)
$25,000 to < $35,000		4,654 (9.3%)
$35,000 to < $50,000		5,646 (11%)
$50,000 to < $100,000		14,537 (29%)
$100,000 to < $200,000		13,283 (27%)
More than $200,000		4,775 (9.6%)
¹ n (%)

Urban Rural Status Distribution and Percentages

# Table of Ubran/Rural Status Distribution
brfss_analytic |>  
  select(urbstat1) |>  
  tbl_summary(
    label = list(
      urbstat1 ~ "Urban/Rural Status"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Urban Rural Status Distribution and Percentages")

Unweighted Urban Rural Status Distribution and Percentages
Characteristic	N	N = 50,000¹
Urban/Rural Status	50,000
Urban County		43,569 (87%)
Rural County		6,431 (13%)
¹ n (%)

Current Health Care Coverage Distribution and Percentages

# Table of Current Health Care Coverage Distribution
brfss_analytic |>  
  select(priminsur) |>  
  tbl_summary(
    label = list(
      priminsur ~ "Current Health Care Coverage"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Current Health Care Coverage Distribution and Percentages")

Unweighted Current Health Care Coverage Distribution and Percentages
Characteristic	N	N = 50,000¹
Current Health Care Coverage	50,000
Employer/Union Insurance		27,511 (55%)
Private Nongovernmental Insurance		5,471 (11%)
Medicare		3,213 (6.4%)
Medigap		31 (<0.1%)
Medicaid		5,821 (12%)
Children's Health Insurance Program		20 (<0.1%)
Military Related Health Care		1,226 (2.5%)
Indian Health Service		230 (0.5%)
State Sponsored Health Plan		2,066 (4.1%)
Other Governmental Program		1,429 (2.9%)
No Coverage of Any Type		2,982 (6.0%)
¹ n (%)

Personal Health Care Provider Distribution and Percentages

# Table of Personal Health Care Provider Distribution
  brfss_analytic |>  
  select(persdoc) |>  
  tbl_summary(
    label = list(
      persdoc ~ "Personal Health Care Provider"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Personal Health Care Provider Distribution and Percentages")

Unweighted Personal Health Care Provider Distribution and Percentages
Characteristic	N	N = 50,000¹
Personal Health Care Provider	50,000
No		4,441 (8.9%)
Yes only one		26,118 (52%)
More than one		19,441 (39%)
¹ n (%)

Could not afford to see a Doctor Distribution and Percentages

# Table of Could not afford to see Doctor Distribution  
 brfss_analytic |>  
  select(medcost) |>  
  tbl_summary(
    type = list(medcost ~ "categorical"),
    label = list(
      medcost ~ "Could not afford to see Doctor (past 12 months)"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Could not afford to see a Doctor Distribution and Percentages")

Unweighted Could not afford to see a Doctor Distribution and Percentages
Characteristic	N	N = 50,000¹
Could not afford to see Doctor (past 12 months)	50,000
No		43,336 (87%)
Yes		6,664 (13%)
¹ n (%)

Time since last routine checkup Distribution and Percentages

# Table of Time since last routine checkup Distribution
brfss_analytic |>  
  select(checkup) |>  
  tbl_summary(
    label = list(
      checkup ~ "Time since last routine checkup"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    missing = "no"
  ) |> 
  add_n() |> 
  bold_labels() |> 
  modify_caption("Unweighted Time since last routine checkup Distribution and Percentages")

Unweighted Time since last routine checkup Distribution and Percentages
Characteristic	N	N = 50,000¹
Time since last routine checkup	50,000
Within past year		42,443 (85%)
Never		176 (0.4%)
Within past 2 years		5,434 (11%)
Within Past 5 years		0 (0%)
5 or more years ago		1,644 (3.3%)
Don't Know/Not Sure		303 (0.6%)
¹ n (%)

Distribution of Cervical Cancer Screening

# Create a bar plot for cervical cancer screening count
ggplot(brfss_analytic, aes(x = cervicalscr_fac)) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5) +
  labs(title = "Bar Graph of Cervical Cancer Screening with Counts",
       x = "Cervical Cancer Screening",
       y = "Count") +
  theme_minimal()

Association Between Cervical Cancer Screening by Race/Ethnicity

# Create bar plot demonstrating the proportion of cervical cancer screening by race/ethnicity 
brfss_analytic |> 
  group_by(race_cat) |> 
  summarize(prop_screened = mean(cervicalscr_fac == "Screened")) |> 
  ggplot(aes(x = race_cat, y = prop_screened)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "Race/Ethnicity",
    y = "Proportion Screened",
    title = "Proportion of Cervical Cancer Screening by Race/Ethnicity"
  ) +
  theme_minimal()

Cervical Cancer Screening Distribution by Race/Ethnicity

# Create split bar plot for race/ethnicity
ggplot(brfss_analytic, aes(x = race_cat, fill = cervicalscr_fac)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Race/Ethnicity",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Race/Ethnicity"
  ) +
  theme_minimal()

Weighted Screening Proportion by Race and Ethnicity with Confidence Intervals

# Create race levels
race_levels <- unique(brfss_analytic$race_cat)

# Create table with race levels
tbl <- map_df(race_levels, function(r) {
  
  # subset design for this race group
  d_sub <- subset(brfss_design, race_cat == r)
  
  # compute survey-weighted proportion + CI
  est <- svyciprop(~I(cervicalscr_fac == "Screened"),
                   design = d_sub,
                   method = "logit")
  
  # return a clean row
  tibble(
    race_cat = r,
    prop  = as.numeric(coef(est)),
    lower = as.numeric(confint(est)[1]),
    upper = as.numeric(confint(est)[2])
  )
})

# Create the plot with confidence intervals 
ggplot(tbl, aes(x = race_cat, y = prop)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
  labs(
    x = "Race/Ethnicity",
    y = "Weighted Proportion Screened",
    title = "Survey-Weighted Screening Proportions with 95% Confidence Intervals"
  ) +
  theme_minimal()

Split Bar Plot for Education Levels

# Create split bar plot for education levels 
ggplot(brfss_analytic, aes(x = education, fill = cervicalscr_fac)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Level of Education",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Level of Education"
  ) +
  theme_minimal()

Split Bar Plot for Employment Status

# Create split bar plot for employment status
ggplot(brfss_analytic, aes(x = employ1_cat, fill = cervicalscr_fac)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Employment Status",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Employment Status"
  ) +
  theme_minimal()

Split Bar Plot for Personal Doctor Status

# Create split bar plot for personal doctor status
ggplot(brfss_analytic, aes(x = persdoc, fill = cervicalscr_fac)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Personal Doctor Status",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Personal Doctor Status"
  ) +
  theme_minimal()

Split Bar Plot for Age Categories

# Create split bar plot for age categories 
ggplot(brfss_analytic, aes(x = age, fill = cervicalscr_fac)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Age Category (Years)",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Age Category"
  ) +
  theme_minimal()

brfss_analytic <- brfss_analytic |>
  mutate(
    across(
      c(race_cat, employ1_cat, gen_health, income, urbstat1,
        priminsur, persdoc, medcost, checkup),
      droplevels
    )
  )

Section 2: Regression Model Specification

Model 1: Unadjusted Cervical Cancer Screening Regressed on Race Categories

# Create model limited that only includes exposure and outcome
model_limited <- svyglm(cervicalscr_bin ~ race_cat, data = brfss_analytic, design = brfss_design, family = "quasibinomial")

# Create a nice table
tidy(model_limited, conf.int = FALSE) |> 
  mutate(
    term = dplyr::recode(term,
      "(Intercept)" = "Intercept",
      "race_cat" = "Racial Identity"),
    across(where(is.numeric), ~ round(., 4))
  ) |> 
  kable(
    caption   = "Survey Weighted Quasibionomial Regression: Cervical Cancer Screening by Race Category (BRFSS 2024, n = 50,000)",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value"),
    format = "html"
  ) |> 
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = TRUE) |> 
  row_spec(0, bold = TRUE)

Survey Weighted Quasibionomial Regression: Cervical Cancer Screening by Race Category (BRFSS 2024, n = 50,000)
Term	Estimate	Std. Error	t-statistic	p-value
Intercept	2.2950	0.0358	64.1482	0.0000
race_catBlack	-0.7992	0.0870	-9.1877	0.0000
race_catAmerican Indian or Alaska Native	-1.1544	0.2344	-4.9252	0.0000
race_catAsian	-1.1270	0.1429	-7.8873	0.0000
race_catNative Hawaiian or other Pacific Islander	-0.5332	0.2697	-1.9772	0.0480
race_catMultiracial	-0.0538	0.1636	-0.3287	0.7424
race_catHispanic	-0.8943	0.0802	-11.1502	0.0000

Model Full: Adjusted Cervical Cancer Screening Regressed on Race Categories plus all covariates

# Create model full that includes exposure and outcome and all covariates
model <- svyglm(cervicalscr_bin ~ race_cat + age + education  + employ1_cat + gen_health + income + urbstat1 +  priminsur + persdoc + medcost + checkup, data = brfss_analytic, design = brfss_design, family = "quasibinomial")

# Create a nice table
tidy(model, conf.int = FALSE) |> 
  mutate(
    term = dplyr::recode(term,
      "(Intercept)" = "Intercept",
      "race_cat" = "Racial Identity", 
      "age" = "Age Category",
      "education" = "Level of Education Completed",
      "employ1_cat" = "Current Employment Status",
      "gen_health" = "General Health Status",
      "income" = "Income Category",
      "urbstat1" = "Urban/Rural Status",
      "priminsur" = "Current Health Care Coverage",
      "persdoc" = "Personal Health Care Provider",
      "medcost" = "Could not afford to see Doctor (past 12 months)",
      "checkup" = "Time since last routine checkup"
    ),
    across(where(is.numeric), ~ round(., 4))
  ) |> 
  kable(
    caption   = "Survey Weighted Quasibinomial Regression: Cervical Cancer Screening and all other covariates (BRFSS 2024, n = 50,000)",
    col.names = c("Term", "Estimate", "Std. Error", "t-statistic", "p-value"),
    format = "html"
  ) |> 
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = TRUE) |> 
  row_spec(0, bold = TRUE)

Survey Weighted Quasibinomial Regression: Cervical Cancer Screening and all other covariates (BRFSS 2024, n = 50,000)
Term	Estimate	Std. Error	t-statistic	p-value
Intercept	0.0548	0.2245	0.2439	0.8073
race_catBlack	-0.6425	0.0942	-6.8226	0.0000
race_catAmerican Indian or Alaska Native	-0.8781	0.3047	-2.8816	0.0040
race_catAsian	-1.3482	0.1399	-9.6334	0.0000
race_catNative Hawaiian or other Pacific Islander	-0.3514	0.2874	-1.2227	0.2214
race_catMultiracial	0.1172	0.1787	0.6559	0.5119
race_catHispanic	-0.2689	0.0843	-3.1907	0.0014
age30-34	0.5051	0.1214	4.1588	0.0000
age35-39	0.7203	0.1205	5.9787	0.0000
age40-44	0.6609	0.1244	5.3113	0.0000
age45-49	0.8489	0.1411	6.0168	0.0000
age50-54	0.6301	0.1302	4.8402	0.0000
age55-59	0.5513	0.1201	4.5896	0.0000
age60-64	0.5678	0.1184	4.7967	0.0000
educationHigh school graduate	0.5656	0.1145	4.9411	0.0000
educationSome college	0.8940	0.1140	7.8446	0.0000
educationCollege graduate	1.1093	0.1254	8.8459	0.0000
employ1_catSelf-employed	-0.2505	0.1459	-1.7176	0.0859
employ1_catOut of work for 1 year or more	-0.2821	0.1485	-1.8996	0.0575
employ1_catOut of work for less than 1 year	0.1284	0.1813	0.7082	0.4788
employ1_catA homemaker	0.1152	0.1201	0.9597	0.3372
employ1_catA student	0.1192	0.2109	0.5655	0.5717
employ1_catRetired	-0.2669	0.1492	-1.7891	0.0736
employ1_catUnable to work	0.0114	0.1215	0.0936	0.9254
gen_healthVery Good	0.3111	0.1060	2.9362	0.0033
gen_healthGood	-0.0273	0.1071	-0.2548	0.7989
gen_healthFair	0.1099	0.1172	0.9371	0.3487
gen_healthPoor	-0.0134	0.1585	-0.0845	0.9327
income$15,000 to < $25,000	0.1702	0.1334	1.2754	0.2022
income$25,000 to < $35,000	0.1551	0.1262	1.2293	0.2190
income$35,000 to < $50,000	0.1792	0.1316	1.3620	0.1732
income$50,000 to < $100,000	0.4920	0.1326	3.7102	0.0002
income$100,000 to < $200,000	0.8430	0.1516	5.5606	0.0000
incomeMore than $200,000	1.2860	0.2046	6.2850	0.0000
urbstat1Rural County	0.0061	0.0898	0.0679	0.9459
priminsurPrivate Nongovernmental Insurance	-0.0942	0.1268	-0.7431	0.4574
priminsurMedicare	-0.2645	0.1310	-2.0198	0.0434
priminsurMedigap	-0.4408	0.7095	-0.6213	0.5344
priminsurMedicaid	-0.0206	0.1186	-0.1740	0.8619
priminsurChildren’s Health Insurance Program	1.3137	1.5271	0.8602	0.3897
priminsurMilitary Related Health Care	-0.1448	0.2268	-0.6382	0.5233
priminsurIndian Health Service	-0.0419	0.4286	-0.0978	0.9221
priminsurState Sponsored Health Plan	0.1611	0.1483	1.0868	0.2771
priminsurOther Governmental Program	-0.1239	0.1697	-0.7300	0.4654
priminsurNo Coverage of Any Type	-0.1517	0.1409	-1.0763	0.2818
persdocYes only one	0.3172	0.0984	3.2242	0.0013
persdocMore than one	0.4993	0.1102	4.5304	0.0000
medcostYes	0.0386	0.0844	0.4569	0.6477
checkupNever	-0.7603	0.2753	-2.7617	0.0058
checkupWithin past 2 years	-0.1598	0.0935	-1.7079	0.0877
checkup5 or more years ago	-0.4645	0.1458	-3.1858	0.0014
checkupDon’t Know/Not Sure	-1.4106	0.3703	-3.8094	0.0001

Section 3: Regression Results

Model Limited: Unadjusted Cervical Cancer Screening Regressed on Race Categories Odds Ratios, 95% Confidence Interval, and P-Values

# Extract estimates and make OR and 95% CI
tidy_ci_lim <- tidy(model_limited, conf.int = FALSE, exponentiate = TRUE) |> 
  mutate(
    conf.low = exp(estimate - 1.96 * std.error),
    conf.high = exp(estimate + 1.96 * std.error)
  )

# Recode it and put it into a tidy table
tidy_table <- tidy_ci_lim |> 
  mutate(
    term = dplyr::recode(term,
      "(Intercept)" = "Intercept",
      "race_cat" = "Racial Identity"
    ),
    estimate = round(estimate, 4),
    conf.low  = round(conf.low, 4),
    conf.high = round(conf.high, 4),
    p.value   = round(p.value, 4)
  ) |> 
  select(
    term,
    estimate,
    conf.low,
    conf.high,
    p.value
  )

tidy_table |> 
  kable(
    caption = "Table 2. Survey-Weighted Quasibnomial Regression Results: Cervical Cancer Screening by Race Category (BRFSS 2024, n = 50,000)",
    col.names = c("Term", "Odds Ratio", "95% CI Lower", "95% CI Upper", "p-value"),
    format = "html"
  ) |> 
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = TRUE
  ) |> 
  row_spec(0, bold = TRUE)

Table 2. Survey-Weighted Quasibnomial Regression Results: Cervical Cancer Screening by Race Category (BRFSS 2024, n = 50,000)
Term	Odds Ratio	95% CI Lower	95% CI Upper	p-value
Intercept	9.9245	19040.6602	21907.3190	0.0000
race_catBlack	0.4497	1.3221	1.8593	0.0000
race_catAmerican Indian or Alaska Native	0.3153	0.8658	2.1698	0.0000
race_catAsian	0.3240	1.0449	1.8295	0.0000
race_catNative Hawaiian or other Pacific Islander	0.5868	1.0600	3.0504	0.0480
race_catMultiracial	0.9477	1.8721	3.5546	0.7424
race_catHispanic	0.4089	1.2862	1.7614	0.0000

# Create a publication ready table
model_limited |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(race_cat ~ "Racial Identity")
  ) |>
  modify_caption(
    "**Table 2. Survey-Weighted Quasibinomial Regression Results: Cervical Cancer Screening by Race Category (BRFSS 2024, n = 50,000)**"
  ) |>
  bold_labels() |>
  bold_p()

**Table 2. Survey-Weighted Quasibinomial Regression Results: Cervical Cancer Screening by Race Category (BRFSS 2024, n = 50,000)**
Characteristic	OR	95% CI	p-value
Racial Identity
White	—	—
Black	0.45	0.38, 0.53	<0.001
American Indian or Alaska Native	0.32	0.20, 0.50	<0.001
Asian	0.32	0.24, 0.43	<0.001
Native Hawaiian or other Pacific Islander	0.59	0.35, 1.00	0.048
Multiracial	0.95	0.69, 1.31	0.7
Hispanic	0.41	0.35, 0.48	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Model Full: Adjusted Cervical Cancer Screening Regressed on Race Categories plus all covariated Odds Ratios, 95% Confidence Interval, and P-Values

# Extract estimates and make OR and 95% CI
tidy_ci <- tidy(model, conf.int = FALSE, exponentiate = TRUE) |> 
  mutate(
    conf.low = exp(estimate - 1.96 * std.error),
    conf.high = exp(estimate + 1.96 * std.error)
  )

# Recode it and put it into a tidy table
tidy_table <- tidy_ci |> 
  mutate(
    term = dplyr::recode(term,
      "(Intercept)" = "Intercept",
      "race_cat" = "Racial Identity",
      "age" = "Age Category",
      "education" = "Level of Education Completed",
      "employ1_cat" = "Current Employment Status",
      "gen_health" = "General Health Status",
      "income" = "Income Category",
      "urbstat1" = "Urban/Rural Status",
      "priminsur" = "Current Health Care Coverage",
      "persdoc" = "Personal Health Care Provider",
      "medcost" = "Could not afford to see Doctor (past 12 months)",
      "checkup" = "Time since last routine checkup"
    ),
    estimate        = round(estimate, 4),
    conf.low  = round(conf.low, 4),
    conf.high = round(conf.high, 4),
    p.value   = round(p.value, 4)
  ) |> 
  select(
    term,
    estimate,
    conf.low,
    conf.high,
    p.value
  )

tidy_table |> 
  kable(
    caption = "Table 3. Survey-Weighted Quasibinomial Regression Results: Cervical Cancer Screening and all other covariates (BRFSS 2024, n = 50,000)",
    col.names = c("Term", "Odds Ratio", "95% CI Lower", "95% CI Upper", "p-value"),
    format = "html"
  ) |> 
  kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = TRUE
  ) |> 
  row_spec(0, bold = TRUE)

Table 3. Survey-Weighted Quasibinomial Regression Results: Cervical Cancer Screening and all other covariates (BRFSS 2024, n = 50,000)
Term	Odds Ratio	95% CI Lower	95% CI Upper	p-value
Intercept	1.0563	1.8519	4.4654	0.8073
race_catBlack	0.5260	1.4069	2.0351	0.0000
race_catAmerican Indian or Alaska Native	0.4156	0.8339	2.7534	0.0040
race_catAsian	0.2597	0.9855	1.7058	0.0000
race_catNative Hawaiian or other Pacific Islander	0.7037	1.1508	3.5501	0.2214
race_catMultiracial	1.1243	2.1687	4.3689	0.5119
race_catHispanic	0.7642	1.8204	2.5330	0.0014
age30-34	1.6571	4.1332	6.6532	0.0000
age35-39	2.0551	6.1656	9.8877	0.0000
age40-44	1.9366	5.4341	8.8507	0.0000
age45-49	2.3371	7.8505	13.6489	0.0000
age50-54	1.8778	5.0667	8.4403	0.0000
age55-59	1.7355	4.4818	7.1770	0.0000
age60-64	1.7643	4.6289	7.3618	0.0000
educationHigh school graduate	1.7604	4.6464	7.2774	0.0000
educationSome college	2.4449	9.2218	14.4157	0.0000
educationCollege graduate	3.0323	16.2240	26.5248	0.0000
employ1_catSelf-employed	0.7784	1.6365	2.8987	0.0859
employ1_catOut of work for 1 year or more	0.7542	1.5890	2.8442	0.0575
employ1_catOut of work for less than 1 year	1.1370	2.1850	4.4481	0.4788
employ1_catA homemaker	1.1221	2.4273	3.8864	0.3372
employ1_catA student	1.1266	2.0408	4.6641	0.5717
employ1_catRetired	0.7658	1.6054	2.8810	0.0736
employ1_catUnable to work	1.0114	2.1669	3.4888	0.9254
gen_healthVery Good	1.3650	3.1812	4.8193	0.0033
gen_healthGood	0.9731	2.1451	3.2641	0.7989
gen_healthFair	1.1161	2.4262	3.8417	0.3487
gen_healthPoor	0.9867	1.9663	3.6593	0.9327
income$15,000 to < $25,000	1.1855	2.5193	4.2506	0.2022
income$25,000 to < $35,000	1.1678	2.5105	4.1169	0.2190
income$35,000 to < $50,000	1.1963	2.5558	4.2809	0.1732
income$50,000 to < $100,000	1.6355	3.9576	6.6554	0.0002
income$100,000 to < $200,000	2.3233	7.5849	13.7415	0.0000
incomeMore than $200,000	3.6183	24.9585	55.6617	0.0000
urbstat1Rural County	1.0061	2.2934	3.2616	0.9459
priminsurPrivate Nongovernmental Insurance	0.9101	1.9378	3.1855	0.4574
priminsurMedicare	0.7676	1.6667	2.7850	0.0434
priminsurMedigap	0.6435	0.4738	7.6453	0.5344
priminsurMedicaid	0.9796	2.1110	3.3603	0.8619
priminsurChildren’s Health Insurance Program	3.7199	2.0682	823.0803	0.3897
priminsurMilitary Related Health Care	0.8652	1.5230	3.7054	0.5233
priminsurIndian Health Service	0.9589	1.1261	6.0442	0.9221
priminsurState Sponsored Health Plan	1.1748	2.4212	4.3293	0.2771
priminsurOther Governmental Program	0.8835	1.7348	3.3740	0.4654
priminsurNo Coverage of Any Type	0.8593	1.7915	3.1127	0.2818
persdocYes only one	1.3733	3.2558	4.7880	0.0013
persdocMore than one	1.6475	4.1850	6.4464	0.0000
medcostYes	1.0393	2.3963	3.3357	0.6477
checkupNever	0.4675	0.9305	2.7377	0.0058
checkupWithin past 2 years	0.8523	1.9523	2.8171	0.0877
checkup5 or more years ago	0.6285	1.4088	2.4948	0.0014
checkupDon’t Know/Not Sure	0.2440	0.6177	2.6373	0.0001

# Create a publication ready table
model |>
  tbl_regression(
    exponentiate = TRUE,
    label = list(
      race_cat ~ "Racial Identity",
      age ~ "Age Category",
      education ~ "Level of Education Completed",
      employ1_cat ~ "Current Employment Status",
      gen_health ~ "General Health Status",
      income ~ "Income Category",
      urbstat1 ~ "Urban/Rural Status",
      priminsur ~ "Current Health Care Coverage",
      persdoc ~ "Personal Health Care Provider",
      medcost ~ "Could not afford to see Doctor (past 12 months)",
      checkup ~ "Time since last routine checkup"
    )
  ) |>
  modify_caption(
    "**Table 3. Survey-Weighted Quasibinomial Regression Results: Cervical Cancer Screening and All Covariates (BRFSS 2024, n = 50,000)**"
  ) |>
  bold_labels() |>
  bold_p()

**Table 3. Survey-Weighted Quasibinomial Regression Results: Cervical Cancer Screening and All Covariates (BRFSS 2024, n = 50,000)**
Characteristic	OR	95% CI	p-value
Racial Identity
White	—	—
Black	0.53	0.44, 0.63	<0.001
American Indian or Alaska Native	0.42	0.23, 0.76	0.004
Asian	0.26	0.20, 0.34	<0.001
Native Hawaiian or other Pacific Islander	0.70	0.40, 1.24	0.2
Multiracial	1.12	0.79, 1.60	0.5
Hispanic	0.76	0.65, 0.90	0.001
Age Category
25-29	—	—
30-34	1.66	1.31, 2.10	<0.001
35-39	2.06	1.62, 2.60	<0.001
40-44	1.94	1.52, 2.47	<0.001
45-49	2.34	1.77, 3.08	<0.001
50-54	1.88	1.45, 2.42	<0.001
55-59	1.74	1.37, 2.20	<0.001
60-64	1.76	1.40, 2.23	<0.001
Level of Education Completed
Less than high school	—	—
High school graduate	1.76	1.41, 2.20	<0.001
Some college	2.44	1.96, 3.06	<0.001
College graduate	3.03	2.37, 3.88	<0.001
Current Employment Status
Employed for wages	—	—
Self-employed	0.78	0.58, 1.04	0.086
Out of work for 1 year or more	0.75	0.56, 1.01	0.057
Out of work for less than 1 year	1.14	0.80, 1.62	0.5
A homemaker	1.12	0.89, 1.42	0.3
A student	1.13	0.75, 1.70	0.6
Retired	0.77	0.57, 1.03	0.074
Unable to work	1.01	0.80, 1.28	>0.9
General Health Status
Excellent	—	—
Very Good	1.36	1.11, 1.68	0.003
Good	0.97	0.79, 1.20	0.8
Fair	1.12	0.89, 1.40	0.3
Poor	0.99	0.72, 1.35	>0.9
Income Category
Less than $15,000	—	—
$15,000 to < $25,000	1.19	0.91, 1.54	0.2
$25,000 to < $35,000	1.17	0.91, 1.50	0.2
$35,000 to < $50,000	1.20	0.92, 1.55	0.2
$50,000 to < $100,000	1.64	1.26, 2.12	<0.001
$100,000 to < $200,000	2.32	1.73, 3.13	<0.001
More than $200,000	3.62	2.42, 5.40	<0.001
Urban/Rural Status
Urban County	—	—
Rural County	1.01	0.84, 1.20	>0.9
Current Health Care Coverage
Employer/Union Insurance	—	—
Private Nongovernmental Insurance	0.91	0.71, 1.17	0.5
Medicare	0.77	0.59, 0.99	0.043
Medigap	0.64	0.16, 2.59	0.5
Medicaid	0.98	0.78, 1.24	0.9
Children's Health Insurance Program	3.72	0.19, 74.2	0.4
Military Related Health Care	0.87	0.55, 1.35	0.5
Indian Health Service	0.96	0.41, 2.22	>0.9
State Sponsored Health Plan	1.17	0.88, 1.57	0.3
Other Governmental Program	0.88	0.63, 1.23	0.5
No Coverage of Any Type	0.86	0.65, 1.13	0.3
Personal Health Care Provider
No	—	—
Yes only one	1.37	1.13, 1.67	0.001
More than one	1.65	1.33, 2.04	<0.001
Could not afford to see Doctor (past 12 months)
No	—	—
Yes	1.04	0.88, 1.23	0.6
Time since last routine checkup
Within past year	—	—
Never	0.47	0.27, 0.80	0.006
Within past 2 years	0.85	0.71, 1.02	0.088
5 or more years ago	0.63	0.47, 0.84	0.001
Don't Know/Not Sure	0.24	0.12, 0.50	<0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

Which Coefficient Moves the Race Category the Most

# Extract all race coefficients (excluding intercept)
race_coefs0 <- coef(model_limited)[grep("^race_cat", names(coef(model_limited)))]

# Select covariates to test
covars <- c("age", "education", "employ1_cat", "gen_health",
            "income", "urbstat1", "priminsur", "persdoc", "medcost", "checkup")

# Create a loop to test each covariate
results_or <- map_dfr(covars, function(v) {
  
  f <- as.formula(paste("cervicalscr_bin ~ race_cat +", v))
  
  m <- svyglm(f, design = brfss_design, family = quasibinomial())
  
  race_adj <- coef(m)[grep("^race_cat", names(coef(m)))]
  race_unadj <- race_coefs0[names(race_adj)]
  
  OR_unadj <- exp(race_unadj)
  OR_adj <- exp(race_adj)
  
  tibble(
    covariate = v,
    race_level = str_remove(names(race_adj), "race_cat"),
    Odds_ratio_unadjusted = OR_unadj,
    Odds_ratio_adjusted = OR_adj,
    abs_change = abs(OR_adj - OR_unadj),
    percent_change = 100 * (OR_adj - OR_unadj) / OR_unadj
  )
})

results_or |>
  mutate(across(where(is.numeric), ~ round(.x, 3))) |>
  arrange(race_level, desc(abs_change)) |>
  knitr::kable(
    caption = "Survey-Weighted Change in Race Odds Ratios After Adding Each Covariate"
  ) |>
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover"),
    full_width = FALSE
  )

Survey-Weighted Change in Race Odds Ratios After Adding Each Covariate
covariate	race_level	Odds_ratio_unadjusted	Odds_ratio_adjusted	abs_change	percent_change
income	American Indian or Alaska Native	0.315	0.454	0.139	43.981
education	American Indian or Alaska Native	0.315	0.370	0.055	17.392
priminsur	American Indian or Alaska Native	0.315	0.369	0.053	16.967
gen_health	American Indian or Alaska Native	0.315	0.340	0.024	7.726
persdoc	American Indian or Alaska Native	0.315	0.339	0.024	7.577
employ1_cat	American Indian or Alaska Native	0.315	0.338	0.023	7.137
age	American Indian or Alaska Native	0.315	0.303	0.012	-3.795
medcost	American Indian or Alaska Native	0.315	0.326	0.011	3.398
checkup	American Indian or Alaska Native	0.315	0.306	0.009	-2.995
urbstat1	American Indian or Alaska Native	0.315	0.318	0.003	0.875
education	Asian	0.324	0.245	0.079	-24.249
income	Asian	0.324	0.265	0.059	-18.348
priminsur	Asian	0.324	0.296	0.028	-8.664
persdoc	Asian	0.324	0.346	0.022	6.710
employ1_cat	Asian	0.324	0.310	0.014	-4.253
urbstat1	Asian	0.324	0.317	0.007	-2.183
medcost	Asian	0.324	0.317	0.007	-2.176
gen_health	Asian	0.324	0.318	0.006	-1.957
checkup	Asian	0.324	0.328	0.004	1.370
age	Asian	0.324	0.326	0.002	0.712
income	Black	0.450	0.573	0.124	27.487
priminsur	Black	0.450	0.505	0.056	12.400
gen_health	Black	0.450	0.475	0.025	5.531
checkup	Black	0.450	0.426	0.024	-5.235
education	Black	0.450	0.469	0.019	4.199
persdoc	Black	0.450	0.443	0.007	-1.452
employ1_cat	Black	0.450	0.456	0.006	1.368
urbstat1	Black	0.450	0.443	0.006	-1.408
medcost	Black	0.450	0.455	0.005	1.136
age	Black	0.450	0.446	0.004	-0.922
education	Hispanic	0.409	0.634	0.225	55.045
income	Hispanic	0.409	0.591	0.182	44.430
priminsur	Hispanic	0.409	0.530	0.121	29.519
persdoc	Hispanic	0.409	0.472	0.063	15.410
gen_health	Hispanic	0.409	0.442	0.033	8.080
employ1_cat	Hispanic	0.409	0.428	0.019	4.567
medcost	Hispanic	0.409	0.425	0.016	3.918
urbstat1	Hispanic	0.409	0.401	0.008	-1.840
age	Hispanic	0.409	0.405	0.004	-1.010
checkup	Hispanic	0.409	0.410	0.002	0.380
income	Multiracial	0.948	1.093	0.145	15.317
priminsur	Multiracial	0.948	1.038	0.090	9.498
education	Multiracial	0.948	1.004	0.057	5.979
persdoc	Multiracial	0.948	0.984	0.036	3.852
checkup	Multiracial	0.948	0.975	0.027	2.885
employ1_cat	Multiracial	0.948	0.973	0.025	2.683
gen_health	Multiracial	0.948	0.973	0.025	2.648
medcost	Multiracial	0.948	0.970	0.022	2.341
age	Multiracial	0.948	0.966	0.019	1.973
urbstat1	Multiracial	0.948	0.941	0.006	-0.683
income	Native Hawaiian or other Pacific Islander	0.587	0.708	0.121	20.585
education	Native Hawaiian or other Pacific Islander	0.587	0.686	0.100	16.996
priminsur	Native Hawaiian or other Pacific Islander	0.587	0.635	0.048	8.239
employ1_cat	Native Hawaiian or other Pacific Islander	0.587	0.627	0.040	6.869
age	Native Hawaiian or other Pacific Islander	0.587	0.551	0.036	-6.099
medcost	Native Hawaiian or other Pacific Islander	0.587	0.620	0.034	5.728
gen_health	Native Hawaiian or other Pacific Islander	0.587	0.619	0.032	5.482
persdoc	Native Hawaiian or other Pacific Islander	0.587	0.603	0.016	2.754
checkup	Native Hawaiian or other Pacific Islander	0.587	0.572	0.014	-2.444
urbstat1	Native Hawaiian or other Pacific Islander	0.587	0.576	0.011	-1.838

Section 4: Model Diagnostics

Residuals vs. Fitted Plot

# Extract fitted + residuals
df_resid <- data.frame(
  fitted = fitted(model),
  dev_resid = residuals(model, type = "deviance"),
  pearson_resid = residuals(model, type = "pearson")
)

# Deviance residuals vs fitted
ggplot(df_resid, aes(x = fitted, y = dev_resid)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = TRUE, color = "green") +
  labs(
    title = "Deviance Residuals vs Fitted (Survey-Weighted Full Model)",
    x = "Fitted Values",
    y = "Deviance Residuals"
  ) +
  theme_minimal()

Calibration Plot

# Create predicted and observed values from brfss_analytic and split sample into 10 group (deciles)
calib <- data.frame(
  pred = fitted(model),
  obs = brfss_analytic$cervicalscr_bin
) |> 
  mutate(decile = ntile(pred, 10)) |> 
  group_by(decile) |> 
  summarize(
    mean_pred = mean(pred),
    mean_obs = mean(obs)
  )

# Plot Predicted vs. Proportion for Calibration 
ggplot(calib, aes(x = mean_pred, y = mean_obs)) +
  geom_point(size = 3) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(
    title = "Calibration Plot (Survey-Weighted Full Model)",
    x = "Mean Predicted Probability",
    y = "Observed Proportion"
  ) +
  theme_minimal()

# Hosmer-Lemeshow Test 
hl_test <- ResourceSelection::hoslem.test(
  x = as.numeric(brfss_analytic$cervicalscr_bin) - 1,
  y = fitted(model),
  g = 10
)

hl_test

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  as.numeric(brfss_analytic$cervicalscr_bin) - 1, fitted(model)
## X-squared = 772960, df = 8, p-value < 2.2e-16

Cook’s Distance Plot

# Fit unweighted glm for Cook's Distance
model_glm <- glm(
  cervicalscr_bin ~ age + education + race_cat + employ1_cat +
    gen_health + income + urbstat1 + priminsur + persdoc +
    medcost + checkup,
  data = brfss_analytic,
  family = binomial()
)

# Number of observations
n <- nrow(brfss_analytic)

# Compute Cook's distance
cooks_d <- cooks.distance(model_glm)

# Identify influential points
influential_flag <- ifelse(cooks_d > 4/n,
                           "Potentially influential",
                           "Not influential")

# Count how many exceed the threshold
n_influential <- sum(cooks_d > 4/n)

# Build the augmented dataset
augment <- data.frame(
  obs_num = 1:n,
  cooks_d = cooks_d,
  influential = influential_flag
)

# Plot Cook's Distance
cooks_distance <- ggplot(augment, aes(x = obs_num, y = cooks_d, color = influential)) +
  geom_point(alpha = 0.6, size = 1.2) +
  geom_hline(yintercept = 4/n, linetype = "dashed",
             color = "red", linewidth = 1) +
  scale_color_manual(values = c("Potentially influential" = "tomato",
                                "Not influential" = "steelblue")) +
  labs(
    title = "Cook's Distance with Unweighted GLM as Approximation",
    subtitle = paste0("Dashed line = 4/n threshold | ",
                      n_influential, " potentially influential observations"),
    x = "Observation Number",
    y = "Cook's Distance",
    color = ""
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top")

# Print Cooks Distance Plot
print(cooks_distance)

Leverage vs. Standardized Deviance Residuals Plot

# Extract leverage for each observation from unweighted logistic regression
lev <- hatvalues(model_glm)

# Compute standard deviance residuals from unweigthted logistic regression
std_dev_resid <- rstandard(model_glm, type = "deviance")

# Create leverage vs. standardized residuals plot 
ggplot(data.frame(lev, std_dev_resid),
       aes(x = lev, y = std_dev_resid)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = TRUE, color = "red") +
  labs(
    title = "Leverage vs Standardized Deviance Residuals with Unweighted GLM as Approximation",
    x = "Leverage",
    y = "Standardized Deviance Residuals"
  ) +
  theme_minimal()

Section 5: Regression Visualization

Figure 1: Primary Association Visualization

# Create prediction from ggpredict
pred <- ggpredict(model, terms = "race_cat")

# Plot primary association visualization
ggplot(pred, aes(x = x, y = predicted)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  labs(
    x = "Race/Ethnicity",
    y = "Adjusted Predicted Probability of Screening",
    title = "Figure 1. Adjusted Predicted Cervical Cancer Screening Probability by Race/Ethnicity Category with 95% Confidence Intervals",
    caption = "Asian and American Indian or Alaska Native women have the lowest predicted probability of being screened for cervical cancer. White and multiracial females have the highest predicted probability of screening"
  ) +
  theme_minimal() +
  theme(
    plot.caption = element_text(hjust = 0.5, size = 10, face = "italic"),
    plot.margin = margin(t = 10, r = 10, b = 60, l = 10) 
  )

Figure 2: Full Model Coefficient Plot

# Tidy the weighted model output 
tbl_coef <- tidy(model, conf.int = TRUE, exponentiate = TRUE) |>
  filter(term != "(Intercept)") |> 
  mutate(
    term = forcats::fct_reorder(term, estimate)  # reorder for nicer plotting
  )

# Clean up term labels so the plot looks nice
tbl_coef_clean <- tbl_coef |>
  mutate(
    term_clean = case_when(
      str_detect(term, "^race_cat") ~ str_replace(term, "race_cat", "Race: "),
      str_detect(term, "^education") ~ str_replace(term, "education", "Education: "),
      str_detect(term, "^income") ~ str_replace(term, "income", "Income: "),
      str_detect(term, "^gen_health") ~ str_replace(term, "gen_health", "General health: "),
      str_detect(term, "^priminsur") ~ str_replace(term, "priminsur", "Insurance: "),
      str_detect(term, "^persdoc") ~ str_replace(term, "persdoc", "Personal doctor: "),
      str_detect(term, "^medcost") ~ str_replace(term, "medcost", "Cost barrier: "),
      str_detect(term, "^checkup") ~ str_replace(term, "checkup", "Time since checkup: "),
      str_detect(term, "^employ1_cat") ~ str_replace(term, "employ1_cat", "Employment: "),
      str_detect(term, "^urbstat1") ~ str_replace(term, "urbstat1", "Urbanicity: "),
      str_detect(term, "^age") ~ str_replace(term, "age", "Age: "),
      TRUE ~ term
    )
  ) 

# Create full plot
ggplot(tbl_coef_clean, aes(x = estimate, y = term_clean)) +
  geom_point(size = 3) +
  geom_errorbar(
    aes(xmin = conf.low, xmax = conf.high),
    width = 0.2,
    orientation = "y"
  ) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
  scale_x_log10() +
  labs(
    title = "Figure 2. Adjusted Odds Ratios of Cervical Cancer Screening with 95% Wald Confidence Intervals for All Covariates",
    x = "Odds Ratio (log scale)",
    y = "",
    caption = "Hispanic, Black, Asian, and American Indian or Alaska Native are significantly associated with lowered odds of cervical cancer screening. Education, income, and age demonstrate a dose response."
  ) +
  theme_minimal() +
  theme(
    plot.caption = element_text(hjust = 0.5, size = 10, face = "italic"),
    plot.margin = margin(t = 10, r = 10, b = 60, l = 10)
  )

Figure 3: Poster Model Coefficient Plot

# Put them all in order
race_order <- c(
  "Race: Multiracial",
  "Race: Hispanic",
  "Race: Native Hawaiian or other Pacific Islander",
  "Race: Black",
  "Race: American Indian or Alaska Native",
  "Race: Asian"
)

checkup_order <- c(
  "Time since checkup: Within past 2 years",
  "Time since checkup: 5 or more years ago",
  "Time since checkup: Never",
  "Time since checkup: Don't Know/Not Sure"
)

education_order <- c(
  "Education: High school graduate",
  "Education: Some college",
  "Education: College graduate"
)

income_order <- c(
  "Income: $15,000 to < $25,000",
  "Income: $25,000 to < $35,000",
  "Income: $35,000 to < $50,000",
  "Income: $50,000 to < $100,000",
  "Income: $100,000 to < $200,000",
  "Income: More than $200,000"
)

persdoc_order <- c(
  "Personal doctor: Yes only one",
  "Personal doctor: More than one"
)

tbl_coef_clean <- tbl_coef |>
  filter(
    str_detect(term, "^(race_cat|checkup|education|income|persdoc)")
  ) |>

  mutate(
    term_clean = case_when(
      str_detect(term, "^race_catMultiracial") ~ "Race: Multiracial",
      str_detect(term, "^race_catHispanic") ~ "Race: Hispanic",
      str_detect(term, "^race_catBlack") ~ "Race: Black",
      str_detect(term, "^race_catAmerican Indian or Alaska Native") ~ "Race: American Indian or Alaska Native",
      str_detect(term, "^race_catAsian") ~ "Race: Asian",
      str_detect(term, "^race_catNative Hawaiian or other Pacific Islander") ~ "Race: Native Hawaiian or other Pacific Islander",
      str_detect(term, "^checkup") ~ str_replace(term, "checkup", "Time since checkup: "),
      str_detect(term, "^education") ~ str_replace(term, "education", "Education: "),
      str_detect(term, "^income") ~ str_replace(term, "income", "Income: "),
      str_detect(term, "^persdocYes only one") ~ "Personal doctor: Yes only one",
str_detect(term, "^persdocMore than one") ~ "Personal doctor: More than one",
      TRUE ~ term
    )
  ) |>

  mutate(
    term_clean = factor(
      term_clean,
      levels = c(
        race_order,
        checkup_order,
        persdoc_order,
        education_order,
        income_order
      )
    )
  )

ggplot(tbl_coef_clean, aes(x = estimate, y = term_clean)) +
  geom_point(size = 3) +
  geom_errorbar(
    aes(xmin = conf.low, xmax = conf.high),
    width = 0.2,
    orientation = "y"
  ) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "blue") +
  scale_x_log10() +
  labs(
    title = "Adjusted Odds Ratios of Specific Covariates with 95% Confidence Intervals",
    x = "Odds Ratio (log scale)",
    y = ""
  ) +
  theme_minimal(base_size = 13)

Final Project Submissions

EPI 553 - Advanced Epidemiologic Methods

Zoya Hayes

2026-05-05

Section 1: Analytical Sample Update

Setup and Data Preparation

Loading BRFSS 2024 Data

Creating a Working Subset

Descriptive Statistics

Cervical Screening Distribution and Percentages

Race Distribution and Percentages

Age Distribution and Percentages

Level of Education Completed Distribution and Percentages

Current Employment Status Distribution and Percentages

General Health Status Distribution and Percentages

Income Category Distribution and Percentages

Urban Rural Status Distribution and Percentages

Current Health Care Coverage Distribution and Percentages

Personal Health Care Provider Distribution and Percentages

Could not afford to see a Doctor Distribution and Percentages

Time since last routine checkup Distribution and Percentages

Distribution of Cervical Cancer Screening

Association Between Cervical Cancer Screening by Race/Ethnicity

Cervical Cancer Screening Distribution by Race/Ethnicity

Weighted Screening Proportion by Race and Ethnicity with Confidence Intervals

Split Bar Plot for Education Levels

Split Bar Plot for Employment Status

Split Bar Plot for Personal Doctor Status

Split Bar Plot for Age Categories

Section 2: Regression Model Specification

Model 1: Unadjusted Cervical Cancer Screening Regressed on Race Categories

Model Full: Adjusted Cervical Cancer Screening Regressed on Race Categories plus all covariates

Section 3: Regression Results

Model Limited: Unadjusted Cervical Cancer Screening Regressed on Race Categories Odds Ratios, 95% Confidence Interval, and P-Values

Model Full: Adjusted Cervical Cancer Screening Regressed on Race Categories plus all covariated Odds Ratios, 95% Confidence Interval, and P-Values

Which Coefficient Moves the Race Category the Most

Section 4: Model Diagnostics

Residuals vs. Fitted Plot

Calibration Plot

Cook’s Distance Plot

Leverage vs. Standardized Deviance Residuals Plot

Section 5: Regression Visualization

Figure 1: Primary Association Visualization

Figure 2: Full Model Coefficient Plot

Figure 3: Poster Model Coefficient Plot