Setup and Data Preparation

# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)
library(janitor)
library(broom.helpers)
library(ggplot2)

Loading BRFSS 2024 Data

The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.

# Load the full BRFSS 2024 dataset
brfss_full <- read_xpt("/Users/zoya_hayes/Desktop/EPI553/EPI553_Coding/FinalProject/BRFSS2024") %>%
  janitor::clean_names()

# Display dataset dimensions
names(brfss_full)
##   [1] "state"    "fmonth"   "idate"    "imonth"   "iday"     "iyear"   
##   [7] "dispcode" "seqno"    "psu"      "ctelenm1" "pvtresd1" "colghous"
##  [13] "statere1" "celphon1" "ladult1"  "numadult" "respslc1" "landsex3"
##  [19] "safetime" "ctelnum1" "cellfon5" "cadult1"  "cellsex3" "pvtresd3"
##  [25] "cclghous" "cstate1"  "landline" "hhadult"  "sexvar"   "genhlth" 
##  [31] "physhlth" "menthlth" "poorhlth" "primins2" "persdoc3" "medcost1"
##  [37] "checkup1" "exerany2" "lastden4" "rmvteth4" "cvdinfr4" "cvdcrhd4"
##  [43] "cvdstrk3" "asthma3"  "asthnow"  "chcscnc1" "chcocnc1" "chccopd3"
##  [49] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital" 
##  [55] "educa"    "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
##  [61] "employ1"  "children" "income3"  "pregnant" "weight2"  "height3" 
##  [67] "deaf"     "blind"    "decide"   "diffwalk" "diffdres" "diffalon"
##  [73] "hadmam"   "howlong"  "cervscrn" "crvclcnc" "crvclpap" "crvclhpv"
##  [79] "hadhyst2" "hadsigm4" "colnsigm" "colntes1" "sigmtes1" "lastsig4"
##  [85] "colncncr" "vircolo1" "vclntes2" "smalstol" "stoltest" "stooldn2"
##  [91] "bldstfit" "sdnates1" "smoke100" "smokday2" "usenow3"  "ecignow3"
##  [97] "lcsfirst" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "alcday4" 
## [103] "avedrnk4" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3" "imfvpla5"
## [109] "pneuvac4" "hivtst7"  "hivtstd3" "hivrisk5" "pdiabts1" "prediab2"
## [115] "diabtype" "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1"
## [121] "feetsore" "arthexer" "shingle2" "hpvadvc4" "hpvadsh1" "tetanus1"
## [127] "cncrdiff" "cncrage"  "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum" 
## [133] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [139] "csrvctl2" "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2"
## [145] "cimemlo1" "cdworry"  "cddiscu1" "cdhous1"  "cdsocia1" "caregiv1"
## [151] "crgvrel5" "crgvprb4" "crgvalzd" "crgvnurs" "crgvper2" "crgvhou2"
## [157] "crgvhrs2" "crgvlng2" "acedeprs" "acedrink" "acedrugs" "aceprisn"
## [163] "acedivrc" "acepunch" "acehurt1" "aceswear" "acetouch" "acetthem"
## [169] "acehvsex" "aceadsaf" "aceadned" "lsatisfy" "emtsuprt" "sdlonely"
## [175] "sdhemply" "foodstmp" "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp"
## [181] "howsafe1" "marijan1" "marjsmok" "marjeat"  "marjvape" "marjdab" 
## [187] "marjothr" "usemrjn4" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [193] "heattbco" "ssbsugr2" "ssbfrut3" "firearm5" "gunload"  "loadulk2"
## [199] "rcsgend1" "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "somale"  
## [205] "sofemale" "hadsex"   "pfpprvn4" "typcntr9" "nobcuse8" "qstver"  
## [211] "qstlang"  "hpvdsht"  "icfqstvr" "metstat"  "urbstat"  "mscode"  
## [217] "ststr"    "strwt"    "rawrake"  "wt2rake"  "imprace"  "chispnc" 
## [223] "crace1"   "cageg"    "cllcpwt"  "dualuse"  "dualcor"  "llcpwt2" 
## [229] "llcpwt"   "rfhlth"   "phys14d"  "ment14d"  "hlthpl2"  "hcvu654" 
## [235] "totinda"  "exteth3"  "alteth3"  "denvst3"  "michd"    "ltasth1" 
## [241] "casthm1"  "asthms1"  "drdxar2"  "mrace1"   "hispanc"  "race"    
## [247] "raceg21"  "racegr3"  "raceprv"  "sex"      "ageg5yr"  "age65yr" 
## [253] "age80"    "age_g"    "htin4"    "htm4"     "wtkg3"    "bmi5"    
## [259] "bmi5cat"  "rfbmi5"   "chldcnt"  "educag"   "incomg1"  "rfmam23" 
## [265] "mam402y"  "crvscrn"  "rfpap37"  "hpv5yr1"  "paphpv1"  "hadcoln" 
## [271] "clnscp2"  "hadsigm"  "sgmscp2"  "sgms102"  "rfblds6"  "stoldn2" 
## [277] "vircol2"  "sbonti2"  "crcrec3"  "smoker3"  "rfsmok3"  "cureci3" 
## [283] "lcslast"  "lcsnumc"  "lcsage"   "lcsysmk"  "packday"  "packyrs" 
## [289] "lcsyqts"  "lcssmkg"  "lcselig"  "lcsctsn"  "lcspstf"  "drnkany6"
## [295] "drocdy4"  "rfbing6"  "drnkwk3"  "rfdrhv9"  "flshot7"  "pneumo3" 
## [301] "aidtst4"

Creating a Working Subset

# Select variables of interest and create analytic dataset
brfss_subset <- brfss_full %>%
  select(
    # Outcome: Have you ever had a cervical cancer screening test
    cervscrn,
    # Demographics
    ageg5yr,      # Age category, fourteen-level age category
    sexvar,         # Sex
    race,       # Race/ethnicity
    educag,     # Education level
    employ1,    # Employment status 
    # Health status
    genhlth,    # General health
    incomg1,    # Income category
    urbstat,    # Urban/Rural Status
    primins2,   # Primary source of health care coverage
    persdoc3,   # Personal health care provider
    medcost1,   # Delayed care due to cost
    checkup1,   # Time since last routine checkup
  )

# Filter for only females
brfss_female <- brfss_subset %>%
  filter(
    sexvar == 2)         # 2 = Female in BRFSS

# Filter for age (25-64)
brfss_filtered <- brfss_female %>% 
  filter(
    ageg5yr %in% 2:9)     # 1–9 correspond to ages 25–64

# Recode variables
brfss_recode <- brfss_filtered %>% 
 mutate(
   cervicalscr = factor(case_when(
     cervscrn == 1 ~ "Screened",
     cervscrn == 2 ~ "Not Screened"),
     levels =
       c("Screened", "Not Screened")), 
   age = factor(case_when(
      ageg5yr == 2 ~ "25-29",
      ageg5yr == 3 ~ "30-34",
      ageg5yr == 4 ~ "35-39",
      ageg5yr == 5 ~ "40-44",
      ageg5yr == 6 ~ "45-49",
      ageg5yr == 7 ~ "50-54",
      ageg5yr == 8 ~ "55-59",
      ageg5yr == 9 ~ "60-64"), 
      levels = c("25-29","30-34","35-39","40-44", "45-49","50-54","55-59","60-64")),
    sex = factor("Female", 
                 levels = "Female"),
    education = factor(case_when(
      educag == 1 ~ "< High school",
      educag == 2 ~ "High school graduate",
      educag == 3 ~ "Some college",
      educag == 4 ~ "College graduate"), 
      levels = c("< High school", "High school graduate",
                  "Some college", "College graduate")),
    race_cat = factor(case_when(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "American Indian or Alaska Native",
      race == 4 ~ "Asian",
      race == 5 ~ "Native Hawaiian or other Pacific Islander",
      race == 6 ~ "Other race",
      race == 7 ~ "Multiracial",
      race == 8 ~ "Hispanic"), 
      levels = c("White",
      "Black",
      "American Indian or Alaska Native",
      "Asian",
      "Native Hawaiian or other Pacific Islander",
      "Other race",
      "Multiracial",
      "Hispanic")),
    employ1_cat = factor(case_when(
      employ1 == 1 ~ "Employed for wages",
      employ1 == 2 ~ "Self-employed",
      employ1 == 3 ~ "Out of work for 1 year or more",
      employ1 == 4 ~ "Out of work for less than 1 year",
      employ1 == 5 ~ "A homemaker",
      employ1 == 6 ~ "A student",
      employ1 == 7 ~ "Retired",
      employ1 == 8 ~ "Unable to work"), 
      levels = c("Employed for wages", "Self-employed",
      "Out of work for 1 year or more", "Out of work for less than 1 year",
      "A homemaker", "A student", "Retired", "Unable to work")),
    gen_health = factor(case_when(
      genhlth == 1 ~ "Excellent",
      genhlth == 2 ~ "Very Good",
      genhlth == 3 ~ "Good", 
      genhlth == 4 ~ "Fair",
      genhlth == 5 ~ "Poor"), 
      levels = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
    income = factor(case_when(
      incomg1 == 1 ~ "< $15,000",
      incomg1 == 2 ~ "$15,000 to < $25,000",
      incomg1 == 3 ~ "$25,000 to < $35,000",
      incomg1 == 4 ~ "$35,000 to < $50,000",
      incomg1 == 5 ~ "$50,000 to < $100,000", 
      incomg1 == 6 ~ "$100,000 to < $200,000", 
      incomg1 == 7 ~ "> $200,000"), 
      levels = c("< $15,000", "$15,000 to < $25,000", "$25,000 to < $35,000", "$35,000 to < $50,000", "$50,000 to < $100,000", "$100,000 to < $200,000", "> $200,000")), 
    urbstat1 = factor(case_when(
      urbstat == 1 ~ "Urban County",
      urbstat == 2 ~ "Rural County"),
      levels = c("Urban County", "Rural County")),
    priminsur = factor(case_when(
      primins2 == 1 ~ "Employer/Union Insurance",
      primins2 == 2 ~ "Private Nongovernmental Insurance",
      primins2 == 3 ~ "Medicare",
      primins2 == 4 ~ "Medigap",
      primins2 == 5 ~ "Medicaid",
      primins2 == 6 ~ "Children's Health Insurance Program",
      primins2 == 7 ~ "Military Related Health Care",
      primins2 == 8 ~ "Indian Health Service",
      primins2 == 9 ~ "State Sponsored Health Plan",
      primins2 == 10 ~ "Other Governmental Program",
      primins2 == 88 ~ "No Coverage of Any Type"), 
      levels = c("Employer/Union Insurance", "Private Nongovernmental Insurance", "Medicare", "Medigap", "Medicaid", "Children's Health Insurance Program", "Military Related Health Care", "Indian Health Service", "State Sponsored Health Plan",  "Other Governmental Program", "No Coverage of Any Type")), 
    persdoc = factor(case_when(
      persdoc3 == 1 ~ "Yes, only one",
      persdoc3 == 2 ~ "More than one",
      persdoc3 == 3 ~ "No"), 
      levels = c("Yes, only one", "More than one", "No")), 
    medcost = factor(case_when(
      medcost1 == 1 ~ "Yes",
      medcost1 == 2 ~ "No"), 
      levels = c("Yes", "No")), 
    checkup = factor(case_when(
      checkup1 == 1 ~ "Within past year",
      checkup1 == 2 ~ "Within past 2 years",
      checkup1 == 3 ~ "Within past 5 years", 
      checkup1 == 4 ~ "5 or more years ago",
      checkup1 == 7 ~ "Don't Know/Not Sure",
      checkup1 == 8 ~ "Never"),
      levels = c("Within past year", "Within past 2 years", "Within Past 5 years", "5 or more years ago","Don't Know/Not Sure", "Never"))
 )
  
 # Select only the recoded variables
  brfss_recode2 <- brfss_recode %>% 
    select(cervicalscr, age, sex, education, race_cat, employ1_cat, gen_health, income, urbstat1, priminsur, persdoc, medcost, checkup)

  # Count missing values in each column
missing_summary <- data.frame(
  Variable = names(brfss_recode2),
  Missing_Count = colSums(is.na(brfss_recode2)),
  Missing_Percent = round(colSums(is.na(brfss_recode2)) / nrow(brfss_recode2) * 100, 2)
)

# Show variables with the most missing data
print(missing_summary[order(-missing_summary$Missing_Count), ][1:13, ]) %>% 
  kable(
    caption = "Variables by Missingness"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  )
##                Variable Missing_Count Missing_Percent
## income           income         19076           14.88
## cervicalscr cervicalscr         10405            8.11
## checkup         checkup          6352            4.95
## urbstat1       urbstat1          5044            3.93
## priminsur     priminsur          3494            2.72
## employ1_cat employ1_cat          2230            1.74
## race_cat       race_cat          1722            1.34
## persdoc         persdoc           924            0.72
## education     education           422            0.33
## medcost         medcost           400            0.31
## gen_health   gen_health           259            0.20
## age                 age             0            0.00
## sex                 sex             0            0.00
Variables by Missingness
Variable Missing_Count Missing_Percent
income income 19076 14.88
cervicalscr cervicalscr 10405 8.11
checkup checkup 6352 4.95
urbstat1 urbstat1 5044 3.93
priminsur priminsur 3494 2.72
employ1_cat employ1_cat 2230 1.74
race_cat race_cat 1722 1.34
persdoc persdoc 924 0.72
education education 422 0.33
medcost medcost 400 0.31
gen_health gen_health 259 0.20
age age 0 0.00
sex sex 0 0.00
  brfss_recode3 <- brfss_recode2 %>% 
  # Filter to complete cases only
  drop_na()
 
  set.seed(553)  # For reproducibility
  # Sample 50000 observations for manageable analysis
  brfss_analytic <- brfss_recode3 %>% 
  slice_sample(n = 50000)
  
# Display subset dimensions
cat("Working subset dimensions:",
    nrow(brfss_analytic), "observations,",
    ncol(brfss_analytic), "variables\n")
## Working subset dimensions: 50000 observations, 13 variables
# Save for later use
saveRDS(brfss_analytic,
  "/Users/zoya_hayes/Desktop/EPI553/EPI553_Coding/FinalProject/brfss_final_project_2024.rds")

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_analytic), ncol(brfss_analytic))) %>%
  kable(caption = "Analytic Dataset Dimensions") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Analytic Dataset Dimensions
Metric Value
Observations 50000
Variables 13

Descriptive Statistics

# Create Table 1 Descriptive Statistics
brfss_analytic %>% 
  select(cervicalscr, race_cat, age, education, employ1_cat, gen_health, income, urbstat1, priminsur, persdoc, medcost, checkup) %>% 
  tbl_summary(
    by = race_cat,   # Stratify Table by Race/Ethnicity
    label = list(
      cervicalscr ~ "Ever Had a Cervical Cancer Screening Test",
      age ~ "Age Category",
      education ~ "Level of Education Completed",
      employ1_cat ~ "Current Employment Status",
      gen_health ~ "General Health Status",
      income ~ "Income Category",
      urbstat1 ~ "Urban/Rural Status",
      priminsur ~ "Current Health Care Coverage",
      persdoc ~ "Personal Health Care Provider",
      medcost ~ "Could not afford to see Doctor (past 12 months)",
      checkup ~ "Time since last routine checkup"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) %>%
  add_n() %>%
  bold_labels() %>%
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2024 Analytic Sample (n = 50,000)**") %>% 
 modify_footnote(
    all_stat_cols() ~ 
      "Analytic sample includes 50,000 complete-case observations. Missing values were excluded listwise for all variables included in this table."
  )
Table 1. Descriptive Statistics — BRFSS 2024 Analytic Sample (n = 50,000)
Characteristic N White
N = 35,496
1
Black
N = 4,879
1
American Indian or Alaska Native
N = 870
1
Asian
N = 1,248
1
Native Hawaiian or other Pacific Islander
N = 178
1
Other race
N = 272
1
Multiracial
N = 1,272
1
Hispanic
N = 5,785
1
Ever Had a Cervical Cancer Screening Test 50,000







    Screened
32,904 (93%) 4,070 (83%) 735 (84%) 924 (74%) 138 (78%) 241 (89%) 1,161 (91%) 4,732 (82%)
    Not Screened
2,592 (7.3%) 809 (17%) 135 (16%) 324 (26%) 40 (22%) 31 (11%) 111 (8.7%) 1,053 (18%)
Age Category 50,000







    25-29
2,341 (6.6%) 386 (7.9%) 54 (6.2%) 204 (16%) 16 (9.0%) 22 (8.1%) 169 (13%) 808 (14%)
    30-34
2,980 (8.4%) 539 (11%) 80 (9.2%) 227 (18%) 23 (13%) 20 (7.4%) 189 (15%) 927 (16%)
    35-39
3,762 (11%) 560 (11%) 102 (12%) 184 (15%) 18 (10%) 25 (9.2%) 176 (14%) 852 (15%)
    40-44
4,254 (12%) 680 (14%) 110 (13%) 160 (13%) 36 (20%) 45 (17%) 172 (14%) 812 (14%)
    45-49
4,272 (12%) 604 (12%) 118 (14%) 148 (12%) 18 (10%) 30 (11%) 153 (12%) 762 (13%)
    50-54
4,913 (14%) 705 (14%) 110 (13%) 138 (11%) 18 (10%) 44 (16%) 133 (10%) 655 (11%)
    55-59
5,692 (16%) 685 (14%) 140 (16%) 89 (7.1%) 25 (14%) 42 (15%) 143 (11%) 516 (8.9%)
    60-64
7,282 (21%) 720 (15%) 156 (18%) 98 (7.9%) 24 (13%) 44 (16%) 137 (11%) 453 (7.8%)
Level of Education Completed 50,000







    < High school
855 (2.4%) 177 (3.6%) 65 (7.5%) 11 (0.9%) 7 (3.9%) 8 (2.9%) 42 (3.3%) 1,195 (21%)
    High school graduate
6,248 (18%) 1,096 (22%) 221 (25%) 142 (11%) 62 (35%) 60 (22%) 217 (17%) 1,398 (24%)
    Some college
9,355 (26%) 1,478 (30%) 315 (36%) 166 (13%) 58 (33%) 72 (26%) 354 (28%) 1,321 (23%)
    College graduate
19,038 (54%) 2,128 (44%) 269 (31%) 929 (74%) 51 (29%) 132 (49%) 659 (52%) 1,871 (32%)
Current Employment Status 50,000







    Employed for wages
22,392 (63%) 3,106 (64%) 480 (55%) 851 (68%) 102 (57%) 147 (54%) 756 (59%) 3,130 (54%)
    Self-employed
3,017 (8.5%) 322 (6.6%) 70 (8.0%) 102 (8.2%) 17 (9.6%) 23 (8.5%) 116 (9.1%) 536 (9.3%)
    Out of work for 1 year or more
687 (1.9%) 171 (3.5%) 47 (5.4%) 30 (2.4%) 8 (4.5%) 13 (4.8%) 40 (3.1%) 239 (4.1%)
    Out of work for less than 1 year
749 (2.1%) 221 (4.5%) 46 (5.3%) 38 (3.0%) 8 (4.5%) 10 (3.7%) 61 (4.8%) 277 (4.8%)
    A homemaker
2,720 (7.7%) 145 (3.0%) 57 (6.6%) 103 (8.3%) 15 (8.4%) 26 (9.6%) 82 (6.4%) 956 (17%)
    A student
336 (0.9%) 117 (2.4%) 17 (2.0%) 55 (4.4%) 2 (1.1%) 3 (1.1%) 37 (2.9%) 79 (1.4%)
    Retired
2,581 (7.3%) 245 (5.0%) 40 (4.6%) 34 (2.7%) 12 (6.7%) 11 (4.0%) 45 (3.5%) 127 (2.2%)
    Unable to work
3,014 (8.5%) 552 (11%) 113 (13%) 35 (2.8%) 14 (7.9%) 39 (14%) 135 (11%) 441 (7.6%)
General Health Status 50,000







    Excellent
5,372 (15%) 584 (12%) 90 (10%) 229 (18%) 37 (21%) 47 (17%) 156 (12%) 740 (13%)
    Very Good
13,208 (37%) 1,335 (27%) 194 (22%) 429 (34%) 33 (19%) 85 (31%) 382 (30%) 1,361 (24%)
    Good
10,979 (31%) 1,882 (39%) 342 (39%) 459 (37%) 60 (34%) 68 (25%) 442 (35%) 2,257 (39%)
    Fair
4,369 (12%) 864 (18%) 171 (20%) 105 (8.4%) 36 (20%) 42 (15%) 213 (17%) 1,163 (20%)
    Poor
1,568 (4.4%) 214 (4.4%) 73 (8.4%) 26 (2.1%) 12 (6.7%) 30 (11%) 79 (6.2%) 264 (4.6%)
Income Category 50,000







    < $15,000
1,634 (4.6%) 462 (9.5%) 111 (13%) 49 (3.9%) 15 (8.4%) 28 (10%) 102 (8.0%) 716 (12%)
    $15,000 to < $25,000
2,201 (6.2%) 506 (10%) 103 (12%) 62 (5.0%) 20 (11%) 31 (11%) 106 (8.3%) 934 (16%)
    $25,000 to < $35,000
2,575 (7.3%) 615 (13%) 144 (17%) 91 (7.3%) 22 (12%) 28 (10%) 123 (9.7%) 993 (17%)
    $35,000 to < $50,000
3,453 (9.7%) 798 (16%) 153 (18%) 125 (10%) 28 (16%) 24 (8.8%) 159 (13%) 845 (15%)
    $50,000 to < $100,000
10,922 (31%) 1,388 (28%) 226 (26%) 323 (26%) 44 (25%) 74 (27%) 358 (28%) 1,234 (21%)
    $100,000 to < $200,000
10,822 (30%) 864 (18%) 104 (12%) 346 (28%) 37 (21%) 63 (23%) 304 (24%) 799 (14%)
    > $200,000
3,889 (11%) 246 (5.0%) 29 (3.3%) 252 (20%) 12 (6.7%) 24 (8.8%) 120 (9.4%) 264 (4.6%)
Urban/Rural Status 50,000







    Urban County
30,102 (85%) 4,630 (95%) 543 (62%) 1,223 (98%) 173 (97%) 249 (92%) 1,136 (89%) 5,494 (95%)
    Rural County
5,394 (15%) 249 (5.1%) 327 (38%) 25 (2.0%) 5 (2.8%) 23 (8.5%) 136 (11%) 291 (5.0%)
Current Health Care Coverage 50,000







    Employer/Union Insurance
21,322 (60%) 2,294 (47%) 264 (30%) 821 (66%) 83 (47%) 124 (46%) 655 (51%) 2,073 (36%)
    Private Nongovernmental Insurance
4,223 (12%) 415 (8.5%) 49 (5.6%) 135 (11%) 11 (6.2%) 33 (12%) 105 (8.3%) 472 (8.2%)
    Medicare
2,109 (5.9%) 437 (9.0%) 63 (7.2%) 40 (3.2%) 20 (11%) 27 (9.9%) 83 (6.5%) 358 (6.2%)
    Medigap
16 (<0.1%) 0 (0%) 3 (0.3%) 0 (0%) 2 (1.1%) 0 (0%) 0 (0%) 4 (<0.1%)
    Medicaid
3,360 (9.5%) 976 (20%) 179 (21%) 106 (8.5%) 24 (13%) 45 (17%) 183 (14%) 954 (16%)
    Children's Health Insurance Program
12 (<0.1%) 3 (<0.1%) 1 (0.1%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 6 (0.1%)
    Military Related Health Care
905 (2.5%) 137 (2.8%) 20 (2.3%) 20 (1.6%) 6 (3.4%) 6 (2.2%) 44 (3.5%) 98 (1.7%)
    Indian Health Service
11 (<0.1%) 1 (<0.1%) 177 (20%) 2 (0.2%) 0 (0%) 4 (1.5%) 28 (2.2%) 11 (0.2%)
    State Sponsored Health Plan
1,236 (3.5%) 190 (3.9%) 31 (3.6%) 42 (3.4%) 15 (8.4%) 11 (4.0%) 67 (5.3%) 396 (6.8%)
    Other Governmental Program
933 (2.6%) 179 (3.7%) 24 (2.8%) 42 (3.4%) 11 (6.2%) 6 (2.2%) 39 (3.1%) 220 (3.8%)
    No Coverage of Any Type
1,369 (3.9%) 247 (5.1%) 59 (6.8%) 40 (3.2%) 6 (3.4%) 16 (5.9%) 68 (5.3%) 1,193 (21%)
Personal Health Care Provider 50,000







    Yes, only one
18,810 (53%) 2,425 (50%) 454 (52%) 669 (54%) 103 (58%) 123 (45%) 610 (48%) 2,842 (49%)
    More than one
14,355 (40%) 2,060 (42%) 304 (35%) 420 (34%) 58 (33%) 116 (43%) 543 (43%) 1,617 (28%)
    No
2,331 (6.6%) 394 (8.1%) 112 (13%) 159 (13%) 17 (9.6%) 33 (12%) 119 (9.4%) 1,326 (23%)
Could not afford to see Doctor (past 12 months) 50,000 4,022 (11%) 744 (15%) 147 (17%) 102 (8.2%) 33 (19%) 54 (20%) 230 (18%) 1,362 (24%)
Time since last routine checkup 50,000







    Within past year
30,254 (85%) 4,400 (90%) 733 (84%) 1,038 (83%) 141 (79%) 225 (83%) 1,039 (82%) 4,667 (81%)
    Within past 2 years
3,695 (10%) 402 (8.2%) 93 (11%) 166 (13%) 25 (14%) 38 (14%) 165 (13%) 808 (14%)
    Within Past 5 years
0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
    5 or more years ago
1,289 (3.6%) 48 (1.0%) 34 (3.9%) 25 (2.0%) 8 (4.5%) 8 (2.9%) 62 (4.9%) 176 (3.0%)
    Don't Know/Not Sure
196 (0.6%) 25 (0.5%) 6 (0.7%) 11 (0.9%) 2 (1.1%) 0 (0%) 5 (0.4%) 44 (0.8%)
    Never
62 (0.2%) 4 (<0.1%) 4 (0.5%) 8 (0.6%) 2 (1.1%) 1 (0.4%) 1 (<0.1%) 90 (1.6%)
1 Analytic sample includes 50,000 complete-case observations. Missing values were excluded listwise for all variables included in this table.

Distribution of Cervical Cancer Screening

# Create a bar plot for cervical cancer screening count
ggplot(brfss_analytic, aes(x = cervicalscr)) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5) +
  labs(title = "Figure 1: Bar Graph of Cervical Cancer Screening with Counts",
       x = "Cervical Cancer Screening",
       y = "Count") +
  theme_minimal()

Association Between Outcome and Primary Exposure

# Create bar plot demonstrating the proportion of cervical cancer screening by race/ethnicity 
brfss_analytic %>%
  group_by(race_cat) %>%
  summarize(prop_screened = mean(cervicalscr == "Screened")) %>%
  ggplot(aes(x = race_cat, y = prop_screened)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "Race/Ethnicity",
    y = "Proportion Screened",
    title = "Figure 2: Proportion of Cervical Cancer Screening by Race/Ethnicity"
  ) +
  theme_minimal()

# Create split bar plot for race/ethnicity
ggplot(brfss_analytic, aes(x = race_cat, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Race/Ethnicity",
    y = "Percent",
    fill = "Screened",
    title = "Figure 3: Cervical Screening Distribution by Race/Ethnicity"
  ) +
  theme_minimal()

# Create proportions for each race/ethnicity category
tbl <- brfss_analytic %>%
  group_by(race_cat) %>%
  summarize(
    prop = mean(cervicalscr == "Screened"),
    n = n()
  )

# Create Screening Proportion by Race and Ethnicity with Confidence Intervals 
ggplot(tbl, aes(x = race_cat, y = prop)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = prop - 0.03, ymax = prop + 0.03), width = 0.2) +
  labs(
    x = "Race/Ethnicity",
    y = "Proportion Screened",
    title = "Figure 4: Screening Proportions with Approximate CIs"
  ) +
  theme_minimal()

Associations between Covariates and Cervical Cancer Screening

# Create split bar plot for education levels 
ggplot(brfss_analytic, aes(x = education, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Level of Education",
    y = "Percent",
    fill = "Screened",
    title = "Figure 5: Cervical Screening Distribution by Level of Education"
  ) +
  theme_minimal()

# Create split bar plot for employment status
ggplot(brfss_analytic, aes(x = employ1_cat, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Employment Status",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Employment Status"
  ) +
  theme_minimal()

# Create split bar plot for personal doctor status
ggplot(brfss_analytic, aes(x = persdoc, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Personal Doctor Status",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Personal Doctor Status"
  ) +
  theme_minimal()

# Create split bar plot for age categories 
ggplot(brfss_analytic, aes(x = age, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Age Category (Years)",
    y = "Percent",
    fill = "Screened",
    title = "Figure 6: Cervical Screening Distribution by Age Category"
  ) +
  theme_minimal()