Setup and Data Preparation

# Load required packages
library(tidyverse)
library(haven)
library(knitr)
library(kableExtra)
library(plotly)
library(broom)
library(car)
library(ggeffects)
library(gtsummary)
library(ggstats)
library(janitor)
library(broom.helpers)
library(ggplot2)

Loading BRFSS 2024 Data

The BRFSS is a large-scale telephone survey that collects data on health-related risk behaviors, chronic health conditions, and use of preventive services from U.S. residents.

# Load the full BRFSS 2024 dataset
brfss_full <- read_xpt("/Users/zoya_hayes/Desktop/EPI553/EPI553_Coding/FinalProject/BRFSS2024") %>%
  janitor::clean_names()

# Display dataset dimensions
names(brfss_full)

##   [1] "state"    "fmonth"   "idate"    "imonth"   "iday"     "iyear"   
##   [7] "dispcode" "seqno"    "psu"      "ctelenm1" "pvtresd1" "colghous"
##  [13] "statere1" "celphon1" "ladult1"  "numadult" "respslc1" "landsex3"
##  [19] "safetime" "ctelnum1" "cellfon5" "cadult1"  "cellsex3" "pvtresd3"
##  [25] "cclghous" "cstate1"  "landline" "hhadult"  "sexvar"   "genhlth" 
##  [31] "physhlth" "menthlth" "poorhlth" "primins2" "persdoc3" "medcost1"
##  [37] "checkup1" "exerany2" "lastden4" "rmvteth4" "cvdinfr4" "cvdcrhd4"
##  [43] "cvdstrk3" "asthma3"  "asthnow"  "chcscnc1" "chcocnc1" "chccopd3"
##  [49] "addepev3" "chckdny2" "havarth4" "diabete4" "diabage4" "marital" 
##  [55] "educa"    "renthom1" "numhhol4" "numphon4" "cpdemo1c" "veteran3"
##  [61] "employ1"  "children" "income3"  "pregnant" "weight2"  "height3" 
##  [67] "deaf"     "blind"    "decide"   "diffwalk" "diffdres" "diffalon"
##  [73] "hadmam"   "howlong"  "cervscrn" "crvclcnc" "crvclpap" "crvclhpv"
##  [79] "hadhyst2" "hadsigm4" "colnsigm" "colntes1" "sigmtes1" "lastsig4"
##  [85] "colncncr" "vircolo1" "vclntes2" "smalstol" "stoltest" "stooldn2"
##  [91] "bldstfit" "sdnates1" "smoke100" "smokday2" "usenow3"  "ecignow3"
##  [97] "lcsfirst" "lcsnumcg" "lcsctsc1" "lcsscncr" "lcsctwhn" "alcday4" 
## [103] "avedrnk4" "drnk3ge5" "maxdrnks" "flushot7" "flshtmy3" "imfvpla5"
## [109] "pneuvac4" "hivtst7"  "hivtstd3" "hivrisk5" "pdiabts1" "prediab2"
## [115] "diabtype" "insulin1" "chkhemo3" "eyeexam1" "diabeye1" "diabedu1"
## [121] "feetsore" "arthexer" "shingle2" "hpvadvc4" "hpvadsh1" "tetanus1"
## [127] "cncrdiff" "cncrage"  "cncrtyp2" "csrvtrt3" "csrvdoc1" "csrvsum" 
## [133] "csrvrtrn" "csrvinst" "csrvinsr" "csrvdein" "csrvclin" "csrvpain"
## [139] "csrvctl2" "psatest1" "psatime1" "pcpsars2" "psasugs1" "pcstalk2"
## [145] "cimemlo1" "cdworry"  "cddiscu1" "cdhous1"  "cdsocia1" "caregiv1"
## [151] "crgvrel5" "crgvprb4" "crgvalzd" "crgvnurs" "crgvper2" "crgvhou2"
## [157] "crgvhrs2" "crgvlng2" "acedeprs" "acedrink" "acedrugs" "aceprisn"
## [163] "acedivrc" "acepunch" "acehurt1" "aceswear" "acetouch" "acetthem"
## [169] "acehvsex" "aceadsaf" "aceadned" "lsatisfy" "emtsuprt" "sdlonely"
## [175] "sdhemply" "foodstmp" "sdhfood1" "sdhbills" "sdhutils" "sdhtrnsp"
## [181] "howsafe1" "marijan1" "marjsmok" "marjeat"  "marjvape" "marjdab" 
## [187] "marjothr" "usemrjn4" "lastsmk2" "stopsmk2" "mentcigs" "mentecig"
## [193] "heattbco" "ssbsugr2" "ssbfrut3" "firearm5" "gunload"  "loadulk2"
## [199] "rcsgend1" "rcsxbrth" "rcsrltn2" "casthdx2" "casthno2" "somale"  
## [205] "sofemale" "hadsex"   "pfpprvn4" "typcntr9" "nobcuse8" "qstver"  
## [211] "qstlang"  "hpvdsht"  "icfqstvr" "metstat"  "urbstat"  "mscode"  
## [217] "ststr"    "strwt"    "rawrake"  "wt2rake"  "imprace"  "chispnc" 
## [223] "crace1"   "cageg"    "cllcpwt"  "dualuse"  "dualcor"  "llcpwt2" 
## [229] "llcpwt"   "rfhlth"   "phys14d"  "ment14d"  "hlthpl2"  "hcvu654" 
## [235] "totinda"  "exteth3"  "alteth3"  "denvst3"  "michd"    "ltasth1" 
## [241] "casthm1"  "asthms1"  "drdxar2"  "mrace1"   "hispanc"  "race"    
## [247] "raceg21"  "racegr3"  "raceprv"  "sex"      "ageg5yr"  "age65yr" 
## [253] "age80"    "age_g"    "htin4"    "htm4"     "wtkg3"    "bmi5"    
## [259] "bmi5cat"  "rfbmi5"   "chldcnt"  "educag"   "incomg1"  "rfmam23" 
## [265] "mam402y"  "crvscrn"  "rfpap37"  "hpv5yr1"  "paphpv1"  "hadcoln" 
## [271] "clnscp2"  "hadsigm"  "sgmscp2"  "sgms102"  "rfblds6"  "stoldn2" 
## [277] "vircol2"  "sbonti2"  "crcrec3"  "smoker3"  "rfsmok3"  "cureci3" 
## [283] "lcslast"  "lcsnumc"  "lcsage"   "lcsysmk"  "packday"  "packyrs" 
## [289] "lcsyqts"  "lcssmkg"  "lcselig"  "lcsctsn"  "lcspstf"  "drnkany6"
## [295] "drocdy4"  "rfbing6"  "drnkwk3"  "rfdrhv9"  "flshot7"  "pneumo3" 
## [301] "aidtst4"

Creating a Working Subset

# Select variables of interest and create analytic dataset
brfss_subset <- brfss_full %>%
  select(
    # Outcome: Have you ever had a cervical cancer screening test
    cervscrn,
    # Demographics
    ageg5yr,      # Age category, fourteen-level age category
    sexvar,         # Sex
    race,       # Race/ethnicity
    educag,     # Education level
    employ1,    # Employment status 
    # Health status
    genhlth,    # General health
    incomg1,    # Income category
    urbstat,    # Urban/Rural Status
    primins2,   # Primary source of health care coverage
    persdoc3,   # Personal health care provider
    medcost1,   # Delayed care due to cost
    checkup1,   # Time since last routine checkup
  )

# Filter for only females
brfss_female <- brfss_subset %>%
  filter(
    sexvar == 2)         # 2 = Female in BRFSS

# Filter for age (25-64)
brfss_filtered <- brfss_female %>% 
  filter(
    ageg5yr %in% 2:9)     # 1–9 correspond to ages 25–64

# Recode variables
brfss_recode <- brfss_filtered %>% 
 mutate(
   cervicalscr = factor(case_when(
     cervscrn == 1 ~ "Screened",
     cervscrn == 2 ~ "Not Screened"),
     levels =
       c("Screened", "Not Screened")), 
   age = factor(case_when(
      ageg5yr == 2 ~ "25-29",
      ageg5yr == 3 ~ "30-34",
      ageg5yr == 4 ~ "35-39",
      ageg5yr == 5 ~ "40-44",
      ageg5yr == 6 ~ "45-49",
      ageg5yr == 7 ~ "50-54",
      ageg5yr == 8 ~ "55-59",
      ageg5yr == 9 ~ "60-64"), 
      levels = c("25-29","30-34","35-39","40-44", "45-49","50-54","55-59","60-64")),
    sex = factor("Female", 
                 levels = "Female"),
    education = factor(case_when(
      educag == 1 ~ "< High school",
      educag == 2 ~ "High school graduate",
      educag == 3 ~ "Some college",
      educag == 4 ~ "College graduate"), 
      levels = c("< High school", "High school graduate",
                  "Some college", "College graduate")),
    race_cat = factor(case_when(
      race == 1 ~ "White",
      race == 2 ~ "Black",
      race == 3 ~ "American Indian or Alaska Native",
      race == 4 ~ "Asian",
      race == 5 ~ "Native Hawaiian or other Pacific Islander",
      race == 6 ~ "Other race",
      race == 7 ~ "Multiracial",
      race == 8 ~ "Hispanic"), 
      levels = c("White",
      "Black",
      "American Indian or Alaska Native",
      "Asian",
      "Native Hawaiian or other Pacific Islander",
      "Other race",
      "Multiracial",
      "Hispanic")),
    employ1_cat = factor(case_when(
      employ1 == 1 ~ "Employed for wages",
      employ1 == 2 ~ "Self-employed",
      employ1 == 3 ~ "Out of work for 1 year or more",
      employ1 == 4 ~ "Out of work for less than 1 year",
      employ1 == 5 ~ "A homemaker",
      employ1 == 6 ~ "A student",
      employ1 == 7 ~ "Retired",
      employ1 == 8 ~ "Unable to work"), 
      levels = c("Employed for wages", "Self-employed",
      "Out of work for 1 year or more", "Out of work for less than 1 year",
      "A homemaker", "A student", "Retired", "Unable to work")),
    gen_health = factor(case_when(
      genhlth == 1 ~ "Excellent",
      genhlth == 2 ~ "Very Good",
      genhlth == 3 ~ "Good", 
      genhlth == 4 ~ "Fair",
      genhlth == 5 ~ "Poor"), 
      levels = c("Excellent", "Very Good", "Good", "Fair", "Poor")),
    income = factor(case_when(
      incomg1 == 1 ~ "< $15,000",
      incomg1 == 2 ~ "$15,000 to < $25,000",
      incomg1 == 3 ~ "$25,000 to < $35,000",
      incomg1 == 4 ~ "$35,000 to < $50,000",
      incomg1 == 5 ~ "$50,000 to < $100,000", 
      incomg1 == 6 ~ "$100,000 to < $200,000", 
      incomg1 == 7 ~ "> $200,000"), 
      levels = c("< $15,000", "$15,000 to < $25,000", "$25,000 to < $35,000", "$35,000 to < $50,000", "$50,000 to < $100,000", "$100,000 to < $200,000", "> $200,000")), 
    urbstat1 = factor(case_when(
      urbstat == 1 ~ "Urban County",
      urbstat == 2 ~ "Rural County"),
      levels = c("Urban County", "Rural County")),
    priminsur = factor(case_when(
      primins2 == 1 ~ "Employer/Union Insurance",
      primins2 == 2 ~ "Private Nongovernmental Insurance",
      primins2 == 3 ~ "Medicare",
      primins2 == 4 ~ "Medigap",
      primins2 == 5 ~ "Medicaid",
      primins2 == 6 ~ "Children's Health Insurance Program",
      primins2 == 7 ~ "Military Related Health Care",
      primins2 == 8 ~ "Indian Health Service",
      primins2 == 9 ~ "State Sponsored Health Plan",
      primins2 == 10 ~ "Other Governmental Program",
      primins2 == 88 ~ "No Coverage of Any Type"), 
      levels = c("Employer/Union Insurance", "Private Nongovernmental Insurance", "Medicare", "Medigap", "Medicaid", "Children's Health Insurance Program", "Military Related Health Care", "Indian Health Service", "State Sponsored Health Plan",  "Other Governmental Program", "No Coverage of Any Type")), 
    persdoc = factor(case_when(
      persdoc3 == 1 ~ "Yes, only one",
      persdoc3 == 2 ~ "More than one",
      persdoc3 == 3 ~ "No"), 
      levels = c("Yes, only one", "More than one", "No")), 
    medcost = factor(case_when(
      medcost1 == 1 ~ "Yes",
      medcost1 == 2 ~ "No"), 
      levels = c("Yes", "No")), 
    checkup = factor(case_when(
      checkup1 == 1 ~ "Within past year",
      checkup1 == 2 ~ "Within past 2 years",
      checkup1 == 3 ~ "Within past 5 years", 
      checkup1 == 4 ~ "5 or more years ago",
      checkup1 == 7 ~ "Don't Know/Not Sure",
      checkup1 == 8 ~ "Never"),
      levels = c("Within past year", "Within past 2 years", "Within Past 5 years", "5 or more years ago","Don't Know/Not Sure", "Never"))
 )
  
 # Select only the recoded variables
  brfss_recode2 <- brfss_recode %>% 
    select(cervicalscr, age, sex, education, race_cat, employ1_cat, gen_health, income, urbstat1, priminsur, persdoc, medcost, checkup)

  # Count missing values in each column
missing_summary <- data.frame(
  Variable = names(brfss_recode2),
  Missing_Count = colSums(is.na(brfss_recode2)),
  Missing_Percent = round(colSums(is.na(brfss_recode2)) / nrow(brfss_recode2) * 100, 2)
)

# Show variables with the most missing data
print(missing_summary[order(-missing_summary$Missing_Count), ][1:13, ]) %>% 
  kable(
    caption = "Variables by Missingness"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  )

##                Variable Missing_Count Missing_Percent
## income           income         19076           14.88
## cervicalscr cervicalscr         10405            8.11
## checkup         checkup          6352            4.95
## urbstat1       urbstat1          5044            3.93
## priminsur     priminsur          3494            2.72
## employ1_cat employ1_cat          2230            1.74
## race_cat       race_cat          1722            1.34
## persdoc         persdoc           924            0.72
## education     education           422            0.33
## medcost         medcost           400            0.31
## gen_health   gen_health           259            0.20
## age                 age             0            0.00
## sex                 sex             0            0.00

Variables by Missingness
	Variable	Missing_Count	Missing_Percent
income	income	19076	14.88
cervicalscr	cervicalscr	10405	8.11
checkup	checkup	6352	4.95
urbstat1	urbstat1	5044	3.93
priminsur	priminsur	3494	2.72
employ1_cat	employ1_cat	2230	1.74
race_cat	race_cat	1722	1.34
persdoc	persdoc	924	0.72
education	education	422	0.33
medcost	medcost	400	0.31
gen_health	gen_health	259	0.20
age	age	0	0.00
sex	sex	0	0.00

  brfss_recode3 <- brfss_recode2 %>% 
  # Filter to complete cases only
  drop_na()
 
  set.seed(553)  # For reproducibility
  # Sample 50000 observations for manageable analysis
  brfss_analytic <- brfss_recode3 %>% 
  slice_sample(n = 50000)
  
# Display subset dimensions
cat("Working subset dimensions:",
    nrow(brfss_analytic), "observations,",
    ncol(brfss_analytic), "variables\n")

## Working subset dimensions: 50000 observations, 13 variables

# Save for later use
saveRDS(brfss_analytic,
  "/Users/zoya_hayes/Desktop/EPI553/EPI553_Coding/FinalProject/brfss_final_project_2024.rds")

tibble(Metric = c("Observations", "Variables"),
       Value  = c(nrow(brfss_analytic), ncol(brfss_analytic))) %>%
  kable(caption = "Analytic Dataset Dimensions") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Analytic Dataset Dimensions
Metric	Value
Observations	50000
Variables	13

Descriptive Statistics

# Create Table 1 Descriptive Statistics
brfss_analytic %>% 
  select(cervicalscr, race_cat, age, education, employ1_cat, gen_health, income, urbstat1, priminsur, persdoc, medcost, checkup) %>% 
  tbl_summary(
    by = race_cat,   # Stratify Table by Race/Ethnicity
    label = list(
      cervicalscr ~ "Ever Had a Cervical Cancer Screening Test",
      age ~ "Age Category",
      education ~ "Level of Education Completed",
      employ1_cat ~ "Current Employment Status",
      gen_health ~ "General Health Status",
      income ~ "Income Category",
      urbstat1 ~ "Urban/Rural Status",
      priminsur ~ "Current Health Care Coverage",
      persdoc ~ "Personal Health Care Provider",
      medcost ~ "Could not afford to see Doctor (past 12 months)",
      checkup ~ "Time since last routine checkup"
    ),
    statistic = list(
      all_categorical() ~ "{n} ({p}%)"
    ),
    digits = all_continuous() ~ 1,
    missing = "no"
  ) %>%
  add_n() %>%
  bold_labels() %>%
  modify_caption("**Table 1. Descriptive Statistics — BRFSS 2024 Analytic Sample (n = 50,000)**") %>% 
 modify_footnote(
    all_stat_cols() ~ 
      "Analytic sample includes 50,000 complete-case observations. Missing values were excluded listwise for all variables included in this table."
  )

**Table 1. Descriptive Statistics — BRFSS 2024 Analytic Sample (n = 50,000)**
Characteristic	N	White N = 35,496¹	Black N = 4,879¹	American Indian or Alaska Native N = 870¹	Asian N = 1,248¹	Native Hawaiian or other Pacific Islander N = 178¹	Other race N = 272¹	Multiracial N = 1,272¹	Hispanic N = 5,785¹
Ever Had a Cervical Cancer Screening Test	50,000
Screened		32,904 (93%)	4,070 (83%)	735 (84%)	924 (74%)	138 (78%)	241 (89%)	1,161 (91%)	4,732 (82%)
Not Screened		2,592 (7.3%)	809 (17%)	135 (16%)	324 (26%)	40 (22%)	31 (11%)	111 (8.7%)	1,053 (18%)
Age Category	50,000
25-29		2,341 (6.6%)	386 (7.9%)	54 (6.2%)	204 (16%)	16 (9.0%)	22 (8.1%)	169 (13%)	808 (14%)
30-34		2,980 (8.4%)	539 (11%)	80 (9.2%)	227 (18%)	23 (13%)	20 (7.4%)	189 (15%)	927 (16%)
35-39		3,762 (11%)	560 (11%)	102 (12%)	184 (15%)	18 (10%)	25 (9.2%)	176 (14%)	852 (15%)
40-44		4,254 (12%)	680 (14%)	110 (13%)	160 (13%)	36 (20%)	45 (17%)	172 (14%)	812 (14%)
45-49		4,272 (12%)	604 (12%)	118 (14%)	148 (12%)	18 (10%)	30 (11%)	153 (12%)	762 (13%)
50-54		4,913 (14%)	705 (14%)	110 (13%)	138 (11%)	18 (10%)	44 (16%)	133 (10%)	655 (11%)
55-59		5,692 (16%)	685 (14%)	140 (16%)	89 (7.1%)	25 (14%)	42 (15%)	143 (11%)	516 (8.9%)
60-64		7,282 (21%)	720 (15%)	156 (18%)	98 (7.9%)	24 (13%)	44 (16%)	137 (11%)	453 (7.8%)
Level of Education Completed	50,000
< High school		855 (2.4%)	177 (3.6%)	65 (7.5%)	11 (0.9%)	7 (3.9%)	8 (2.9%)	42 (3.3%)	1,195 (21%)
High school graduate		6,248 (18%)	1,096 (22%)	221 (25%)	142 (11%)	62 (35%)	60 (22%)	217 (17%)	1,398 (24%)
Some college		9,355 (26%)	1,478 (30%)	315 (36%)	166 (13%)	58 (33%)	72 (26%)	354 (28%)	1,321 (23%)
College graduate		19,038 (54%)	2,128 (44%)	269 (31%)	929 (74%)	51 (29%)	132 (49%)	659 (52%)	1,871 (32%)
Current Employment Status	50,000
Employed for wages		22,392 (63%)	3,106 (64%)	480 (55%)	851 (68%)	102 (57%)	147 (54%)	756 (59%)	3,130 (54%)
Self-employed		3,017 (8.5%)	322 (6.6%)	70 (8.0%)	102 (8.2%)	17 (9.6%)	23 (8.5%)	116 (9.1%)	536 (9.3%)
Out of work for 1 year or more		687 (1.9%)	171 (3.5%)	47 (5.4%)	30 (2.4%)	8 (4.5%)	13 (4.8%)	40 (3.1%)	239 (4.1%)
Out of work for less than 1 year		749 (2.1%)	221 (4.5%)	46 (5.3%)	38 (3.0%)	8 (4.5%)	10 (3.7%)	61 (4.8%)	277 (4.8%)
A homemaker		2,720 (7.7%)	145 (3.0%)	57 (6.6%)	103 (8.3%)	15 (8.4%)	26 (9.6%)	82 (6.4%)	956 (17%)
A student		336 (0.9%)	117 (2.4%)	17 (2.0%)	55 (4.4%)	2 (1.1%)	3 (1.1%)	37 (2.9%)	79 (1.4%)
Retired		2,581 (7.3%)	245 (5.0%)	40 (4.6%)	34 (2.7%)	12 (6.7%)	11 (4.0%)	45 (3.5%)	127 (2.2%)
Unable to work		3,014 (8.5%)	552 (11%)	113 (13%)	35 (2.8%)	14 (7.9%)	39 (14%)	135 (11%)	441 (7.6%)
General Health Status	50,000
Excellent		5,372 (15%)	584 (12%)	90 (10%)	229 (18%)	37 (21%)	47 (17%)	156 (12%)	740 (13%)
Very Good		13,208 (37%)	1,335 (27%)	194 (22%)	429 (34%)	33 (19%)	85 (31%)	382 (30%)	1,361 (24%)
Good		10,979 (31%)	1,882 (39%)	342 (39%)	459 (37%)	60 (34%)	68 (25%)	442 (35%)	2,257 (39%)
Fair		4,369 (12%)	864 (18%)	171 (20%)	105 (8.4%)	36 (20%)	42 (15%)	213 (17%)	1,163 (20%)
Poor		1,568 (4.4%)	214 (4.4%)	73 (8.4%)	26 (2.1%)	12 (6.7%)	30 (11%)	79 (6.2%)	264 (4.6%)
Income Category	50,000
< $15,000		1,634 (4.6%)	462 (9.5%)	111 (13%)	49 (3.9%)	15 (8.4%)	28 (10%)	102 (8.0%)	716 (12%)
$15,000 to < $25,000		2,201 (6.2%)	506 (10%)	103 (12%)	62 (5.0%)	20 (11%)	31 (11%)	106 (8.3%)	934 (16%)
$25,000 to < $35,000		2,575 (7.3%)	615 (13%)	144 (17%)	91 (7.3%)	22 (12%)	28 (10%)	123 (9.7%)	993 (17%)
$35,000 to < $50,000		3,453 (9.7%)	798 (16%)	153 (18%)	125 (10%)	28 (16%)	24 (8.8%)	159 (13%)	845 (15%)
$50,000 to < $100,000		10,922 (31%)	1,388 (28%)	226 (26%)	323 (26%)	44 (25%)	74 (27%)	358 (28%)	1,234 (21%)
$100,000 to < $200,000		10,822 (30%)	864 (18%)	104 (12%)	346 (28%)	37 (21%)	63 (23%)	304 (24%)	799 (14%)
> $200,000		3,889 (11%)	246 (5.0%)	29 (3.3%)	252 (20%)	12 (6.7%)	24 (8.8%)	120 (9.4%)	264 (4.6%)
Urban/Rural Status	50,000
Urban County		30,102 (85%)	4,630 (95%)	543 (62%)	1,223 (98%)	173 (97%)	249 (92%)	1,136 (89%)	5,494 (95%)
Rural County		5,394 (15%)	249 (5.1%)	327 (38%)	25 (2.0%)	5 (2.8%)	23 (8.5%)	136 (11%)	291 (5.0%)
Current Health Care Coverage	50,000
Employer/Union Insurance		21,322 (60%)	2,294 (47%)	264 (30%)	821 (66%)	83 (47%)	124 (46%)	655 (51%)	2,073 (36%)
Private Nongovernmental Insurance		4,223 (12%)	415 (8.5%)	49 (5.6%)	135 (11%)	11 (6.2%)	33 (12%)	105 (8.3%)	472 (8.2%)
Medicare		2,109 (5.9%)	437 (9.0%)	63 (7.2%)	40 (3.2%)	20 (11%)	27 (9.9%)	83 (6.5%)	358 (6.2%)
Medigap		16 (<0.1%)	0 (0%)	3 (0.3%)	0 (0%)	2 (1.1%)	0 (0%)	0 (0%)	4 (<0.1%)
Medicaid		3,360 (9.5%)	976 (20%)	179 (21%)	106 (8.5%)	24 (13%)	45 (17%)	183 (14%)	954 (16%)
Children's Health Insurance Program		12 (<0.1%)	3 (<0.1%)	1 (0.1%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)	6 (0.1%)
Military Related Health Care		905 (2.5%)	137 (2.8%)	20 (2.3%)	20 (1.6%)	6 (3.4%)	6 (2.2%)	44 (3.5%)	98 (1.7%)
Indian Health Service		11 (<0.1%)	1 (<0.1%)	177 (20%)	2 (0.2%)	0 (0%)	4 (1.5%)	28 (2.2%)	11 (0.2%)
State Sponsored Health Plan		1,236 (3.5%)	190 (3.9%)	31 (3.6%)	42 (3.4%)	15 (8.4%)	11 (4.0%)	67 (5.3%)	396 (6.8%)
Other Governmental Program		933 (2.6%)	179 (3.7%)	24 (2.8%)	42 (3.4%)	11 (6.2%)	6 (2.2%)	39 (3.1%)	220 (3.8%)
No Coverage of Any Type		1,369 (3.9%)	247 (5.1%)	59 (6.8%)	40 (3.2%)	6 (3.4%)	16 (5.9%)	68 (5.3%)	1,193 (21%)
Personal Health Care Provider	50,000
Yes, only one		18,810 (53%)	2,425 (50%)	454 (52%)	669 (54%)	103 (58%)	123 (45%)	610 (48%)	2,842 (49%)
More than one		14,355 (40%)	2,060 (42%)	304 (35%)	420 (34%)	58 (33%)	116 (43%)	543 (43%)	1,617 (28%)
No		2,331 (6.6%)	394 (8.1%)	112 (13%)	159 (13%)	17 (9.6%)	33 (12%)	119 (9.4%)	1,326 (23%)
Could not afford to see Doctor (past 12 months)	50,000	4,022 (11%)	744 (15%)	147 (17%)	102 (8.2%)	33 (19%)	54 (20%)	230 (18%)	1,362 (24%)
Time since last routine checkup	50,000
Within past year		30,254 (85%)	4,400 (90%)	733 (84%)	1,038 (83%)	141 (79%)	225 (83%)	1,039 (82%)	4,667 (81%)
Within past 2 years		3,695 (10%)	402 (8.2%)	93 (11%)	166 (13%)	25 (14%)	38 (14%)	165 (13%)	808 (14%)
Within Past 5 years		0 (0%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)	0 (0%)
5 or more years ago		1,289 (3.6%)	48 (1.0%)	34 (3.9%)	25 (2.0%)	8 (4.5%)	8 (2.9%)	62 (4.9%)	176 (3.0%)
Don't Know/Not Sure		196 (0.6%)	25 (0.5%)	6 (0.7%)	11 (0.9%)	2 (1.1%)	0 (0%)	5 (0.4%)	44 (0.8%)
Never		62 (0.2%)	4 (<0.1%)	4 (0.5%)	8 (0.6%)	2 (1.1%)	1 (0.4%)	1 (<0.1%)	90 (1.6%)
¹ Analytic sample includes 50,000 complete-case observations. Missing values were excluded listwise for all variables included in this table.

Distribution of Cervical Cancer Screening

# Create a bar plot for cervical cancer screening count
ggplot(brfss_analytic, aes(x = cervicalscr)) +
  geom_bar() +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.5) +
  labs(title = "Figure 1: Bar Graph of Cervical Cancer Screening with Counts",
       x = "Cervical Cancer Screening",
       y = "Count") +
  theme_minimal()

Association Between Outcome and Primary Exposure

# Create bar plot demonstrating the proportion of cervical cancer screening by race/ethnicity 
brfss_analytic %>%
  group_by(race_cat) %>%
  summarize(prop_screened = mean(cervicalscr == "Screened")) %>%
  ggplot(aes(x = race_cat, y = prop_screened)) +
  geom_col(fill = "steelblue") +
  labs(
    x = "Race/Ethnicity",
    y = "Proportion Screened",
    title = "Figure 2: Proportion of Cervical Cancer Screening by Race/Ethnicity"
  ) +
  theme_minimal()

# Create split bar plot for race/ethnicity
ggplot(brfss_analytic, aes(x = race_cat, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Race/Ethnicity",
    y = "Percent",
    fill = "Screened",
    title = "Figure 3: Cervical Screening Distribution by Race/Ethnicity"
  ) +
  theme_minimal()

# Create proportions for each race/ethnicity category
tbl <- brfss_analytic %>%
  group_by(race_cat) %>%
  summarize(
    prop = mean(cervicalscr == "Screened"),
    n = n()
  )

# Create Screening Proportion by Race and Ethnicity with Confidence Intervals 
ggplot(tbl, aes(x = race_cat, y = prop)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = prop - 0.03, ymax = prop + 0.03), width = 0.2) +
  labs(
    x = "Race/Ethnicity",
    y = "Proportion Screened",
    title = "Figure 4: Screening Proportions with Approximate CIs"
  ) +
  theme_minimal()

Associations between Covariates and Cervical Cancer Screening

# Create split bar plot for education levels 
ggplot(brfss_analytic, aes(x = education, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Level of Education",
    y = "Percent",
    fill = "Screened",
    title = "Figure 5: Cervical Screening Distribution by Level of Education"
  ) +
  theme_minimal()

# Create split bar plot for employment status
ggplot(brfss_analytic, aes(x = employ1_cat, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Employment Status",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Employment Status"
  ) +
  theme_minimal()

# Create split bar plot for personal doctor status
ggplot(brfss_analytic, aes(x = persdoc, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Personal Doctor Status",
    y = "Percent",
    fill = "Screened",
    title = "Cervical Screening Distribution by Personal Doctor Status"
  ) +
  theme_minimal()

# Create split bar plot for age categories 
ggplot(brfss_analytic, aes(x = age, fill = cervicalscr)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    x = "Age Category (Years)",
    y = "Percent",
    fill = "Screened",
    title = "Figure 6: Cervical Screening Distribution by Age Category"
  ) +
  theme_minimal()

Progress Check-in 1: Data Cleaning and Exploratory Analysis

EPI 553 - Advanced Epidemiologic Methods

Zoya Hayes

2026-03-10

Setup and Data Preparation

Loading BRFSS 2024 Data

Creating a Working Subset

Descriptive Statistics

Distribution of Cervical Cancer Screening

Association Between Outcome and Primary Exposure

Associations between Covariates and Cervical Cancer Screening