library(readxl)
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(patchwork)
library(tibble)
library(zipcodeR)
library(sf)
## Warning: package 'sf' was built under R version 4.5.2
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
library(maps)
df <- read_excel("DF2.xlsx", col_types = "text")
# Standardize NA-like codes across T0/T1/T2 survey columns
survey_cols <- grep("^T[012]-", names(df), value = TRUE)
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")
df <- df %>%
mutate(
across(all_of(survey_cols), ~{
x <- trimws(as.character(.x))
x[x %in% na_codes] <- NA_character_
x
})
)
# ---- 1) Identify the ENROLLMENT column in DF2 (no guessing) ----
enroll_candidates <- names(df)[grepl("enroll", names(df), ignore.case = TRUE)]
print(enroll_candidates)
## [1] "Pre Enrollment Notes"
## [2] "(Screen for Enrollment) Interested in Program"
## [3] "Enrolled"
stopifnot(length(enroll_candidates) >= 1)
# Pick the first candidate by default (edit this ONE line if needed)
ENROLL_COL <- enroll_candidates[3]
cat("Enrolled:", ENROLL_COL, "\n")
## Enrolled: Enrolled
# Quick audit of values in enrollment column
print(sort(unique(na.omit(trimws(df[[ENROLL_COL]])))))
## [1] "N" "y" "Y"
# ---- 2) Identify Cohort column (must exist) ----
stopifnot("Cohort" %in% names(df))
# ---- 3) Build df_status with correct rules: Enrolled wins ----
df_status <- df %>%
mutate(
Enrolled_flag = case_when(
toupper(trimws(.data[[ENROLL_COL]])) %in% c("YES","Y","ENROLLED","TRUE","T","1") ~ TRUE,
toupper(trimws(.data[[ENROLL_COL]])) %in% c("NO","N","NOT ENROLLED","FALSE","F","0") ~ FALSE,
TRUE ~ FALSE # treat blanks/unknown as NOT enrolled for safety
),
Cohort_clean = case_when(
str_to_lower(trimws(as.character(Cohort))) %in% c("1","cohort 1","cohort1","c1") ~ "Cohort 1",
str_to_lower(trimws(as.character(Cohort))) %in% c("2","cohort 2","cohort2","c2") ~ "Cohort 2",
TRUE ~ NA_character_
),
Cohort_final = case_when(
Enrolled_flag & Cohort_clean %in% c("Cohort 1","Cohort 2") ~ Cohort_clean,
TRUE ~ "Not enrolled"
),
Enrolled = if_else(Cohort_final %in% c("Cohort 1","Cohort 2"), "Enrolled", "Not enrolled")
)
df_status <- df_status %>%
dplyr::mutate(
Enrolled = dplyr::if_else(Enrolled == "Enrolled", "Enrolled", "Not enrolled")
)
df_COHORT1 <- df_status %>% filter(Cohort_final == "Cohort 1")
df_COHORT2 <- df_status %>% filter(Cohort_final == "Cohort 2")
df_enrolled <- df_status %>% filter(Enrolled == "Enrolled")
t1_q_cols <- grep("^T1-", names(df_status), value = TRUE)
t1_q_cols <- t1_q_cols[!grepl("Contact|NOTES", t1_q_cols, ignore.case = TRUE)]
df_COHORT1 <- df_COHORT1 %>%
mutate(
T1_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t1_q_cols))))
)
n_c1_total <- nrow(df_COHORT1)
n_c1_t1_gt2 <- sum(df_COHORT1$T1_answer_count > 2, na.rm = TRUE)
pct_c1_t1_gt2 <- ifelse(n_c1_total > 0, round(100 * n_c1_t1_gt2 / n_c1_total, 1), NA_real_)
n_c1_attrition <- n_c1_total - n_c1_t1_gt2
pct_c1_attrition <- ifelse(n_c1_total > 0, round(100 * n_c1_attrition / n_c1_total, 1), NA_real_)
# ---- 4) PDSA summary table (counts only; targets included) ----
TARGET_COHORT1 <- 60
TARGET_COHORT2 <- 60
n_total <- nrow(df_status)
n_c1 <- nrow(df_COHORT1)
n_c2 <- nrow(df_COHORT2)
n_enrolled <- nrow(df_enrolled)
n_not_enrolled <- sum(df_status$Enrolled == "Not enrolled")
pdsa_summary <- tibble(
Metric = c(
"Total people in dataset",
"Cohort 1 — actual (ENROLLED==Yes AND Cohort==1)",
"Cohort 1 — target",
"Cohort 1 — shortfall",
"Cohort 2 — actual (ENROLLED==Yes AND Cohort==2)",
"Cohort 2 — target",
"Cohort 2 — shortfall",
"Enrolled total (Cohort 1 + Cohort 2)",
"Not enrolled total (everyone else)"
),
Count = c(
n_total,
n_c1,
TARGET_COHORT1,
TARGET_COHORT1 - n_c1,
n_c2,
TARGET_COHORT2,
TARGET_COHORT2 - n_c2,
n_enrolled,
n_not_enrolled
)
)
pdsa_summary <- pdsa_summary %>%
dplyr::bind_rows(
tibble::tibble(
Metric = c(
"Cohort 1 with >2 T1 questions answered",
"Cohort 1 attrition (<=2 T1 questions answered)"
),
Count = c(n_c1_t1_gt2, n_c1_attrition),
Denominator = c(n_c1_total, n_c1_total),
Percent = c(pct_c1_t1_gt2, pct_c1_attrition)
)
)
pdsa_summary
## # A tibble: 11 × 4
## Metric Count Denominator Percent
## <chr> <dbl> <int> <dbl>
## 1 Total people in dataset 764 NA NA
## 2 Cohort 1 — actual (ENROLLED==Yes AND Cohort==1) 60 NA NA
## 3 Cohort 1 — target 60 NA NA
## 4 Cohort 1 — shortfall 0 NA NA
## 5 Cohort 2 — actual (ENROLLED==Yes AND Cohort==2) 41 NA NA
## 6 Cohort 2 — target 60 NA NA
## 7 Cohort 2 — shortfall 19 NA NA
## 8 Enrolled total (Cohort 1 + Cohort 2) 101 NA NA
## 9 Not enrolled total (everyone else) 663 NA NA
## 10 Cohort 1 with >2 T1 questions answered 38 60 63.3
## 11 Cohort 1 attrition (<=2 T1 questions answered) 22 60 36.7
# Compare Enrolled vs All patients
# 1) Build long race rows (one row per race code per patient)
race_long <- df_status %>%
dplyr::select(Enrolled, Race) %>%
dplyr::filter(!is.na(Race), trimws(Race) != "") %>%
dplyr::mutate(Race = stringr::str_split(as.character(Race), "\\r\\n")) %>%
tidyr::unnest(Race) %>%
dplyr::mutate(Race = stringr::str_trim(Race)) %>%
dplyr::filter(stringr::str_detect(Race, "^R")) %>%
dplyr::mutate(
Race_clean = dplyr::case_when(
stringr::str_detect(Race, "R1") ~ "American Indian / Alaska Native",
stringr::str_detect(Race, "R2") ~ "Asian",
stringr::str_detect(Race, "R3") ~ "Black or African American",
stringr::str_detect(Race, "R4") ~ "Native Hawaiian / Pacific Islander",
stringr::str_detect(Race, "R5") ~ "White",
stringr::str_detect(Race, "R9") ~ "Other",
TRUE ~ NA_character_
)
) %>%
dplyr::filter(!is.na(Race_clean))
# 2) Duplicate into two comparison groups: All patients vs Enrolled
race_grouped <- dplyr::bind_rows(
race_long %>% dplyr::mutate(Group = "All patients"),
race_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
dplyr::filter(Group %in% c("All patients", "Enrolled")) %>%
dplyr::mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))
# 3) Tabulate within-group percentages
race_tab <- race_grouped %>%
dplyr::count(Group, Race_clean, name = "n") %>%
dplyr::group_by(Group) %>%
dplyr::mutate(percent = round(100 * n / sum(n), 1)) %>%
dplyr::ungroup()
# 4)
race_tab
## # A tibble: 10 × 4
## Group Race_clean n percent
## <fct> <chr> <int> <dbl>
## 1 All patients American Indian / Alaska Native 4 0.5
## 2 All patients Asian 20 2.7
## 3 All patients Black or African American 206 27.8
## 4 All patients Native Hawaiian / Pacific Islander 2 0.3
## 5 All patients Other 193 26.1
## 6 All patients White 315 42.6
## 7 Enrolled American Indian / Alaska Native 1 1
## 8 Enrolled Black or African American 40 40
## 9 Enrolled Other 37 37
## 10 Enrolled White 22 22
# Order races by max % in either group (keeps visual comparison intuitive)
race_order <- race_tab %>%
dplyr::group_by(Race_clean) %>%
dplyr::summarise(max_pct = max(percent), .groups = "drop") %>%
dplyr::arrange(max_pct) %>%
dplyr::pull(Race_clean)
race_tab <- race_tab %>%
dplyr::mutate(
Race_clean = factor(Race_clean, levels = race_order)
)
# Plot: Enrolled vs All patients
ggplot2::ggplot(
race_tab,
ggplot2::aes(x = Race_clean, y = percent, fill = Group)
) +
ggplot2::geom_col(
position = ggplot2::position_dodge(width = 0.8),
width = 0.7
) +
ggplot2::geom_text(
ggplot2::aes(label = paste0(percent, "%")),
position = ggplot2::position_dodge(width = 0.8),
hjust = -0.1,
size = 3.6
) +
ggplot2::coord_flip(clip = "off") +
ggplot2::scale_x_discrete(limits = rev(levels(race_tab$Race_clean))) +
ggplot2::scale_fill_manual(
values = c(
"All patients" = "#CFE1F2",
"Enrolled" = "#1F4E79"
)
) +
ggplot2::expand_limits(y = max(race_tab$percent, na.rm = TRUE) + 8) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "Race Distribution: Enrolled vs All Patients",
x = "Race",
y = "% within group",
fill = ""
) +
ggplot2::theme(
plot.title = ggplot2::element_text(face = "bold", size = 14),
axis.text.y = ggplot2::element_text(size = 10),
axis.text.x = ggplot2::element_text(size = 10),
legend.position = "top"
)

### SEX ANALYSIS — Enrolled vs All patients
sex_long <- df_status %>%
dplyr::transmute(
Enrolled,
Sex = trimws(as.character(`Legal Sex`))
) %>%
dplyr::filter(!is.na(Sex), Sex != "")
sex_tab <- dplyr::bind_rows(
sex_long %>% dplyr::mutate(Group = "All patients"),
sex_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
dplyr::count(Group, Sex, name = "n") %>%
dplyr::group_by(Group) %>%
dplyr::mutate(percent = round(100 * n / sum(n), 1)) %>%
dplyr::ungroup()
sex_order <- sex_tab %>%
dplyr::group_by(Sex) %>%
dplyr::summarise(max_pct = max(percent), .groups = "drop") %>%
dplyr::arrange(max_pct) %>%
dplyr::pull(Sex)
sex_tab <- sex_tab %>%
dplyr::mutate(
Sex = factor(Sex, levels = sex_order),
Group = factor(Group, levels = c("All patients", "Enrolled"))
)
ggplot2::ggplot(sex_tab, ggplot2::aes(x = Sex, y = percent, fill = Group)) +
ggplot2::geom_col(position = ggplot2::position_dodge(width = 0.8), width = 0.7) +
ggplot2::geom_text(
ggplot2::aes(label = paste0(percent, "%")),
position = ggplot2::position_dodge(width = 0.8),
hjust = -0.1,
size = 3.6,
color = "black"
) +
ggplot2::coord_flip(clip = "off") +
ggplot2::scale_x_discrete(limits = rev(levels(sex_tab$Sex))) +
ggplot2::scale_fill_manual(values = c("All patients" = "#CFE1F2", "Enrolled" = "#1F4E79")) +
ggplot2::expand_limits(y = max(sex_tab$percent, na.rm = TRUE) + 8) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "Legal Sex: Enrolled vs All Patients",
x = "",
y = "% within group",
fill = ""
) +
ggplot2::theme(
plot.title = ggplot2::element_text(face = "bold", size = 14),
axis.text.y = ggplot2::element_text(size = 10),
axis.text.x = ggplot2::element_text(size = 10),
legend.position = "top"
)

### ETHNICITY — Enrolled vs All patients
# 1) Clean ethnicity (DO NOT overwrite df_status; just create a temp view)
eth_long <- df_status %>%
dplyr::transmute(
Enrolled,
Ethnicity_raw = trimws(as.character(Ethnicity)),
Ethnicity_clean = dplyr::case_when(
grepl("E1", Ethnicity_raw) ~ "Spanish/Hispanic/Latino",
grepl("E2", Ethnicity_raw) ~ "Not Spanish/Hispanic/Latino",
grepl("S1", Ethnicity_raw) ~ "Patient declined",
grepl("S2", Ethnicity_raw) ~ "Patient unavailable",
TRUE ~ NA_character_
)
) %>%
dplyr::filter(!is.na(Ethnicity_clean), Ethnicity_clean != "")
# 2) Build comparison groups: All vs Enrolled
eth_tab <- dplyr::bind_rows(
eth_long %>% dplyr::mutate(Group = "All patients"),
eth_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
dplyr::count(Group, Ethnicity_clean, name = "n") %>%
dplyr::group_by(Group) %>%
dplyr::mutate(percent = round(100 * n / sum(n), 1)) %>%
dplyr::ungroup()
# 3) Order categories by max % across groups (optional)
eth_order <- eth_tab %>%
dplyr::group_by(Ethnicity_clean) %>%
dplyr::summarise(max_pct = max(percent), .groups = "drop") %>%
dplyr::arrange(max_pct) %>%
dplyr::pull(Ethnicity_clean)
eth_tab <- eth_tab %>%
dplyr::mutate(
Ethnicity_clean = factor(Ethnicity_clean, levels = eth_order),
Group = factor(Group, levels = c("All patients", "Enrolled"))
)
# 4) Plot
ggplot2::ggplot(eth_tab, ggplot2::aes(x = Ethnicity_clean, y = percent, fill = Group)) +
ggplot2::geom_col(position = ggplot2::position_dodge(width = 0.8), width = 0.7) +
ggplot2::geom_text(
ggplot2::aes(label = paste0(percent, "%")),
position = ggplot2::position_dodge(width = 0.8),
hjust = -0.1,
size = 3.6,
color = "black"
) +
ggplot2::coord_flip(clip = "off") +
ggplot2::scale_x_discrete(limits = rev(levels(eth_tab$Ethnicity_clean))) +
ggplot2::scale_fill_manual(values = c("All patients" = "#CFE1F2", "Enrolled" = "#1F4E79")) +
ggplot2::expand_limits(y = max(eth_tab$percent, na.rm = TRUE) + 8) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "Ethnicity: Enrolled vs All Patients",
x = "",
y = "% within group",
fill = ""
) +
ggplot2::theme(
plot.title = ggplot2::element_text(face = "bold", size = 14),
axis.text.y = ggplot2::element_text(size = 10),
axis.text.x = ggplot2::element_text(size = 10),
legend.position = "top"
)

## ---- hba1c_hist_all_vs_enrolled (DF2) ---------------------------------
# 0) Ensure df_status + Enrolled exist (KEEP your existing rule here)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t0_q_cols <- t0_q_cols[!grepl("Contact|NOTES", t0_q_cols, ignore.case = TRUE)]
df_status <- df %>%
dplyr::mutate(
T0_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t0_q_cols)))),
Enrolled = dplyr::if_else(T0_answer_count >= 3, "Enrolled", "Not enrolled")
)
# 1) Clean HbA1c (baseline column in DF2)
clean_a1c <- function(x) {
x <- trimws(as.character(x))
x[x %in% c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")] <- NA_character_
x[x == "<4.0"] <- "4.0"
x[x == ">14.0"] <- "14.0"
x[x == ">20.0"] <- "20.0"
x <- gsub("%", "", x)
suppressWarnings(as.numeric(x))
}
df_status <- df_status %>%
dplyr::mutate(HbA1c = clean_a1c(`HbA1c Value (Last 3 Months)`))
# 2) Build “All patients” vs “Enrolled” groups (NO “Not enrolled” histogram)
df_long <- df_status %>%
dplyr::filter(!is.na(HbA1c)) %>%
dplyr::mutate(
Hb_bin = floor(HbA1c),
Hb_mid = Hb_bin + 0.5
)
df_hist <- dplyr::bind_rows(
df_long %>% dplyr::mutate(Group = "All patients"),
df_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
dplyr::count(Group, Hb_bin, Hb_mid, name = "n") %>%
dplyr::group_by(Group) %>%
dplyr::mutate(pct = n / sum(n) * 100) %>%
dplyr::ungroup() %>%
dplyr::mutate(
Group = factor(Group, levels = c("All patients", "Enrolled")),
label = ifelse(pct > 0, paste0(round(pct, 1), "%"), "")
)
# 3) Plot (two stacked histograms, same style + y-limits)
ggplot2::ggplot(df_hist, ggplot2::aes(x = Hb_mid, y = pct)) +
ggplot2::geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.35, size = 3.5) +
ggplot2::facet_wrap(~ Group, ncol = 1, scales = "fixed") +
ggplot2::scale_x_continuous(
breaks = seq(min(df_hist$Hb_bin, na.rm = TRUE), max(df_hist$Hb_bin, na.rm = TRUE), by = 1),
minor_breaks = NULL
) +
ggplot2::coord_cartesian(ylim = c(0, 50)) +
ggplot2::scale_y_continuous(labels = function(x) paste0(x, "%")) +
ggplot2::theme_bw() +
ggplot2::labs(
title = "HbA1c Distribution: Enrolled vs All Patients",
x = "HbA1c (%)",
y = "% of patients within group"
) +
ggplot2::theme(
plot.title = ggplot2::element_text(size = 14, face = "bold"),
strip.text = ggplot2::element_text(size = 12, face = "bold"),
axis.text = ggplot2::element_text(size = 11)
)

### Blood sugar check frequency (T0 vs T1)
## 1. Identify the two survey columns
t0_col <- grep(
"^T0- In the past week, how often did you check your blood sugar",
names(df_COHORT1),
value = TRUE
)
t1_col <- grep(
"^T1- In the past week, how often did you check your blood sugar",
names(df_COHORT1),
value = TRUE
)
stopifnot(length(t0_col) == 1, length(t1_col) == 1)
t0_raw <- df_COHORT1[[t0_col]]
t1_raw <- df_COHORT1[[t1_col]]
## 2. Cleaning function: collapse equivalent strings
clean_freq <- function(x) {
x_std <- toupper(trimws(as.character(x)))
dplyr::case_when(
# 1–2 times (all variants)
x_std %in% c("1–2 TIMES", "1-2 TIMES", "1 TO 2 TIMES") ~ "1–2 times",
# 3–4 times
x_std %in% c("3–4 TIMES", "3-4 TIMES") ~ "3–4 times",
# Daily
x_std %in% c("DAILY", "DAILY ", "DAILY.") ~ "Daily",
# More than once daily
x_std %in% c("MORE THAN DAILY",
"MORE THAN ONCE DAILY",
"MORE THAN ONCE DAILY",
"MORE THAN ONCE A DAY") ~ "More than once daily",
# Not at all
x_std %in% c("NOT AT ALL") ~ "Not at all",
# Anything unexpected
TRUE ~ NA_character_
)
}
## 3. Long data with cleaned responses
df_bs <- tibble::tibble(
Timepoint = c(rep("T0", length(t0_raw)), rep("T1", length(t1_raw))),
Response_raw = c(t0_raw, t1_raw)
) %>%
mutate(Response_clean = clean_freq(Response_raw)) %>%
filter(!is.na(Response_clean))
## 4. Summary table for plotting
freq_summary <- df_bs %>%
count(Timepoint, Response_clean) %>%
group_by(Timepoint) %>%
mutate(pct = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
mutate(
Response_clean = factor(
Response_clean,
levels = c("Not at all",
"1–2 times",
"3–4 times",
"Daily",
"More than once daily")
)
)
## 5. Plot: frequency of blood sugar checks at T0 vs T1
ggplot(freq_summary,
aes(x = Response_clean, y = pct, fill = Timepoint)) +
geom_col(position = position_dodge(width = 0.8), width = 0.7) +
geom_text(aes(label = paste0(pct, "%")),
position = position_dodge(width = 0.8),
vjust = -0.3,
size = 3.5) +
theme_bw() +
labs(
title = "How Often Patients Checked Blood Sugar (T0 vs T1)",
x = "Frequency in past week",
y = "% of participants",
fill = "Timepoint"
) +
theme(
axis.text.x = element_text(angle = 30, hjust = 1, size = 10),
plot.title = element_text(size = 14, face = "bold")
)

###blood sugar in range
sugar_levels_T0 <- c("No", "Maybe", "Yes")
sugar_levels_T1 <- c(
"Never",
"Sometimes",
"About half the time",
"Most of the time",
"Always"
)
sugar_T0_raw <- df_COHORT1$`T0- Were your sugar levels usually in your target range last week?`
sugar_T1_raw <- df_COHORT1$`T1- Were your sugar levels usually in your target range last week?`
length(sugar_T0_raw)
## [1] 60
length(sugar_T1_raw)
## [1] 60
clean_T0 <- function(x) {
x_std <- toupper(trimws(x))
case_when(
x_std == "YES" ~ "Yes",
x_std == "NO" ~ "No",
x_std == "MAYBE" ~ "Maybe",
TRUE ~ NA_character_
)
}
clean_T1 <- function(x) {
x_std <- toupper(trimws(x))
case_when(
x_std %in% c("NEVER") ~ "Never",
x_std == "SOMETIMES" ~ "Sometimes",
x_std %in% c("ABOUT HALF THE TIME", "HALF THE TIME") ~ "About half",
x_std %in% c("MOST OF THE TIME", "YES") ~ "Most",
x_std == "ALWAYS" ~ "Always",
TRUE ~ NA_character_
)
}
T0_clean <- clean_T0(sugar_T0_raw)
T1_clean <- clean_T1(sugar_T1_raw)
score_map <- c(
"No" = 0,
"Never" = 0,
"Maybe" = 1,
"Sometimes" = 1,
"About half" = 2,
"Yes" = 3,
"Most" = 3,
"Always" = 4
)
T0_score <- unname(score_map[T0_clean])
T1_score <- unname(score_map[T1_clean])
df_change <- data.frame(
patient_id = seq_len(nrow(df_COHORT1)),
T0_raw = T0_clean,
T1_raw = T1_clean,
T0_score = T0_score,
T1_score = T1_score
) %>%
filter(!is.na(T0_score) & !is.na(T1_score)) %>%
mutate(
delta = T1_score - T0_score,
change = case_when(
delta > 0 ~ "Increased",
delta < 0 ~ "Decreased",
TRUE ~ "No change"
)
)
change_summary <- df_change %>%
count(change) %>%
mutate(percent = round(100 * n / sum(n), 1))
change_summary$change <- factor(
change_summary$change,
levels = c("Decreased", "No change", "Increased")
)
ggplot(change_summary, aes(x = change, y = percent, fill = change)) +
geom_col() +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
scale_fill_manual(
values = c(
"Decreased" = "#d73027",
"No change" = "#4E79A7",
"Increased" = "#1a9850"
)
) +
theme_bw() +
labs(
title = "Change in Blood Sugar in Target Range (T0 → T1)",
x = "",
y = "% of participants"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)

change_grouped_table <- df_change %>%
count(change, T0_raw, T1_raw) %>%
arrange(change, desc(n))
change_grouped_percent <- df_change %>%
count(change, T0_raw, T1_raw) %>%
group_by(change) %>%
mutate(percent = round(100 * n / sum(n), 1)) %>%
ungroup() %>%
arrange(change, desc(percent))
change_grouped_wide <- change_grouped_percent %>%
mutate(label = paste0(n, " (", percent, "%)")) %>%
select(change, T0_raw, T1_raw, label) %>%
tidyr::pivot_wider(
names_from = T1_raw,
values_from = label,
values_fill = ""
)
change_grouped_wide
## # A tibble: 8 × 7
## change T0_raw Sometimes Never Most `About half` Always
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Decreased Yes "2 (66.7%)" "" "" "" ""
## 2 Decreased Maybe "" "1 (33.3%)" "" "" ""
## 3 Increased No "6 (33.3%)" "" "2 (11.1%)" "" "1 (5.6%)"
## 4 Increased Maybe "" "" "5 (27.8%)" "2 (11.1%)" "1 (5.6%)"
## 5 Increased Yes "" "" "" "" "1 (5.6%)"
## 6 No change Yes "" "" "7 (63.6%)" "" ""
## 7 No change Maybe "3 (27.3%)" "" "" "" ""
## 8 No change No "" "1 (9.1%)" "" "" ""
## ---- missed_medication_T0_T1_summary ---------------------------------
## Assumes df_COHORT1 already exists (Cohort 1 only)
t0_col <- "T0- In the past week, did you miss any doses of you diabetes medications?"
t1_col <- "T1- In the past week, did you miss any doses of you diabetes medications?"
stopifnot(t0_col %in% names(df_COHORT1),
t1_col %in% names(df_COHORT1))
clean_yesno <- function(x) {
x <- toupper(trimws(as.character(x)))
dplyr::case_when(
x %in% c("Y","YES") ~ "Yes",
x %in% c("N","NO") ~ "No",
TRUE ~ NA_character_
)
}
df_meds <- df_COHORT1 %>%
transmute(
T0 = clean_yesno(.data[[t0_col]]),
T1 = clean_yesno(.data[[t1_col]])
)
## ------------------------------------------------
## 1) Distribution at T0
## ------------------------------------------------
t0_dist <- df_meds %>%
filter(!is.na(T0)) %>%
count(T0) %>%
mutate(
Denominator = sum(n),
Percent = round(100 * n / Denominator, 1)
)
t0_dist
## # A tibble: 2 × 4
## T0 n Denominator Percent
## <chr> <int> <int> <dbl>
## 1 No 47 60 78.3
## 2 Yes 13 60 21.7
## ------------------------------------------------
## 2) Distribution at T1
## ------------------------------------------------
t1_dist <- df_meds %>%
filter(!is.na(T1)) %>%
count(T1) %>%
mutate(
Denominator = sum(n),
Percent = round(100 * n / Denominator, 1)
)
t1_dist
## # A tibble: 2 × 4
## T1 n Denominator Percent
## <chr> <int> <int> <dbl>
## 1 No 32 38 84.2
## 2 Yes 6 38 15.8
## ------------------------------------------------
## 3) Change from T0 → T1 (only those with BOTH)
## ------------------------------------------------
change_dist <- df_meds %>%
filter(!is.na(T0), !is.na(T1)) %>%
mutate(
Change = case_when(
T0 == "Yes" & T1 == "No" ~ "Improved (missed → did not miss)",
T0 == "No" & T1 == "Yes" ~ "Worsened (did not miss → missed)",
TRUE ~ "No change"
)
) %>%
count(Change) %>%
mutate(
Denominator = sum(n),
Percent = round(100 * n / Denominator, 1)
)
change_dist
## # A tibble: 3 × 4
## Change n Denominator Percent
## <chr> <int> <int> <dbl>
## 1 Improved (missed → did not miss) 7 38 18.4
## 2 No change 27 38 71.1
## 3 Worsened (did not miss → missed) 4 38 10.5
## ---- missed_meds_T0_plot --------------------------------------------
ggplot(t0_dist, aes(x = T0, y = Percent, fill = T0)) +
geom_col(width = 0.7) +
geom_text(aes(label = paste0(Percent, "%")),
vjust = -0.3, size = 4) +
scale_fill_manual(values = c(
"Yes" = "#d73027",
"No" = "#1a9850"
)) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw() +
labs(
title = "Missed Diabetes Medication Doses at T0 (Cohort 1)",
x = "",
y = "% of patients"
) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", size = 14)
)

## ---- missed_meds_T1_plot --------------------------------------------
ggplot(t1_dist, aes(x = T1, y = Percent, fill = T1)) +
geom_col(width = 0.7) +
geom_text(aes(label = paste0(Percent, "%")),
vjust = -0.3, size = 4) +
scale_fill_manual(values = c(
"Yes" = "#d73027",
"No" = "#1a9850"
)) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw() +
labs(
title = "Missed Diabetes Medication Doses at T1 (Cohort 1)",
x = "",
y = "% of patients"
) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", size = 14)
)

## ---- missed_meds_change_plot ----------------------------------------
ggplot(change_dist,
aes(x = Change, y = Percent, fill = Change)) +
geom_col(width = 0.7) +
geom_text(aes(label = paste0(Percent, "%")),
vjust = -0.3, size = 4) +
scale_fill_manual(values = c(
"Improved (missed → did not miss)" = "#1a9850",
"No change" = "#4E79A7",
"Worsened (did not miss → missed)" = "#d73027"
)) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw() +
labs(
title = "Change in Missed Medication Doses from T0 to T1 (Cohort 1)",
x = "",
y = "% of evaluable patients"
) +
theme(
legend.position = "none",
plot.title = element_text(face = "bold", size = 14),
axis.text.x = element_text(size = 11)
)

###Confidence in blood sugar maintenance practices
confidence_levels <- c(
"Not confident",
"Somewhat not confident",
"Maybe",
"Confident",
"Very confident"
)
# T0: Yes / Maybe / No → 1–3
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`
conf_T0_num <- dplyr::case_when(
toupper(trimws(conf_T0_raw)) == "NO" ~ 1,
toupper(trimws(conf_T0_raw)) == "MAYBE" ~ 2,
toupper(trimws(conf_T0_raw)) == "YES" ~ 3,
TRUE ~ NA_real_
)
# T1: 1–4 scale or text variants → 1–4
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`
conf_T1_num <- {
x <- toupper(trimws(as.character(conf_T1_raw)))
dplyr::case_when(
x %in% c("1", "NOT CONFIDENT") ~ 1,
x %in% c("2", "SOMEWHAT NOT CONFIDENT") ~ 2,
x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT") ~ 3,
x %in% c("4", "VERY CONFIDENT") ~ 4,
TRUE ~ NA_real_
)
}
# Combine & classify change
df_conf_change <- data.frame(
conf_T0_num = conf_T0_num,
conf_T1_num = conf_T1_num
) %>%
dplyr::filter(!is.na(conf_T0_num), !is.na(conf_T1_num)) %>%
dplyr::mutate(
change = conf_T1_num - conf_T0_num,
ChangeCategory = dplyr::case_when(
change > 0 ~ "Increased confidence",
change == 0 ~ "No change",
change < 0 ~ "Decreased confidence"
)
)
change_summary <- df_conf_change %>%
dplyr::count(ChangeCategory) %>%
dplyr::mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(n, " (", percent, "%)")
)
ggplot(change_summary, aes(x = ChangeCategory, y = n)) +
geom_col(fill = "#2E86C1", width = 0.6) +
geom_text(aes(label = label),
vjust = -0.3, size = 3.8) +
scale_y_continuous(
expand = expansion(mult = c(0, 0.1))
) +
theme_bw() +
labs(
title = "Change in Confidence Making Food Choices (T0 → T1)",
x = "",
y = "Number of participants"
) +
theme(
axis.text.x = element_text(size = 11),
axis.title.y = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

# ---- T0 confidence (Yes / Maybe / No) ----
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`
conf_T0_label <- dplyr::case_when(
toupper(trimws(conf_T0_raw)) == "NO" ~ "No",
toupper(trimws(conf_T0_raw)) == "MAYBE" ~ "Maybe",
toupper(trimws(conf_T0_raw)) == "YES" ~ "Yes",
TRUE ~ NA_character_
)
# ---- T1 confidence (1–4 or text variants) ----
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`
conf_T1_label <- {
x <- toupper(trimws(as.character(conf_T1_raw)))
dplyr::case_when(
x %in% c("1", "NOT CONFIDENT") ~ "Not confident",
x %in% c("2", "SOMEWHAT NOT CONFIDENT") ~ "Somewhat not confident",
x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT") ~ "Confident",
x %in% c("4", "VERY CONFIDENT") ~ "Very confident",
TRUE ~ NA_character_
)
}
conf_transition <- data.frame(
T0 = conf_T0_label,
T1 = conf_T1_label,
stringsAsFactors = FALSE
) %>%
dplyr::filter(!is.na(T0), !is.na(T1))
conf_transition_table <- conf_transition %>%
dplyr::count(T0, T1, name = "n") %>%
dplyr::arrange(desc(n))
conf_transition_table
## T0 T1 n
## 1 Yes Very confident 19
## 2 Yes Confident 5
## 3 Maybe Confident 4
## 4 No Very confident 2
## 5 Yes Somewhat not confident 2
## 6 Maybe Somewhat not confident 1
## 7 Maybe Very confident 1
## 8 No Confident 1
## 9 No Somewhat not confident 1
## 10 Yes Not confident 1
## Appointments T0 vs T1 – enrolled pts (>= 3 T0 answers, >=1 T1 answer)
# 1. Identify T0 / T1 question columns ------------------------------
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)
# 2. Define analysis cohort locally (DOES NOT overwrite df_status) --
df_appt_base <- df %>%
mutate(
T0_answer_count = rowSums(!is.na(dplyr::select(., all_of(t0_q_cols)))),
T1_answer_count = rowSums(!is.na(dplyr::select(., all_of(t1_q_cols))))
) %>%
filter(
T0_answer_count >= 3, # your enrollment rule
T1_answer_count > 0 # must have answered something at T1
)
# 3. Pull raw appointment columns from this filtered cohort ---------
appt_T0_raw <- df_appt_base$`T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?`
appt_T1_raw <- df_appt_base$`T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?`
# 4. Clean Yes/No responses -----------------------------------------
clean_yesno <- function(x) {
x_std <- toupper(trimws(as.character(x)))
dplyr::case_when(
x_std %in% c("YES", "Y", "YS") ~ "Yes",
x_std %in% c("NO", "N") ~ "No",
TRUE ~ NA_character_
)
}
appt_T0_clean <- clean_yesno(appt_T0_raw)
appt_T1_clean <- clean_yesno(appt_T1_raw)
# 5. Build paired change dataset (complete appointment data only) ---
df_appt_change <- tibble(
patient_id = seq_len(nrow(df_appt_base)),
T0_raw = appt_T0_clean,
T1_raw = appt_T1_clean
) %>%
filter(!is.na(T0_raw), !is.na(T1_raw)) # drops any remaining NA pairs
# 6. Score Yes/No and classify change -------------------------------
score_map <- c("No" = 0, "Yes" = 1)
df_appt_change <- df_appt_change %>%
mutate(
T0_score = unname(score_map[T0_raw]),
T1_score = unname(score_map[T1_raw]),
delta = T1_score - T0_score,
change = dplyr::case_when(
delta > 0 ~ "Increased", # No -> Yes
delta < 0 ~ "Decreased", # Yes -> No
TRUE ~ "No change" # Yes->Yes or No->No
)
)
# 7. Summary for bar chart ------------------------------------------
change_summary <- df_appt_change %>%
count(change) %>%
mutate(
percent = round(100 * n / sum(n), 1),
change = factor(change, levels = c("Decreased", "No change", "Increased"))
)
# 8. (Optional) transition table object (not printed automatically) -
transition_table_wide <- df_appt_change %>%
count(change, T0_raw, T1_raw) %>%
group_by(change) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(n, " (", percent, "%)")
) %>%
ungroup() %>%
select(change, T0_raw, T1_raw, label) %>%
pivot_wider(
names_from = T1_raw,
values_from = label,
values_fill = ""
)
# View(transition_table_wide) if you want to inspect it
# 9. Pie-chart data (same cohort as change plot) --------------------
appt_pie_df <- df_appt_change %>%
select(T0_raw, T1_raw) %>%
mutate(row = row_number()) %>%
pivot_longer(cols = c(T0_raw, T1_raw),
names_to = "Timepoint",
values_to = "Response") %>%
mutate(
Timepoint = if_else(Timepoint == "T0_raw", "T0", "T1"),
Response = factor(Response, levels = c("Yes", "No"))
)
appt_pie_summary <- appt_pie_df %>%
count(Timepoint, Response) %>%
group_by(Timepoint) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%\n(n=", n, ")")
) %>%
ungroup()
# 10. Plots: bar of change + two pies -------------------------------
p_change <- ggplot(change_summary,
aes(x = change, y = percent, fill = change)) +
geom_col() +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
scale_fill_manual(
values = c(
"Decreased" = "#d73027",
"No change" = "#4E79A7",
"Increased" = "#1a9850"
)
) +
scale_y_continuous(
limits = c(0, 80),
breaks = seq(0, 80, by = 10),
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = "Change in Having a Diabetes-Related Appointment (T0 → T1)",
x = "",
y = "% of participants"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)
p_pies <- ggplot(appt_pie_summary,
aes(x = "", y = percent, fill = Response)) +
geom_col(width = 1, color = "white") +
coord_polar(theta = "y") +
geom_text(
aes(label = label),
position = position_stack(vjust = 0.5),
size = 3.8
) +
facet_wrap(~ Timepoint) +
scale_fill_manual(values = c("Yes" = "#1a9850", "No" = "#d73027")) +
theme_void() +
labs(
title = "Appointments with Diabetes Provider at T0 and T1",
fill = "Response"
) +
theme(
plot.title = element_text(size = 14, face = "bold"),
strip.text = element_text(size = 12, face = "bold")
)
p_change / p_pies # <- this is the only thing that prints

## ---- impact_meals_made ---------------------------------------------
meals_raw <- df_COHORT1$`T1- In the past week, how many meals did you make using food from the program?`
clean_meals <- function(x) {
x_std <- toupper(trimws(as.character(x)))
dplyr::case_when(
x_std %in% c("0") ~ "0",
x_std %in% c("1","1 MEAL") ~ "1",
x_std %in% c("2 TO 3","2-3","2–3","2 TO3","2 TO 3") ~ "2–3",
x_std %in% c("4 TO 6","4-6","4–6","4 TO 6") ~ "4–6",
str_detect(x_std,"7") ~ "7+",
TRUE ~ NA_character_
)
}
meals_clean <- clean_meals(meals_raw)
meals_summary <- tibble::tibble(meals = meals_clean) %>%
filter(!is.na(meals)) %>%
count(meals) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%")
)
ggplot(meals_summary, aes(x = meals, y = percent)) +
geom_col(fill = "#1F4E79") +
geom_text(aes(label = label), vjust = -0.3, size = 3.8) +
theme_bw() +
labs(
title = "Meals prepared using program food (past week)",
x = "Number of meals",
y = "% of participants"
)

## ---- impact_ease_of_use --------------------------------------------
## ---- impact_ease_of_use_final_fix ----------------------------------
ease_raw <- df_COHORT1$`T1- In the past week, how easy was it to use the food in your meals?`
ease_clean <- dplyr::case_when(
trimws(ease_raw) == "1" ~ "Very easy",
TRUE ~ str_to_sentence(trimws(ease_raw))
)
ease_summary <- tibble(response = ease_clean) %>%
filter(!is.na(response)) %>%
count(response) %>%
mutate(
percent = round(100 * n / sum(n), 1),
response = factor(
response,
levels = c("Neutral", "Easy", "Very easy"),
ordered = TRUE
)
)
ggplot(ease_summary, aes(x = response, y = percent)) +
geom_col(fill = "#2E86C1") +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
theme_bw() +
labs(
title = "Ease of Using Program Food in Meals (T1)",
x = "",
y = "% of participants"
) +
theme(
axis.text.x = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
)

## -------------------------------
## FOOD WASTE (T1 ONLY – IMPACT)
## -------------------------------
# 1. Identify the T1 waste column explicitly
waste_col <- grep(
"^T1-.*throw away any of the food",
names(df_COHORT1),
value = TRUE
)
# sanity check (must be length 1)
stopifnot(length(waste_col) == 1)
# 2. Pull raw responses
waste_raw <- df_COHORT1[[waste_col]]
# 3. Clean Yes / No
waste_clean <- case_when(
toupper(trimws(waste_raw)) %in% c("YES", "Y") ~ "Yes",
toupper(trimws(waste_raw)) %in% c("NO", "N") ~ "No",
TRUE ~ NA_character_
)
# 4. Build summary table
waste_summary <- tibble(Response = waste_clean) %>%
filter(!is.na(Response)) %>%
count(Response) %>%
mutate(
percent = round(100 * n / sum(n), 1),
label = paste0(percent, "%\n(n=", n, ")")
)
# 5. Plot (simple, low-risk)
ggplot(waste_summary, aes(x = Response, y = percent, fill = Response)) +
geom_col(width = 0.6) +
geom_text(aes(label = label), vjust = -0.3, size = 4) +
scale_fill_manual(
values = c("No" = "#1a9850", "Yes" = "#d73027")
) +
scale_y_continuous(
limits = c(0, 100),
labels = function(x) paste0(x, "%")
) +
theme_bw() +
labs(
title = "Did Patients Throw Away Delivered Food? (T1)",
x = "",
y = "% of participants"
) +
theme(
legend.position = "none",
plot.title = element_text(size = 14, face = "bold")
)

## ------------------------------
## Program impact on feeling in control of diabetes (T1)
# 1. Pull the T1 column
control_col <- grep(
"^T1-How much did this program help you feel more in control of your diabetes",
names(df_COHORT1),
value = TRUE
)
control_raw <- df_COHORT1[[control_col]]
# 2. Clean / recode responses
control_clean <- toupper(trimws(as.character(control_raw)))
control_clean <- dplyr::case_when(
control_clean %in% c("NOT AT ALL", "0") ~ "Not at all",
control_clean %in% c("A LITTLE", "A LITTLE ", "1") ~ "A little",
control_clean %in% c("SOMEWHAT", "2") ~ "Somewhat",
control_clean %in% c("A LOT", "A LOT ", "3") ~ "A lot",
TRUE ~ NA_character_
)
# 3. Summarize % of participants
control_summary <- tibble::tibble(response = control_clean) %>%
dplyr::filter(!is.na(response)) %>%
dplyr::count(response) %>%
dplyr::mutate(
percent = round(100 * n / sum(n), 1),
response = factor(response,
levels = c("Not at all", "A little", "Somewhat", "A lot"))
)
# 4. Plot
ggplot(control_summary, aes(x = response, y = percent)) +
geom_col(fill = "#4E79A7", width = 0.7) +
geom_text(
aes(label = paste0(percent, "%")),
vjust = -0.3,
size = 4
) +
theme_bw() +
labs(
title = "Program Impact on Feeling in Control of Diabetes (T1)",
x = "",
y = "% of participants"
) +
coord_cartesian(ylim = c(0, 100)) +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.text.x = element_text(size = 11, angle = 30, hjust = 1),
axis.text.y = element_text(size = 11)
)

# Count patients by cohort
df %>%
count(Cohort) %>%
arrange(desc(n))
## # A tibble: 3 × 2
## Cohort n
## <chr> <int>
## 1 1 489
## 2 2 220
## 3 3 55
clean_yesno <- function(x) {
x <- toupper(trimws(as.character(x)))
dplyr::case_when(
x %in% c("Y","YES","TRUE","1") ~ "Enrolled",
x %in% c("N","NO","FALSE","0") ~ "Not enrolled",
TRUE ~ NA_character_
)
}
clean_cohort <- function(x) {
x <- toupper(trimws(as.character(x)))
dplyr::case_when(
x %in% c("1","COHORT 1","COHORT1","C1") ~ "Cohort 1",
x %in% c("2","COHORT 2","COHORT2","C2") ~ "Cohort 2",
TRUE ~ NA_character_
)
}
df_status <- df %>%
mutate(
Enrolled = clean_yesno(.data[["Enrolled"]]),
Cohort_clean = clean_cohort(.data[["Cohort"]]),
Cohort_final = dplyr::case_when(
Enrolled == "Enrolled" & Cohort_clean %in% c("Cohort 1","Cohort 2") ~ Cohort_clean,
TRUE ~ "Not enrolled"
)
)
df_status %>%
count(Enrolled, Cohort_final)
## # A tibble: 4 × 3
## Enrolled Cohort_final n
## <chr> <chr> <int>
## 1 Enrolled Cohort 1 60
## 2 Enrolled Cohort 2 41
## 3 Not enrolled Not enrolled 610
## 4 <NA> Not enrolled 53
## ---- outside_delivery_range_map_and_counts, echo=TRUE -----------------------
library(dplyr)
library(stringr)
library(ggplot2)
# ---- explicit column names ----
contact_col <- "T0 Contact"
zip_col <- grep("zip", names(df), value = TRUE, ignore.case = TRUE)[1]
stopifnot(contact_col %in% names(df), !is.na(zip_col))
# ---- build patient-level base table ----
df_outside <- df_status %>%
mutate(
patient_id = row_number(),
contact_std = str_to_lower(trimws(as.character(.data[[contact_col]]))),
outside_range = str_detect(contact_std, "outside") & str_detect(contact_std, "range"),
zip5 = str_extract(as.character(.data[[zip_col]]), "\\b\\d{5}\\b"),
Enrolled = if_else(Enrolled == "Enrolled", "Enrolled", "Not enrolled")
) %>%
filter(outside_range, !is.na(zip5))
# ---- REQUIRED COUNTS (what you asked) ----
n_patients_outside_all <- df_outside %>% distinct(patient_id) %>% nrow()
n_unique_zips_outside_all <- df_outside %>% distinct(zip5) %>% nrow()
n_patients_outside_enrolled <- df_outside %>% filter(Enrolled == "Enrolled") %>% distinct(patient_id) %>% nrow()
n_unique_zips_outside_enrolled <- df_outside %>% filter(Enrolled == "Enrolled") %>% distinct(zip5) %>% nrow()
outside_counts_summary <- tibble::tibble(
Group = c("All patients", "Enrolled"),
Patients_flagged_outside_range = c(n_patients_outside_all, n_patients_outside_enrolled),
Unique_ZIPs_flagged_outside_range = c(n_unique_zips_outside_all, n_unique_zips_outside_enrolled)
)
outside_counts_summary
## # A tibble: 2 × 3
## Group Patients_flagged_outside_range Unique_ZIPs_flagged_outside_range
## <chr> <int> <int>
## 1 All patients 79 54
## 2 Enrolled 0 0
# ---- ZIP count table (most useful + easiest to read) ----
zip_counts_all <- df_outside %>%
count(zip5, name = "Patients") %>%
arrange(desc(Patients))
zip_counts_enrolled <- df_outside %>%
filter(Enrolled == "Enrolled") %>%
count(zip5, name = "Patients") %>%
arrange(desc(Patients))
# Print top zips (adjust n as needed)
head(zip_counts_all, 25)
## # A tibble: 25 × 2
## zip5 Patients
## <chr> <int>
## 1 10466 7
## 2 10566 4
## 3 10458 3
## 4 10465 3
## 5 10805 3
## 6 12533 3
## 7 10459 2
## 8 10462 2
## 9 10520 2
## 10 10541 2
## # ℹ 15 more rows
head(zip_counts_enrolled, 25)
## # A tibble: 0 × 2
## # ℹ 2 variables: zip5 <chr>, Patients <int>
# ---- Bar chart: Top ZIPs flagged Outside-of-Range (All patients) ----
top_n <- 20
zip_plot_df <- zip_counts_all %>%
slice_head(n = top_n) %>%
mutate(zip5 = factor(zip5, levels = rev(zip5)))
ggplot(zip_plot_df, aes(x = zip5, y = Patients)) +
geom_col(fill = "#1F4E79") +
geom_text(aes(label = Patients), hjust = -0.15, size = 3.5) +
coord_flip(clip = "off") +
expand_limits(y = max(zip_plot_df$Patients, na.rm = TRUE) * 1.15) +
theme_bw() +
labs(
title = "Outside of Delivery Range (T0 Contact) — Top ZIP Codes",
subtitle = paste0(
"All patients flagged: ", n_patients_outside_all,
" • Unique ZIPs: ", n_unique_zips_outside_all,
" • (Enrolled flagged: ", n_patients_outside_enrolled, ")"
),
x = "ZIP code",
y = "Patients flagged"
) +
theme(
plot.title = element_text(face = "bold", size = 14)
)

# ---- OPTIONAL: Put the old “state outline” map back IF maps is available ----
# This uses state outlines as a basemap and plots ZIP centroids on top.
# If 'maps' is not installed, it will fall back to points-only (still usable).
if (requireNamespace("zipcodeR", quietly = TRUE) && requireNamespace("sf", quietly = TRUE)) {
library(sf)
# attach ZIP centroids
zip_pts <- zip_counts_all %>%
left_join(
zipcodeR::zip_code_db %>% select(zipcode, lat, lng),
by = c("zip5" = "zipcode")
) %>%
filter(!is.na(lat), !is.na(lng))
zip_sf <- st_as_sf(zip_pts, coords = c("lng", "lat"), crs = 4326)
# tri-state-ish bounding box around your points (buffered)
xs <- st_coordinates(zip_sf)[,1]
ys <- st_coordinates(zip_sf)[,2]
xlim <- range(xs) + c(-0.5, 0.5)
ylim <- range(ys) + c(-0.5, 0.5)
if (requireNamespace("maps", quietly = TRUE)) {
states_map <- maps::map("state", fill = TRUE, plot = FALSE)
states_sf <- st_as_sf(states_map)
states_tri <- states_sf %>%
filter(ID %in% c("new york", "new jersey", "connecticut"))
ggplot() +
geom_sf(data = states_tri, fill = "grey95", color = "grey50", linewidth = 0.4) +
geom_sf(data = zip_sf, aes(size = Patients), color = "#1F4E79", alpha = 0.75) +
scale_size(range = c(2, 12)) +
coord_sf(xlim = xlim, ylim = ylim) +
theme_minimal() +
labs(
title = "Patients Outside of Delivery Range (T0 Contact)",
subtitle = paste0(
"All patients flagged: ", n_patients_outside_all,
" • Unique ZIPs: ", n_unique_zips_outside_all
),
size = "Patients"
) +
theme(plot.title = element_text(face = "bold", size = 14),
legend.position = "right")
} else {
# fallback: points only (no state outline)
ggplot() +
geom_sf(data = zip_sf, aes(size = Patients), color = "#1F4E79", alpha = 0.75) +
scale_size(range = c(2, 12)) +
coord_sf(xlim = xlim, ylim = ylim) +
theme_minimal() +
labs(
title = "Patients Outside of Delivery Range (T0 Contact) — ZIP centroids",
subtitle = paste0(
"All patients flagged: ", n_patients_outside_all,
" • Unique ZIPs: ", n_unique_zips_outside_all,
" • (Install 'maps' to add state outlines)"
),
size = "Patients"
) +
theme(plot.title = element_text(face = "bold", size = 14),
legend.position = "right")
}
} else {
message("To plot ZIP centroids, install packages: zipcodeR and sf.")
}

###wakefield bronx, peekskill, fordham bronx, throggs neck
df_zip_base <- df_status %>%
mutate(
zip5 = str_extract(as.character(.data[[zip_col]]), "\\b\\d{5}\\b"),
contact_std = str_to_lower(trimws(as.character(`T0 Contact`))),
outside_range = str_detect(contact_std, "outside")
) %>%
filter(!is.na(zip5), outside_range)
zip_table <- bind_rows(
df_zip_base %>%
mutate(Group = "All patients"),
df_zip_base %>%
filter(Enrolled == "Enrolled") %>%
mutate(Group = "Enrolled")
) %>%
count(Group, zip5, name = "n_outside") %>%
arrange(Group, desc(n_outside))
zip_table
## # A tibble: 54 × 3
## Group zip5 n_outside
## <chr> <chr> <int>
## 1 All patients 10466 7
## 2 All patients 10566 4
## 3 All patients 10458 3
## 4 All patients 10465 3
## 5 All patients 10805 3
## 6 All patients 12533 3
## 7 All patients 10459 2
## 8 All patients 10462 2
## 9 All patients 10520 2
## 10 All patients 10541 2
## # ℹ 44 more rows