library(readxl)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(patchwork)
library(tibble)
library(zipcodeR)
library(sf)
## Warning: package 'sf' was built under R version 4.5.2
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
library(maps)
df <- read_excel("DF2.xlsx", col_types = "text")

# Standardize NA-like codes across T0/T1/T2 survey columns
survey_cols <- grep("^T[012]-", names(df), value = TRUE)
na_codes <- c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")

df <- df %>%
  mutate(
    across(all_of(survey_cols), ~{
      x <- trimws(as.character(.x))
      x[x %in% na_codes] <- NA_character_
      x
    })
  )

# ---- 1) Identify the ENROLLMENT column in DF2 (no guessing) ----
enroll_candidates <- names(df)[grepl("enroll", names(df), ignore.case = TRUE)]
print(enroll_candidates)
## [1] "Pre Enrollment Notes"                         
## [2] "(Screen for Enrollment) Interested in Program"
## [3] "Enrolled"
stopifnot(length(enroll_candidates) >= 1)

# Pick the first candidate by default (edit this ONE line if needed)
ENROLL_COL <- enroll_candidates[3]
cat("Enrolled:", ENROLL_COL, "\n")
## Enrolled: Enrolled
# Quick audit of values in enrollment column
print(sort(unique(na.omit(trimws(df[[ENROLL_COL]])))))
## [1] "N" "y" "Y"
# ---- 2) Identify Cohort column (must exist) ----
stopifnot("Cohort" %in% names(df))

# ---- 3) Build df_status with correct rules: Enrolled wins ----
df_status <- df %>%
  mutate(
    Enrolled_flag = case_when(
      toupper(trimws(.data[[ENROLL_COL]])) %in% c("YES","Y","ENROLLED","TRUE","T","1") ~ TRUE,
      toupper(trimws(.data[[ENROLL_COL]])) %in% c("NO","N","NOT ENROLLED","FALSE","F","0") ~ FALSE,
      TRUE ~ FALSE  # treat blanks/unknown as NOT enrolled for safety
    ),
    Cohort_clean = case_when(
      str_to_lower(trimws(as.character(Cohort))) %in% c("1","cohort 1","cohort1","c1") ~ "Cohort 1",
      str_to_lower(trimws(as.character(Cohort))) %in% c("2","cohort 2","cohort2","c2") ~ "Cohort 2",
      TRUE ~ NA_character_
    ),
    Cohort_final = case_when(
      Enrolled_flag & Cohort_clean %in% c("Cohort 1","Cohort 2") ~ Cohort_clean,
      TRUE ~ "Not enrolled"
    ),
    Enrolled = if_else(Cohort_final %in% c("Cohort 1","Cohort 2"), "Enrolled", "Not enrolled")
  )
df_status <- df_status %>%
  dplyr::mutate(
    Enrolled = dplyr::if_else(Enrolled == "Enrolled", "Enrolled", "Not enrolled")
  )

df_COHORT1  <- df_status %>% filter(Cohort_final == "Cohort 1")
df_COHORT2  <- df_status %>% filter(Cohort_final == "Cohort 2")
df_enrolled <- df_status %>% filter(Enrolled == "Enrolled")
t1_q_cols <- grep("^T1-", names(df_status), value = TRUE)
t1_q_cols <- t1_q_cols[!grepl("Contact|NOTES", t1_q_cols, ignore.case = TRUE)]

df_COHORT1 <- df_COHORT1 %>%
  mutate(
    T1_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t1_q_cols))))
  )

n_c1_total <- nrow(df_COHORT1)
n_c1_t1_gt2 <- sum(df_COHORT1$T1_answer_count > 2, na.rm = TRUE)
pct_c1_t1_gt2 <- ifelse(n_c1_total > 0, round(100 * n_c1_t1_gt2 / n_c1_total, 1), NA_real_)

n_c1_attrition <- n_c1_total - n_c1_t1_gt2
pct_c1_attrition <- ifelse(n_c1_total > 0, round(100 * n_c1_attrition / n_c1_total, 1), NA_real_)

# ---- 4) PDSA summary table (counts only; targets included) ----
TARGET_COHORT1 <- 60
TARGET_COHORT2 <- 60

n_total    <- nrow(df_status)
n_c1       <- nrow(df_COHORT1)
n_c2       <- nrow(df_COHORT2)
n_enrolled <- nrow(df_enrolled)
n_not_enrolled <- sum(df_status$Enrolled == "Not enrolled")

pdsa_summary <- tibble(
  Metric = c(
    "Total people in dataset",
    "Cohort 1 — actual (ENROLLED==Yes AND Cohort==1)",
    "Cohort 1 — target",
    "Cohort 1 — shortfall",
    "Cohort 2 — actual (ENROLLED==Yes AND Cohort==2)",
    "Cohort 2 — target",
    "Cohort 2 — shortfall",
    "Enrolled total (Cohort 1 + Cohort 2)",
    "Not enrolled total (everyone else)"
  ),
  Count = c(
    n_total,
    n_c1,
    TARGET_COHORT1,
    TARGET_COHORT1 - n_c1,
    n_c2,
    TARGET_COHORT2,
    TARGET_COHORT2 - n_c2,
    n_enrolled,
    n_not_enrolled
  )
)

pdsa_summary <- pdsa_summary %>%
  dplyr::bind_rows(
    tibble::tibble(
      Metric = c(
        "Cohort 1 with >2 T1 questions answered",
        "Cohort 1 attrition (<=2 T1 questions answered)"
      ),
      Count = c(n_c1_t1_gt2, n_c1_attrition),
      Denominator = c(n_c1_total, n_c1_total),
      Percent = c(pct_c1_t1_gt2, pct_c1_attrition)
    )
  )


pdsa_summary
## # A tibble: 11 × 4
##    Metric                                          Count Denominator Percent
##    <chr>                                           <dbl>       <int>   <dbl>
##  1 Total people in dataset                           764          NA    NA  
##  2 Cohort 1 — actual (ENROLLED==Yes AND Cohort==1)    60          NA    NA  
##  3 Cohort 1 — target                                  60          NA    NA  
##  4 Cohort 1 — shortfall                                0          NA    NA  
##  5 Cohort 2 — actual (ENROLLED==Yes AND Cohort==2)    41          NA    NA  
##  6 Cohort 2 — target                                  60          NA    NA  
##  7 Cohort 2 — shortfall                               19          NA    NA  
##  8 Enrolled total (Cohort 1 + Cohort 2)              101          NA    NA  
##  9 Not enrolled total (everyone else)                663          NA    NA  
## 10 Cohort 1 with >2 T1 questions answered             38          60    63.3
## 11 Cohort 1 attrition (<=2 T1 questions answered)     22          60    36.7
# Compare Enrolled vs All patients 

# 1) Build long race rows (one row per race code per patient)
race_long <- df_status %>%
  dplyr::select(Enrolled, Race) %>%
  dplyr::filter(!is.na(Race), trimws(Race) != "") %>%
  dplyr::mutate(Race = stringr::str_split(as.character(Race), "\\r\\n")) %>%
  tidyr::unnest(Race) %>%
  dplyr::mutate(Race = stringr::str_trim(Race)) %>%
  dplyr::filter(stringr::str_detect(Race, "^R")) %>%
  dplyr::mutate(
    Race_clean = dplyr::case_when(
      stringr::str_detect(Race, "R1") ~ "American Indian / Alaska Native",
      stringr::str_detect(Race, "R2") ~ "Asian",
      stringr::str_detect(Race, "R3") ~ "Black or African American",
      stringr::str_detect(Race, "R4") ~ "Native Hawaiian / Pacific Islander",
      stringr::str_detect(Race, "R5") ~ "White",
      stringr::str_detect(Race, "R9") ~ "Other",
      TRUE ~ NA_character_
    )
  ) %>%
  dplyr::filter(!is.na(Race_clean))

# 2) Duplicate into two comparison groups: All patients vs Enrolled
race_grouped <- dplyr::bind_rows(
  race_long %>% dplyr::mutate(Group = "All patients"),
  race_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
  dplyr::filter(Group %in% c("All patients", "Enrolled")) %>%
  dplyr::mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))

# 3) Tabulate within-group percentages
race_tab <- race_grouped %>%
  dplyr::count(Group, Race_clean, name = "n") %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(percent = round(100 * n / sum(n), 1)) %>%
  dplyr::ungroup()

# 4)
race_tab
## # A tibble: 10 × 4
##    Group        Race_clean                             n percent
##    <fct>        <chr>                              <int>   <dbl>
##  1 All patients American Indian / Alaska Native        4     0.5
##  2 All patients Asian                                 20     2.7
##  3 All patients Black or African American            206    27.8
##  4 All patients Native Hawaiian / Pacific Islander     2     0.3
##  5 All patients Other                                193    26.1
##  6 All patients White                                315    42.6
##  7 Enrolled     American Indian / Alaska Native        1     1  
##  8 Enrolled     Black or African American             40    40  
##  9 Enrolled     Other                                 37    37  
## 10 Enrolled     White                                 22    22
# Order races by max % in either group (keeps visual comparison intuitive)
race_order <- race_tab %>%
  dplyr::group_by(Race_clean) %>%
  dplyr::summarise(max_pct = max(percent), .groups = "drop") %>%
  dplyr::arrange(max_pct) %>%
  dplyr::pull(Race_clean)

race_tab <- race_tab %>%
  dplyr::mutate(
    Race_clean = factor(Race_clean, levels = race_order)
  )

# Plot: Enrolled vs All patients
ggplot2::ggplot(
  race_tab,
  ggplot2::aes(x = Race_clean, y = percent, fill = Group)
) +
  ggplot2::geom_col(
    position = ggplot2::position_dodge(width = 0.8),
    width = 0.7
  ) +
  ggplot2::geom_text(
    ggplot2::aes(label = paste0(percent, "%")),
    position = ggplot2::position_dodge(width = 0.8),
    hjust = -0.1,
    size = 3.6
  ) +
  ggplot2::coord_flip(clip = "off") +
  ggplot2::scale_x_discrete(limits = rev(levels(race_tab$Race_clean))) +
  ggplot2::scale_fill_manual(
    values = c(
      "All patients" = "#CFE1F2",
      "Enrolled"     = "#1F4E79"
    )
  ) +
  ggplot2::expand_limits(y = max(race_tab$percent, na.rm = TRUE) + 8) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Race Distribution: Enrolled vs All Patients",
    x = "Race",
    y = "% within group",
    fill = ""
  ) +
  ggplot2::theme(
    plot.title  = ggplot2::element_text(face = "bold", size = 14),
    axis.text.y = ggplot2::element_text(size = 10),
    axis.text.x = ggplot2::element_text(size = 10),
    legend.position = "top"
  )

### SEX ANALYSIS — Enrolled vs All patients

sex_long <- df_status %>%
  dplyr::transmute(
    Enrolled,
    Sex = trimws(as.character(`Legal Sex`))
  ) %>%
  dplyr::filter(!is.na(Sex), Sex != "")

sex_tab <- dplyr::bind_rows(
  sex_long %>% dplyr::mutate(Group = "All patients"),
  sex_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
  dplyr::count(Group, Sex, name = "n") %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(percent = round(100 * n / sum(n), 1)) %>%
  dplyr::ungroup()

sex_order <- sex_tab %>%
  dplyr::group_by(Sex) %>%
  dplyr::summarise(max_pct = max(percent), .groups = "drop") %>%
  dplyr::arrange(max_pct) %>%
  dplyr::pull(Sex)

sex_tab <- sex_tab %>%
  dplyr::mutate(
    Sex   = factor(Sex, levels = sex_order),
    Group = factor(Group, levels = c("All patients", "Enrolled"))
  )

ggplot2::ggplot(sex_tab, ggplot2::aes(x = Sex, y = percent, fill = Group)) +
  ggplot2::geom_col(position = ggplot2::position_dodge(width = 0.8), width = 0.7) +
  ggplot2::geom_text(
    ggplot2::aes(label = paste0(percent, "%")),
    position = ggplot2::position_dodge(width = 0.8),
    hjust = -0.1,
    size = 3.6,
    color = "black"
  ) +
  ggplot2::coord_flip(clip = "off") +
  ggplot2::scale_x_discrete(limits = rev(levels(sex_tab$Sex))) +
  ggplot2::scale_fill_manual(values = c("All patients" = "#CFE1F2", "Enrolled" = "#1F4E79")) +
  ggplot2::expand_limits(y = max(sex_tab$percent, na.rm = TRUE) + 8) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Legal Sex: Enrolled vs All Patients",
    x = "",
    y = "% within group",
    fill = ""
  ) +
  ggplot2::theme(
    plot.title  = ggplot2::element_text(face = "bold", size = 14),
    axis.text.y = ggplot2::element_text(size = 10),
    axis.text.x = ggplot2::element_text(size = 10),
    legend.position = "top"
  )

### ETHNICITY — Enrolled vs All patients

# 1) Clean ethnicity (DO NOT overwrite df_status; just create a temp view)
eth_long <- df_status %>%
  dplyr::transmute(
    Enrolled,
    Ethnicity_raw = trimws(as.character(Ethnicity)),
    Ethnicity_clean = dplyr::case_when(
      grepl("E1", Ethnicity_raw) ~ "Spanish/Hispanic/Latino",
      grepl("E2", Ethnicity_raw) ~ "Not Spanish/Hispanic/Latino",
      grepl("S1", Ethnicity_raw) ~ "Patient declined",
      grepl("S2", Ethnicity_raw) ~ "Patient unavailable",
      TRUE                      ~ NA_character_
    )
  ) %>%
  dplyr::filter(!is.na(Ethnicity_clean), Ethnicity_clean != "")

# 2) Build comparison groups: All vs Enrolled
eth_tab <- dplyr::bind_rows(
  eth_long %>% dplyr::mutate(Group = "All patients"),
  eth_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
  dplyr::count(Group, Ethnicity_clean, name = "n") %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(percent = round(100 * n / sum(n), 1)) %>%
  dplyr::ungroup()

# 3) Order categories by max % across groups (optional)
eth_order <- eth_tab %>%
  dplyr::group_by(Ethnicity_clean) %>%
  dplyr::summarise(max_pct = max(percent), .groups = "drop") %>%
  dplyr::arrange(max_pct) %>%
  dplyr::pull(Ethnicity_clean)

eth_tab <- eth_tab %>%
  dplyr::mutate(
    Ethnicity_clean = factor(Ethnicity_clean, levels = eth_order),
    Group = factor(Group, levels = c("All patients", "Enrolled"))
  )

# 4) Plot
ggplot2::ggplot(eth_tab, ggplot2::aes(x = Ethnicity_clean, y = percent, fill = Group)) +
  ggplot2::geom_col(position = ggplot2::position_dodge(width = 0.8), width = 0.7) +
  ggplot2::geom_text(
    ggplot2::aes(label = paste0(percent, "%")),
    position = ggplot2::position_dodge(width = 0.8),
    hjust = -0.1,
    size = 3.6,
    color = "black"
  ) +
  ggplot2::coord_flip(clip = "off") +
  ggplot2::scale_x_discrete(limits = rev(levels(eth_tab$Ethnicity_clean))) +
  ggplot2::scale_fill_manual(values = c("All patients" = "#CFE1F2", "Enrolled" = "#1F4E79")) +
  ggplot2::expand_limits(y = max(eth_tab$percent, na.rm = TRUE) + 8) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "Ethnicity: Enrolled vs All Patients",
    x = "",
    y = "% within group",
    fill = ""
  ) +
  ggplot2::theme(
    plot.title  = ggplot2::element_text(face = "bold", size = 14),
    axis.text.y = ggplot2::element_text(size = 10),
    axis.text.x = ggplot2::element_text(size = 10),
    legend.position = "top"
  )

## ---- hba1c_hist_all_vs_enrolled (DF2) ---------------------------------

# 0) Ensure df_status + Enrolled exist (KEEP your existing rule here)
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t0_q_cols <- t0_q_cols[!grepl("Contact|NOTES", t0_q_cols, ignore.case = TRUE)]

df_status <- df %>%
  dplyr::mutate(
    T0_answer_count = rowSums(!is.na(dplyr::select(., dplyr::all_of(t0_q_cols)))),
    Enrolled = dplyr::if_else(T0_answer_count >= 3, "Enrolled", "Not enrolled")
  )

# 1) Clean HbA1c (baseline column in DF2)
clean_a1c <- function(x) {
  x <- trimws(as.character(x))
  x[x %in% c("", "NA", "N/A", "na", "n/a", "Na", "NULL", "null")] <- NA_character_
  x[x == "<4.0"]  <- "4.0"
  x[x == ">14.0"] <- "14.0"
  x[x == ">20.0"] <- "20.0"
  x <- gsub("%", "", x)
  suppressWarnings(as.numeric(x))
}

df_status <- df_status %>%
  dplyr::mutate(HbA1c = clean_a1c(`HbA1c Value (Last 3 Months)`))

# 2) Build “All patients” vs “Enrolled” groups (NO “Not enrolled” histogram)
df_long <- df_status %>%
  dplyr::filter(!is.na(HbA1c)) %>%
  dplyr::mutate(
    Hb_bin = floor(HbA1c),
    Hb_mid = Hb_bin + 0.5
  )

df_hist <- dplyr::bind_rows(
  df_long %>% dplyr::mutate(Group = "All patients"),
  df_long %>% dplyr::filter(Enrolled == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
  dplyr::count(Group, Hb_bin, Hb_mid, name = "n") %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(pct = n / sum(n) * 100) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    Group = factor(Group, levels = c("All patients", "Enrolled")),
    label = ifelse(pct > 0, paste0(round(pct, 1), "%"), "")
  )

# 3) Plot (two stacked histograms, same style + y-limits)
ggplot2::ggplot(df_hist, ggplot2::aes(x = Hb_mid, y = pct)) +
  ggplot2::geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
  ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.35, size = 3.5) +
  ggplot2::facet_wrap(~ Group, ncol = 1, scales = "fixed") +
  ggplot2::scale_x_continuous(
    breaks = seq(min(df_hist$Hb_bin, na.rm = TRUE), max(df_hist$Hb_bin, na.rm = TRUE), by = 1),
    minor_breaks = NULL
  ) +
  ggplot2::coord_cartesian(ylim = c(0, 50)) +
  ggplot2::scale_y_continuous(labels = function(x) paste0(x, "%")) +
  ggplot2::theme_bw() +
  ggplot2::labs(
    title = "HbA1c Distribution: Enrolled vs All Patients",
    x = "HbA1c (%)",
    y = "% of patients within group"
  ) +
  ggplot2::theme(
    plot.title = ggplot2::element_text(size = 14, face = "bold"),
    strip.text = ggplot2::element_text(size = 12, face = "bold"),
    axis.text  = ggplot2::element_text(size = 11)
  )

### Blood sugar check frequency (T0 vs T1)

## 1. Identify the two survey columns
t0_col <- grep(
  "^T0- In the past week, how often did you check your blood sugar",
  names(df_COHORT1),
  value = TRUE
)
t1_col <- grep(
  "^T1- In the past week, how often did you check your blood sugar",
  names(df_COHORT1),
  value = TRUE
)

stopifnot(length(t0_col) == 1, length(t1_col) == 1)

t0_raw <- df_COHORT1[[t0_col]]
t1_raw <- df_COHORT1[[t1_col]]

## 2. Cleaning function: collapse equivalent strings
clean_freq <- function(x) {
  x_std <- toupper(trimws(as.character(x)))

  dplyr::case_when(
    # 1–2 times (all variants)
    x_std %in% c("1–2 TIMES", "1-2 TIMES", "1 TO 2 TIMES") ~ "1–2 times",

    # 3–4 times
    x_std %in% c("3–4 TIMES", "3-4 TIMES")                 ~ "3–4 times",

    # Daily
    x_std %in% c("DAILY", "DAILY ", "DAILY.")              ~ "Daily",

    # More than once daily
    x_std %in% c("MORE THAN DAILY",
                 "MORE THAN ONCE DAILY",
                 "MORE THAN ONCE  DAILY",
                 "MORE THAN ONCE A DAY")                   ~ "More than once daily",

    # Not at all
    x_std %in% c("NOT AT ALL")                             ~ "Not at all",

    # Anything unexpected
    TRUE                                                   ~ NA_character_
  )
}

## 3. Long data with cleaned responses
df_bs <- tibble::tibble(
  Timepoint = c(rep("T0", length(t0_raw)), rep("T1", length(t1_raw))),
  Response_raw = c(t0_raw, t1_raw)
) %>%
  mutate(Response_clean = clean_freq(Response_raw)) %>%
  filter(!is.na(Response_clean))

## 4. Summary table for plotting
freq_summary <- df_bs %>%
  count(Timepoint, Response_clean) %>%
  group_by(Timepoint) %>%
  mutate(pct = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(
    Response_clean = factor(
      Response_clean,
      levels = c("Not at all",
                 "1–2 times",
                 "3–4 times",
                 "Daily",
                 "More than once daily")
    )
  )

## 5. Plot: frequency of blood sugar checks at T0 vs T1
ggplot(freq_summary,
       aes(x = Response_clean, y = pct, fill = Timepoint)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(pct, "%")),
            position = position_dodge(width = 0.8),
            vjust = -0.3,
            size = 3.5) +
  theme_bw() +
  labs(
    title = "How Often Patients Checked Blood Sugar (T0 vs T1)",
    x     = "Frequency in past week",
    y     = "% of participants",
    fill  = "Timepoint"
  ) +
  theme(
    axis.text.x = element_text(angle = 30, hjust = 1, size = 10),
    plot.title  = element_text(size = 14, face = "bold")
  )

###blood sugar in range
sugar_levels_T0 <- c("No", "Maybe", "Yes")
sugar_levels_T1 <- c(
  "Never",
  "Sometimes",
  "About half the time",
  "Most of the time",
  "Always"
)
sugar_T0_raw <- df_COHORT1$`T0- Were your sugar levels usually in your target range last week?`
sugar_T1_raw <- df_COHORT1$`T1- Were your sugar levels usually in your target range last week?`

length(sugar_T0_raw)
## [1] 60
length(sugar_T1_raw)
## [1] 60
clean_T0 <- function(x) {
  x_std <- toupper(trimws(x))
  case_when(
    x_std == "YES"   ~ "Yes",
    x_std == "NO"    ~ "No",
    x_std == "MAYBE" ~ "Maybe",
    TRUE             ~ NA_character_
  )
}

clean_T1 <- function(x) {
  x_std <- toupper(trimws(x))
  case_when(
    x_std %in% c("NEVER") ~ "Never",
    x_std == "SOMETIMES" ~ "Sometimes",
    x_std %in% c("ABOUT HALF THE TIME", "HALF THE TIME") ~ "About half",
    x_std %in% c("MOST OF THE TIME", "YES") ~ "Most",
    x_std == "ALWAYS" ~ "Always",
    TRUE ~ NA_character_
  )
}

T0_clean <- clean_T0(sugar_T0_raw)
T1_clean <- clean_T1(sugar_T1_raw)

score_map <- c(
  "No" = 0,
  "Never" = 0,

  "Maybe" = 1,
  "Sometimes" = 1,

  "About half" = 2,

  "Yes" = 3,
  "Most" = 3,

  "Always" = 4
)

T0_score <- unname(score_map[T0_clean])
T1_score <- unname(score_map[T1_clean])

df_change <- data.frame(
  patient_id = seq_len(nrow(df_COHORT1)),
  T0_raw = T0_clean,
  T1_raw = T1_clean,
  T0_score = T0_score,
  T1_score = T1_score
) %>%
  filter(!is.na(T0_score) & !is.na(T1_score)) %>%
  mutate(
    delta = T1_score - T0_score,
    change = case_when(
      delta > 0  ~ "Increased",
      delta < 0  ~ "Decreased",
      TRUE       ~ "No change"
    )
  )

change_summary <- df_change %>%
  count(change) %>%
  mutate(percent = round(100 * n / sum(n), 1))

change_summary$change <- factor(
  change_summary$change,
  levels = c("Decreased", "No change", "Increased")
)

ggplot(change_summary, aes(x = change, y = percent, fill = change)) +
  geom_col() +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size = 4
  ) +
  scale_fill_manual(
    values = c(
      "Decreased" = "#d73027",
      "No change" = "#4E79A7",
      "Increased" = "#1a9850"
    )
  ) +
  theme_bw() +
  labs(
    title = "Change in Blood Sugar in Target Range (T0 → T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold")
  )

change_grouped_table <- df_change %>%
  count(change, T0_raw, T1_raw) %>%
  arrange(change, desc(n))

change_grouped_percent <- df_change %>%
  count(change, T0_raw, T1_raw) %>%
  group_by(change) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  arrange(change, desc(percent))

change_grouped_wide <- change_grouped_percent %>%
  mutate(label = paste0(n, " (", percent, "%)")) %>%
  select(change, T0_raw, T1_raw, label) %>%
  tidyr::pivot_wider(
    names_from  = T1_raw,
    values_from = label,
    values_fill = ""
  )
change_grouped_wide
## # A tibble: 8 × 7
##   change    T0_raw Sometimes   Never       Most        `About half` Always    
##   <chr>     <chr>  <chr>       <chr>       <chr>       <chr>        <chr>     
## 1 Decreased Yes    "2 (66.7%)" ""          ""          ""           ""        
## 2 Decreased Maybe  ""          "1 (33.3%)" ""          ""           ""        
## 3 Increased No     "6 (33.3%)" ""          "2 (11.1%)" ""           "1 (5.6%)"
## 4 Increased Maybe  ""          ""          "5 (27.8%)" "2 (11.1%)"  "1 (5.6%)"
## 5 Increased Yes    ""          ""          ""          ""           "1 (5.6%)"
## 6 No change Yes    ""          ""          "7 (63.6%)" ""           ""        
## 7 No change Maybe  "3 (27.3%)" ""          ""          ""           ""        
## 8 No change No     ""          "1 (9.1%)"  ""          ""           ""
## ---- missed_medication_T0_T1_summary ---------------------------------
## Assumes df_COHORT1 already exists (Cohort 1 only)

t0_col <- "T0- In the past week, did you miss any doses of you diabetes medications?"
t1_col <- "T1- In the past week, did you miss any doses of you diabetes medications?"

stopifnot(t0_col %in% names(df_COHORT1),
          t1_col %in% names(df_COHORT1))

clean_yesno <- function(x) {
  x <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x %in% c("Y","YES") ~ "Yes",
    x %in% c("N","NO")  ~ "No",
    TRUE ~ NA_character_
  )
}

df_meds <- df_COHORT1 %>%
  transmute(
    T0 = clean_yesno(.data[[t0_col]]),
    T1 = clean_yesno(.data[[t1_col]])
  )

## ------------------------------------------------
## 1) Distribution at T0
## ------------------------------------------------
t0_dist <- df_meds %>%
  filter(!is.na(T0)) %>%
  count(T0) %>%
  mutate(
    Denominator = sum(n),
    Percent = round(100 * n / Denominator, 1)
  )

t0_dist
## # A tibble: 2 × 4
##   T0        n Denominator Percent
##   <chr> <int>       <int>   <dbl>
## 1 No       47          60    78.3
## 2 Yes      13          60    21.7
## ------------------------------------------------
## 2) Distribution at T1
## ------------------------------------------------
t1_dist <- df_meds %>%
  filter(!is.na(T1)) %>%
  count(T1) %>%
  mutate(
    Denominator = sum(n),
    Percent = round(100 * n / Denominator, 1)
  )

t1_dist
## # A tibble: 2 × 4
##   T1        n Denominator Percent
##   <chr> <int>       <int>   <dbl>
## 1 No       32          38    84.2
## 2 Yes       6          38    15.8
## ------------------------------------------------
## 3) Change from T0 → T1 (only those with BOTH)
## ------------------------------------------------
change_dist <- df_meds %>%
  filter(!is.na(T0), !is.na(T1)) %>%
  mutate(
    Change = case_when(
      T0 == "Yes" & T1 == "No"  ~ "Improved (missed → did not miss)",
      T0 == "No"  & T1 == "Yes" ~ "Worsened (did not miss → missed)",
      TRUE                      ~ "No change"
    )
  ) %>%
  count(Change) %>%
  mutate(
    Denominator = sum(n),
    Percent = round(100 * n / Denominator, 1)
  )

change_dist
## # A tibble: 3 × 4
##   Change                               n Denominator Percent
##   <chr>                            <int>       <int>   <dbl>
## 1 Improved (missed → did not miss)     7          38    18.4
## 2 No change                           27          38    71.1
## 3 Worsened (did not miss → missed)     4          38    10.5
## ---- missed_meds_T0_plot --------------------------------------------

ggplot(t0_dist, aes(x = T0, y = Percent, fill = T0)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = paste0(Percent, "%")),
            vjust = -0.3, size = 4) +
  scale_fill_manual(values = c(
    "Yes" = "#d73027",
    "No"  = "#1a9850"
  )) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "Missed Diabetes Medication Doses at T0 (Cohort 1)",
    x = "",
    y = "% of patients"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 14)
  )

## ---- missed_meds_T1_plot --------------------------------------------

ggplot(t1_dist, aes(x = T1, y = Percent, fill = T1)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = paste0(Percent, "%")),
            vjust = -0.3, size = 4) +
  scale_fill_manual(values = c(
    "Yes" = "#d73027",
    "No"  = "#1a9850"
  )) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "Missed Diabetes Medication Doses at T1 (Cohort 1)",
    x = "",
    y = "% of patients"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 14)
  )

## ---- missed_meds_change_plot ----------------------------------------

ggplot(change_dist,
       aes(x = Change, y = Percent, fill = Change)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = paste0(Percent, "%")),
            vjust = -0.3, size = 4) +
  scale_fill_manual(values = c(
    "Improved (missed → did not miss)" = "#1a9850",
    "No change"                        = "#4E79A7",
    "Worsened (did not miss → missed)" = "#d73027"
  )) +
  scale_y_continuous(limits = c(0, 100)) +
  theme_bw() +
  labs(
    title = "Change in Missed Medication Doses from T0 to T1 (Cohort 1)",
    x = "",
    y = "% of evaluable patients"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(size = 11)
  )

###Confidence in blood sugar maintenance practices
confidence_levels <- c(
  "Not confident",
  "Somewhat not confident",
  "Maybe",
  "Confident",
  "Very confident"
)

# T0: Yes / Maybe / No → 1–3
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`

conf_T0_num <- dplyr::case_when(
  toupper(trimws(conf_T0_raw)) == "NO"    ~ 1,
  toupper(trimws(conf_T0_raw)) == "MAYBE" ~ 2,
  toupper(trimws(conf_T0_raw)) == "YES"   ~ 3,
  TRUE ~ NA_real_
)

# T1: 1–4 scale or text variants → 1–4
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`

conf_T1_num <- {
  x <- toupper(trimws(as.character(conf_T1_raw)))
  dplyr::case_when(
    x %in% c("1", "NOT CONFIDENT")                     ~ 1,
    x %in% c("2", "SOMEWHAT NOT CONFIDENT")            ~ 2,
    x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT") ~ 3,
    x %in% c("4", "VERY CONFIDENT")                    ~ 4,
    TRUE ~ NA_real_
  )
}

# Combine & classify change
df_conf_change <- data.frame(
  conf_T0_num = conf_T0_num,
  conf_T1_num = conf_T1_num
) %>%
  dplyr::filter(!is.na(conf_T0_num), !is.na(conf_T1_num)) %>%
  dplyr::mutate(
    change = conf_T1_num - conf_T0_num,
    ChangeCategory = dplyr::case_when(
      change > 0  ~ "Increased confidence",
      change == 0 ~ "No change",
      change < 0  ~ "Decreased confidence"
    )
  )

change_summary <- df_conf_change %>%
  dplyr::count(ChangeCategory) %>%
  dplyr::mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(n, " (", percent, "%)")
  )

ggplot(change_summary, aes(x = ChangeCategory, y = n)) +
  geom_col(fill = "#2E86C1", width = 0.6) +
  geom_text(aes(label = label),
            vjust = -0.3, size = 3.8) +
  scale_y_continuous(
    expand = expansion(mult = c(0, 0.1))
  ) +
  theme_bw() +
  labs(
    title = "Change in Confidence Making Food Choices (T0 → T1)",
    x     = "",
    y     = "Number of participants"
  ) +
  theme(
    axis.text.x  = element_text(size = 11),
    axis.title.y = element_text(size = 11),
    plot.title   = element_text(size = 14, face = "bold")
  )

# ---- T0 confidence (Yes / Maybe / No) ----
conf_T0_raw <- df_COHORT1$`T0- Are you Confident in making food choices that help control your blood sugar?`

conf_T0_label <- dplyr::case_when(
  toupper(trimws(conf_T0_raw)) == "NO"    ~ "No",
  toupper(trimws(conf_T0_raw)) == "MAYBE" ~ "Maybe",
  toupper(trimws(conf_T0_raw)) == "YES"   ~ "Yes",
  TRUE ~ NA_character_
)

# ---- T1 confidence (1–4 or text variants) ----
conf_T1_raw <- df_COHORT1$`T1- How Confident are you in making food choices that help control your blood sugar?`

conf_T1_label <- {
  x <- toupper(trimws(as.character(conf_T1_raw)))
  dplyr::case_when(
    x %in% c("1", "NOT CONFIDENT")                       ~ "Not confident",
    x %in% c("2", "SOMEWHAT NOT CONFIDENT")              ~ "Somewhat not confident",
    x %in% c("3", "CONFIDENT", "COFIDENT", "CONFIDEMT")  ~ "Confident",
    x %in% c("4", "VERY CONFIDENT")                      ~ "Very confident",
    TRUE ~ NA_character_
  )
}


conf_transition <- data.frame(
  T0 = conf_T0_label,
  T1 = conf_T1_label,
  stringsAsFactors = FALSE
) %>%
  dplyr::filter(!is.na(T0), !is.na(T1))

conf_transition_table <- conf_transition %>%
  dplyr::count(T0, T1, name = "n") %>%
  dplyr::arrange(desc(n))

conf_transition_table
##       T0                     T1  n
## 1    Yes         Very confident 19
## 2    Yes              Confident  5
## 3  Maybe              Confident  4
## 4     No         Very confident  2
## 5    Yes Somewhat not confident  2
## 6  Maybe Somewhat not confident  1
## 7  Maybe         Very confident  1
## 8     No              Confident  1
## 9     No Somewhat not confident  1
## 10   Yes          Not confident  1
## Appointments T0 vs T1 – enrolled pts (>= 3 T0 answers, >=1 T1 answer)

# 1. Identify T0 / T1 question columns ------------------------------
t0_q_cols <- grep("^T0-", names(df), value = TRUE)
t1_q_cols <- grep("^T1-", names(df), value = TRUE)

# 2. Define analysis cohort locally (DOES NOT overwrite df_status) --
df_appt_base <- df %>%
  mutate(
    T0_answer_count = rowSums(!is.na(dplyr::select(., all_of(t0_q_cols)))),
    T1_answer_count = rowSums(!is.na(dplyr::select(., all_of(t1_q_cols))))
  ) %>%
  filter(
    T0_answer_count >= 3,   # your enrollment rule
    T1_answer_count > 0     # must have answered something at T1
  )

# 3. Pull raw appointment columns from this filtered cohort ---------
appt_T0_raw <- df_appt_base$`T0- Since your discharge from the hospital, have you had an appointment with a healthcare provider for your diabetes?`
appt_T1_raw <- df_appt_base$`T1- Since our last check in, have you had an appointment with a healthcare provider for your diabetes?`

# 4. Clean Yes/No responses -----------------------------------------
clean_yesno <- function(x) {
  x_std <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x_std %in% c("YES", "Y", "YS") ~ "Yes",
    x_std %in% c("NO",  "N")       ~ "No",
    TRUE                           ~ NA_character_
  )
}

appt_T0_clean <- clean_yesno(appt_T0_raw)
appt_T1_clean <- clean_yesno(appt_T1_raw)

# 5. Build paired change dataset (complete appointment data only) ---
df_appt_change <- tibble(
  patient_id = seq_len(nrow(df_appt_base)),
  T0_raw     = appt_T0_clean,
  T1_raw     = appt_T1_clean
) %>%
  filter(!is.na(T0_raw), !is.na(T1_raw))   # drops any remaining NA pairs

# 6. Score Yes/No and classify change -------------------------------
score_map <- c("No" = 0, "Yes" = 1)

df_appt_change <- df_appt_change %>%
  mutate(
    T0_score = unname(score_map[T0_raw]),
    T1_score = unname(score_map[T1_raw]),
    delta    = T1_score - T0_score,
    change   = dplyr::case_when(
      delta > 0  ~ "Increased",   # No -> Yes
      delta < 0  ~ "Decreased",   # Yes -> No
      TRUE       ~ "No change"    # Yes->Yes or No->No
    )
  )

# 7. Summary for bar chart ------------------------------------------
change_summary <- df_appt_change %>%
  count(change) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    change  = factor(change, levels = c("Decreased", "No change", "Increased"))
  )

# 8. (Optional) transition table object (not printed automatically) -
transition_table_wide <- df_appt_change %>%
  count(change, T0_raw, T1_raw) %>%
  group_by(change) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(n, " (", percent, "%)")
  ) %>%
  ungroup() %>%
  select(change, T0_raw, T1_raw, label) %>%
  pivot_wider(
    names_from  = T1_raw,
    values_from = label,
    values_fill = ""
  )
# View(transition_table_wide) if you want to inspect it

# 9. Pie-chart data (same cohort as change plot) --------------------
appt_pie_df <- df_appt_change %>%
  select(T0_raw, T1_raw) %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = c(T0_raw, T1_raw),
               names_to = "Timepoint",
               values_to = "Response") %>%
  mutate(
    Timepoint = if_else(Timepoint == "T0_raw", "T0", "T1"),
    Response  = factor(Response, levels = c("Yes", "No"))
  )

appt_pie_summary <- appt_pie_df %>%
  count(Timepoint, Response) %>%
  group_by(Timepoint) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(percent, "%\n(n=", n, ")")
  ) %>%
  ungroup()

# 10. Plots: bar of change + two pies -------------------------------
p_change <- ggplot(change_summary,
                   aes(x = change, y = percent, fill = change)) +
  geom_col() +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size = 4
  ) +
  scale_fill_manual(
    values = c(
      "Decreased" = "#d73027",
      "No change" = "#4E79A7",
      "Increased" = "#1a9850"
    )
  ) +
  scale_y_continuous(
    limits = c(0, 80),
    breaks = seq(0, 80, by = 10),
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Change in Having a Diabetes-Related Appointment (T0 → T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold")
  )


p_pies <- ggplot(appt_pie_summary,
                 aes(x = "", y = percent, fill = Response)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(
    aes(label = label),
    position = position_stack(vjust = 0.5),
    size = 3.8
  ) +
  facet_wrap(~ Timepoint) +
  scale_fill_manual(values = c("Yes" = "#1a9850", "No" = "#d73027")) +
  theme_void() +
  labs(
    title = "Appointments with Diabetes Provider at T0 and T1",
    fill  = "Response"
  ) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    strip.text = element_text(size = 12, face = "bold")
  )

p_change / p_pies   # <- this is the only thing that prints

## ---- impact_meals_made ---------------------------------------------

meals_raw <- df_COHORT1$`T1- In the past week, how many meals did you make using food from the program?`

clean_meals <- function(x) {
  x_std <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x_std %in% c("0")            ~ "0",
    x_std %in% c("1","1 MEAL")   ~ "1",
    x_std %in% c("2 TO 3","2-3","2–3","2 TO3","2 TO 3") ~ "2–3",
    x_std %in% c("4 TO 6","4-6","4–6","4 TO 6") ~ "4–6",
    str_detect(x_std,"7")        ~ "7+",
    TRUE ~ NA_character_
  )
}

meals_clean <- clean_meals(meals_raw)

meals_summary <- tibble::tibble(meals = meals_clean) %>%
  filter(!is.na(meals)) %>%
  count(meals) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label   = paste0(percent, "%")
  )

ggplot(meals_summary, aes(x = meals, y = percent)) +
  geom_col(fill = "#1F4E79") +
  geom_text(aes(label = label), vjust = -0.3, size = 3.8) +
  theme_bw() +
  labs(
    title = "Meals prepared using program food (past week)",
    x     = "Number of meals",
    y     = "% of participants"
  )

## ---- impact_ease_of_use --------------------------------------------
## ---- impact_ease_of_use_final_fix ----------------------------------

ease_raw <- df_COHORT1$`T1- In the past week, how easy was it to use the food in your meals?`

ease_clean <- dplyr::case_when(
  trimws(ease_raw) == "1" ~ "Very easy",
  TRUE                   ~ str_to_sentence(trimws(ease_raw))
)

ease_summary <- tibble(response = ease_clean) %>%
  filter(!is.na(response)) %>%
  count(response) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    response = factor(
      response,
      levels = c("Neutral", "Easy", "Very easy"),
      ordered = TRUE
    )
  )

ggplot(ease_summary, aes(x = response, y = percent)) +
  geom_col(fill = "#2E86C1") +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size = 4
  ) +
  theme_bw() +
  labs(
    title = "Ease of Using Program Food in Meals (T1)",
    x     = "",
    y     = "% of participants"
  ) +
  theme(
    axis.text.x = element_text(size = 11),
    plot.title  = element_text(size = 14, face = "bold")
  )

## -------------------------------
## FOOD WASTE (T1 ONLY – IMPACT)
## -------------------------------

# 1. Identify the T1 waste column explicitly
waste_col <- grep(
  "^T1-.*throw away any of the food",
  names(df_COHORT1),
  value = TRUE
)

# sanity check (must be length 1)
stopifnot(length(waste_col) == 1)

# 2. Pull raw responses
waste_raw <- df_COHORT1[[waste_col]]

# 3. Clean Yes / No
waste_clean <- case_when(
  toupper(trimws(waste_raw)) %in% c("YES", "Y") ~ "Yes",
  toupper(trimws(waste_raw)) %in% c("NO", "N")  ~ "No",
  TRUE ~ NA_character_
)

# 4. Build summary table
waste_summary <- tibble(Response = waste_clean) %>%
  filter(!is.na(Response)) %>%
  count(Response) %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label = paste0(percent, "%\n(n=", n, ")")
  )

# 5. Plot (simple, low-risk)
ggplot(waste_summary, aes(x = Response, y = percent, fill = Response)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = label), vjust = -0.3, size = 4) +
  scale_fill_manual(
    values = c("No" = "#1a9850", "Yes" = "#d73027")
  ) +
  scale_y_continuous(
    limits = c(0, 100),
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Did Patients Throw Away Delivered Food? (T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, face = "bold")
  )

## ------------------------------
## Program impact on feeling in control of diabetes (T1)

# 1. Pull the T1 column
control_col <- grep(
  "^T1-How much did this program help you feel more in control of your diabetes",
  names(df_COHORT1),
  value = TRUE
)

control_raw <- df_COHORT1[[control_col]]

# 2. Clean / recode responses
control_clean <- toupper(trimws(as.character(control_raw)))

control_clean <- dplyr::case_when(
  control_clean %in% c("NOT AT ALL", "0") ~ "Not at all",
  control_clean %in% c("A LITTLE", "A LITTLE ", "1") ~ "A little",
  control_clean %in% c("SOMEWHAT", "2") ~ "Somewhat",
  control_clean %in% c("A LOT", "A LOT ", "3") ~ "A lot",
  TRUE ~ NA_character_
)

# 3. Summarize % of participants
control_summary <- tibble::tibble(response = control_clean) %>%
  dplyr::filter(!is.na(response)) %>%
  dplyr::count(response) %>%
  dplyr::mutate(
    percent  = round(100 * n / sum(n), 1),
    response = factor(response,
                      levels = c("Not at all", "A little", "Somewhat", "A lot"))
  )

# 4. Plot
ggplot(control_summary, aes(x = response, y = percent)) +
  geom_col(fill = "#4E79A7", width = 0.7) +
  geom_text(
    aes(label = paste0(percent, "%")),
    vjust = -0.3,
    size  = 4
  ) +
  theme_bw() +
  labs(
    title = "Program Impact on Feeling in Control of Diabetes (T1)",
    x     = "",
    y     = "% of participants"
  ) +
  coord_cartesian(ylim = c(0, 100)) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.text.x = element_text(size = 11, angle = 30, hjust = 1),
    axis.text.y = element_text(size = 11)
  )

# Count patients by cohort
df %>%
  count(Cohort) %>%
  arrange(desc(n))
## # A tibble: 3 × 2
##   Cohort     n
##   <chr>  <int>
## 1 1        489
## 2 2        220
## 3 3         55
clean_yesno <- function(x) {
  x <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x %in% c("Y","YES","TRUE","1") ~ "Enrolled",
    x %in% c("N","NO","FALSE","0") ~ "Not enrolled",
    TRUE ~ NA_character_
  )
}

clean_cohort <- function(x) {
  x <- toupper(trimws(as.character(x)))
  dplyr::case_when(
    x %in% c("1","COHORT 1","COHORT1","C1") ~ "Cohort 1",
    x %in% c("2","COHORT 2","COHORT2","C2") ~ "Cohort 2",
    TRUE ~ NA_character_
  )
}

df_status <- df %>%
  mutate(
    Enrolled = clean_yesno(.data[["Enrolled"]]),
    Cohort_clean = clean_cohort(.data[["Cohort"]]),
    Cohort_final = dplyr::case_when(
      Enrolled == "Enrolled" & Cohort_clean %in% c("Cohort 1","Cohort 2") ~ Cohort_clean,
      TRUE ~ "Not enrolled"
    )
  )

df_status %>%
  count(Enrolled, Cohort_final)
## # A tibble: 4 × 3
##   Enrolled     Cohort_final     n
##   <chr>        <chr>        <int>
## 1 Enrolled     Cohort 1        60
## 2 Enrolled     Cohort 2        41
## 3 Not enrolled Not enrolled   610
## 4 <NA>         Not enrolled    53
## ---- outside_delivery_range_map_and_counts, echo=TRUE -----------------------

library(dplyr)
library(stringr)
library(ggplot2)

# ---- explicit column names ----
contact_col <- "T0 Contact"
zip_col <- grep("zip", names(df), value = TRUE, ignore.case = TRUE)[1]
stopifnot(contact_col %in% names(df), !is.na(zip_col))


# ---- build patient-level base table ----
df_outside <- df_status %>%
  mutate(
    patient_id = row_number(),
    contact_std = str_to_lower(trimws(as.character(.data[[contact_col]]))),
    outside_range = str_detect(contact_std, "outside") & str_detect(contact_std, "range"),
    zip5 = str_extract(as.character(.data[[zip_col]]), "\\b\\d{5}\\b"),
    Enrolled = if_else(Enrolled == "Enrolled", "Enrolled", "Not enrolled")
  ) %>%
  filter(outside_range, !is.na(zip5))

# ---- REQUIRED COUNTS (what you asked) ----
n_patients_outside_all   <- df_outside %>% distinct(patient_id) %>% nrow()
n_unique_zips_outside_all <- df_outside %>% distinct(zip5) %>% nrow()

n_patients_outside_enrolled <- df_outside %>% filter(Enrolled == "Enrolled") %>% distinct(patient_id) %>% nrow()
n_unique_zips_outside_enrolled <- df_outside %>% filter(Enrolled == "Enrolled") %>% distinct(zip5) %>% nrow()

outside_counts_summary <- tibble::tibble(
  Group = c("All patients", "Enrolled"),
  Patients_flagged_outside_range = c(n_patients_outside_all, n_patients_outside_enrolled),
  Unique_ZIPs_flagged_outside_range = c(n_unique_zips_outside_all, n_unique_zips_outside_enrolled)
)

outside_counts_summary
## # A tibble: 2 × 3
##   Group        Patients_flagged_outside_range Unique_ZIPs_flagged_outside_range
##   <chr>                                 <int>                             <int>
## 1 All patients                             79                                54
## 2 Enrolled                                  0                                 0
# ---- ZIP count table (most useful + easiest to read) ----
zip_counts_all <- df_outside %>%
  count(zip5, name = "Patients") %>%
  arrange(desc(Patients))

zip_counts_enrolled <- df_outside %>%
  filter(Enrolled == "Enrolled") %>%
  count(zip5, name = "Patients") %>%
  arrange(desc(Patients))

# Print top zips (adjust n as needed)
head(zip_counts_all, 25)
## # A tibble: 25 × 2
##    zip5  Patients
##    <chr>    <int>
##  1 10466        7
##  2 10566        4
##  3 10458        3
##  4 10465        3
##  5 10805        3
##  6 12533        3
##  7 10459        2
##  8 10462        2
##  9 10520        2
## 10 10541        2
## # ℹ 15 more rows
head(zip_counts_enrolled, 25)
## # A tibble: 0 × 2
## # ℹ 2 variables: zip5 <chr>, Patients <int>
# ---- Bar chart: Top ZIPs flagged Outside-of-Range (All patients) ----
top_n <- 20
zip_plot_df <- zip_counts_all %>%
  slice_head(n = top_n) %>%
  mutate(zip5 = factor(zip5, levels = rev(zip5)))

ggplot(zip_plot_df, aes(x = zip5, y = Patients)) +
  geom_col(fill = "#1F4E79") +
  geom_text(aes(label = Patients), hjust = -0.15, size = 3.5) +
  coord_flip(clip = "off") +
  expand_limits(y = max(zip_plot_df$Patients, na.rm = TRUE) * 1.15) +
  theme_bw() +
  labs(
    title = "Outside of Delivery Range (T0 Contact) — Top ZIP Codes",
    subtitle = paste0(
      "All patients flagged: ", n_patients_outside_all,
      " • Unique ZIPs: ", n_unique_zips_outside_all,
      " • (Enrolled flagged: ", n_patients_outside_enrolled, ")"
    ),
    x = "ZIP code",
    y = "Patients flagged"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14)
  )

# ---- OPTIONAL: Put the old “state outline” map back IF maps is available ----
# This uses state outlines as a basemap and plots ZIP centroids on top.
# If 'maps' is not installed, it will fall back to points-only (still usable).

if (requireNamespace("zipcodeR", quietly = TRUE) && requireNamespace("sf", quietly = TRUE)) {

  library(sf)

  # attach ZIP centroids
  zip_pts <- zip_counts_all %>%
    left_join(
      zipcodeR::zip_code_db %>% select(zipcode, lat, lng),
      by = c("zip5" = "zipcode")
    ) %>%
    filter(!is.na(lat), !is.na(lng))

  zip_sf <- st_as_sf(zip_pts, coords = c("lng", "lat"), crs = 4326)

  # tri-state-ish bounding box around your points (buffered)
  xs <- st_coordinates(zip_sf)[,1]
  ys <- st_coordinates(zip_sf)[,2]
  xlim <- range(xs) + c(-0.5, 0.5)
  ylim <- range(ys) + c(-0.5, 0.5)

  if (requireNamespace("maps", quietly = TRUE)) {
    states_map <- maps::map("state", fill = TRUE, plot = FALSE)
    states_sf <- st_as_sf(states_map)

    states_tri <- states_sf %>%
      filter(ID %in% c("new york", "new jersey", "connecticut"))

    ggplot() +
      geom_sf(data = states_tri, fill = "grey95", color = "grey50", linewidth = 0.4) +
      geom_sf(data = zip_sf, aes(size = Patients), color = "#1F4E79", alpha = 0.75) +
      scale_size(range = c(2, 12)) +
      coord_sf(xlim = xlim, ylim = ylim) +
      theme_minimal() +
      labs(
        title = "Patients Outside of Delivery Range (T0 Contact)",
        subtitle = paste0(
          "All patients flagged: ", n_patients_outside_all,
          " • Unique ZIPs: ", n_unique_zips_outside_all
        ),
        size = "Patients"
      ) +
      theme(plot.title = element_text(face = "bold", size = 14),
            legend.position = "right")
  } else {
    # fallback: points only (no state outline)
    ggplot() +
      geom_sf(data = zip_sf, aes(size = Patients), color = "#1F4E79", alpha = 0.75) +
      scale_size(range = c(2, 12)) +
      coord_sf(xlim = xlim, ylim = ylim) +
      theme_minimal() +
      labs(
        title = "Patients Outside of Delivery Range (T0 Contact) — ZIP centroids",
        subtitle = paste0(
          "All patients flagged: ", n_patients_outside_all,
          " • Unique ZIPs: ", n_unique_zips_outside_all,
          " • (Install 'maps' to add state outlines)"
        ),
        size = "Patients"
      ) +
      theme(plot.title = element_text(face = "bold", size = 14),
            legend.position = "right")
  }

} else {
  message("To plot ZIP centroids, install packages: zipcodeR and sf.")
}

###wakefield bronx, peekskill, fordham bronx, throggs neck
df_zip_base <- df_status %>%
  mutate(
    zip5 = str_extract(as.character(.data[[zip_col]]), "\\b\\d{5}\\b"),
    contact_std = str_to_lower(trimws(as.character(`T0 Contact`))),
    outside_range = str_detect(contact_std, "outside")
  ) %>%
  filter(!is.na(zip5), outside_range)
zip_table <- bind_rows(
  df_zip_base %>%
    mutate(Group = "All patients"),
  df_zip_base %>%
    filter(Enrolled == "Enrolled") %>%
    mutate(Group = "Enrolled")
) %>%
  count(Group, zip5, name = "n_outside") %>%
  arrange(Group, desc(n_outside))

zip_table
## # A tibble: 54 × 3
##    Group        zip5  n_outside
##    <chr>        <chr>     <int>
##  1 All patients 10466         7
##  2 All patients 10566         4
##  3 All patients 10458         3
##  4 All patients 10465         3
##  5 All patients 10805         3
##  6 All patients 12533         3
##  7 All patients 10459         2
##  8 All patients 10462         2
##  9 All patients 10520         2
## 10 All patients 10541         2
## # ℹ 44 more rows