Cohort_Validation

df <- read_excel("FIM Data 2-26-2026 (DI).xlsx",
                 col_types = "text") %>%
  clean_names()

names(df)

##   [1] "pre_enrollment_notes"                                                                                                       
##   [2] "cohort"                                                                                                                     
##   [3] "enrolled"                                                                                                                   
##   [4] "age"                                                                                                                        
##   [5] "preferred_language"                                                                                                         
##   [6] "disch_date_time"                                                                                                            
##   [7] "cc"                                                                                                                         
##   [8] "admission_diagnosis"                                                                                                        
##   [9] "admitting_provider"                                                                                                         
##  [10] "discharge_disposition"                                                                                                      
##  [11] "lace_readmission_score_score_column"                                                                                        
##  [12] "unplanned_readmission_score"                                                                                                
##  [13] "primary_cvg"                                                                                                                
##  [14] "hb_a1c_value_last_3_months"                                                                                                 
##  [15] "pt_portal_status"                                                                                                           
##  [16] "hosp_last_365_days"                                                                                                         
##  [17] "ed_last_365_days"                                                                                                           
##  [18] "ed_vis_last_90_days"                                                                                                        
##  [19] "hosp_last_90_days"                                                                                                          
##  [20] "pcp"                                                                                                                        
##  [21] "race"                                                                                                                       
##  [22] "ethnicity"                                                                                                                  
##  [23] "legal_sex"                                                                                                                  
##  [24] "sdoh_high_risk_domains"                                                                                                     
##  [25] "zip_code"                                                                                                                   
##  [26] "prescreen_contacted_y_n"                                                                                                    
##  [27] "ps_in_the_past_12_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"                    
##  [28] "ps_in_the_past_12_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"       
##  [29] "ps_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"                                   
##  [30] "pre_screen_interested_in_program"                                                                                           
##  [31] "t0_contact"                                                                                                                 
##  [32] "screen_for_enrollment_interested_in_program"                                                                                
##  [33] "t0_are_you_likely_to_eat_the_food_as_part_of_your_normal_daily_routine"                                                     
##  [34] "t0_in_the_past_week_how_often_did_you_check_your_blood_sugar"                                                               
##  [35] "t0_are_you_confident_in_making_food_choices_that_help_control_your_blood_sugar"                                             
##  [36] "t0_are_you_confident_in_understanding_how_to_read_food_labels_and_nutrition_information"                                    
##  [37] "t0_were_your_sugar_levels_usually_in_your_target_range_last_week"                                                           
##  [38] "t0_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"                                                     
##  [39] "t0_since_your_discharge_from_the_hospital_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"         
##  [40] "t1_contact_y_n"                                                                                                             
##  [41] "t1_notes"                                                                                                                   
##  [42] "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"                                                
##  [43] "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"                                                       
##  [44] "if_yes_why_and_how_much_food_was_thrown_away_44"                                                                            
##  [45] "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"                                                          
##  [46] "t1_in_the_past_week_how_often_did_you_check_your_blood_sugar"                                                               
##  [47] "t1_were_your_sugar_levels_usually_in_your_target_range_last_week"                                                           
##  [48] "t1_did_you_have_any_symptoms_of_high_or_low_blood_sugar_in_the_past_week"                                                   
##  [49] "t1_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"                                                     
##  [50] "do_you_have_access_to_your_medications_escalate_as_needed_50"                                                               
##  [51] "t1_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"                        
##  [52] "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
##  [53] "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"                                                
##  [54] "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"                      
##  [55] "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"                                         
##  [56] "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"                                             
##  [57] "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"                 
##  [58] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_58"                 
##  [59] "t1_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"                                              
##  [60] "t2_contact_y_n"                                                                                                             
##  [61] "t2_notes"                                                                                                                   
##  [62] "t2_in_the_past_6_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"                     
##  [63] "t2_in_the_past_6_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"        
##  [64] "t2_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"                                   
##  [65] "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"                                                
##  [66] "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"                                                       
##  [67] "if_yes_why_and_how_much_food_was_thrown_away_67"                                                                            
##  [68] "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"                                                          
##  [69] "t2_in_the_past_week_how_often_did_you_check_your_blood_sugar"                                                               
##  [70] "do_you_have_access_to_your_medications_escalate_as_needed_70"                                                               
##  [71] "t2_were_your_sugar_levels_usually_in_your_target_range_last_week"                                                           
##  [72] "t2_in_the_past_week_did_you_have_any_symptoms_of_high_or_low_blood_sugar"                                                   
##  [73] "t2_in_the_past_week_did_you_miss_any_doses_of_your_diabetes_medication"                                                     
##  [74] "t2_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"                        
##  [75] "t2_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_drinking_more_water"   
##  [76] "t2_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"                                                
##  [77] "t2_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"                      
##  [78] "t2_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"                                         
##  [79] "t2_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"                                             
##  [80] "t2_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_foods_you_typically_have_at_home"                
##  [81] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_81"                 
##  [82] "t2_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"                                              
##  [83] "hemoglobin_a1c_8_1_25_10_2_2025"                                                                                            
##  [84] "hospital_utilization_8_1_25_10_2_25"                                                                                        
##  [85] "a1c_change_8_1_25_10_2_25"                                                                                                  
##  [86] "a1c_under_8_8_1_25_10_2_25"                                                                                                 
##  [87] "hemoglobin_a1c_11_20_2_20_t1"                                                                                               
##  [88] "ed_utilization_t1"                                                                                                          
##  [89] "ip_utilization_t1"                                                                                                          
##  [90] "a1c_change_t1"                                                                                                              
##  [91] "a1c_under_8_t1"                                                                                                             
##  [92] "additional_feedback"                                                                                                        
##  [93] "hemoglobin_a1c_t2"                                                                                                          
##  [94] "a1c_date_t2"                                                                                                                
##  [95] "ed_utilization_t2"                                                                                                          
##  [96] "hospital_utilization_t2"                                                                                                    
##  [97] "a1c_change_t2"                                                                                                              
##  [98] "a1c_under_8_t2"                                                                                                             
##  [99] "housing"                                                                                                                    
## [100] "housing_2"                                                                                                                  
## [101] "food"                                                                                                                       
## [102] "utilities"                                                                                                                  
## [103] "transport"                                                                                                                  
## [104] "finance"                                                                                                                    
## [105] "caregiver"                                                                                                                  
## [106] "legal"                                                                                                                      
## [107] "violence"                                                                                                                   
## [108] "employment"                                                                                                                 
## [109] "education"                                                                                                                  
## [110] "completed"

view(df)

# Confirm required columns exist
stopifnot("enrolled" %in% names(df))
stopifnot("cohort" %in% names(df))

df_clean <- df %>%
  mutate(
    enrolled_clean = str_to_upper(str_squish(as.character(enrolled))),
    cohort_clean   = str_squish(as.character(cohort)),
    cohort_num     = suppressWarnings(as.integer(str_extract(cohort_clean, "\\d+")))
  )

df_status <- df_clean %>%
  mutate(
    enrolled_group = case_when(
      enrolled_clean == "Y" ~ "Enrolled",
      TRUE ~ "Not enrolled"
    ),
    cohort_final = case_when(
      enrolled_group == "Enrolled" & cohort_num == 1 ~ "Cohort 1",
      enrolled_group == "Enrolled" & cohort_num == 2 ~ "Cohort 2",
      enrolled_group == "Enrolled" & cohort_num == 3 ~ "Cohort 3",
      TRUE ~ "Not enrolled"
    )
  )

cohort_counts <- df_status %>%
  count(cohort_final, name = "N") %>%
  arrange(cohort_final)

cohort_counts

## # A tibble: 4 × 2
##   cohort_final     N
##   <chr>        <int>
## 1 Cohort 1        60
## 2 Cohort 2        41
## 3 Cohort 3        35
## 4 Not enrolled   817

a1c_t1_col  <- "a1c_change_t1"
ed_t1_col   <- "ed_utilization_t1"
ip_t1_col   <- "ip_utilization_t1"

a1c_t2_col  <- "a1c_change_t2"
ed_t2_col   <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"

# ----------------------------
# 7) Safety checks: chart-review columns exist
# ----------------------------
chart_cols <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)

missing_cols <- setdiff(chart_cols, names(df_status))
if (length(missing_cols) > 0) {
  stop(
    "These chart-review columns are missing from df_status:\n  - ",
    paste(missing_cols, collapse = "\n  - "),
    "\n\nRun: names(df_status) and update the *_col variables to match."
  )
}

# ----------------------------
# 8) Cohort subsets
# ----------------------------
df_c1 <- df_status %>% filter(cohort_final == "Cohort 1")
df_c2 <- df_status %>% filter(cohort_final == "Cohort 2")
df_c3 <- df_status %>% filter(cohort_final == "Cohort 3")

# ----------------------------
# 9) Completion helpers
#   - "nonblank" means not NA and not "" and not "NA"/"N/A"/"NULL"
# ----------------------------
is_nonblank <- function(x) {
  x_chr <- as.character(x)
  x_std <- str_to_upper(str_squish(x_chr))
  !(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}

pct_complete <- function(df, column) {
  total <- nrow(df)
  if (total == 0) return("-")

  complete_n <- sum(is_nonblank(df[[column]]))
  pct <- round(100 * complete_n / total, 1)
  paste0(complete_n, "/", total, " (", pct, "%)")
}

pct_complete_all_required <- function(df, cols_required) {
  total <- nrow(df)
  if (total == 0) return("-")

  complete_n <- sum(apply(df[, cols_required, drop = FALSE], 1, function(row) {
    all(is_nonblank(row))
  }))
  pct <- round(100 * complete_n / total, 1)
  paste0(complete_n, "/", total, " (", pct, "%)")
}

# ----------------------------
# 10) Basic cohort size table
# ----------------------------
cohort_size_table <- tibble(
  Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
  N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3))
)

cohort_size_table

## # A tibble: 3 × 2
##   Cohort       N
##   <chr>    <int>
## 1 Cohort 1    60
## 2 Cohort 2    41
## 3 Cohort 3    35

# ----------------------------
# CHART REVIEW
# ----------------------------
c1_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
c2_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col)

chart_review_table <- tibble(
  Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
  N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3)),

  # Field-level completeness 
  `A1c Change (T1)` = c(pct_complete(df_c1, a1c_t1_col), pct_complete(df_c2, a1c_t1_col), "-"),
  `ED Utilization (T1)` = c(pct_complete(df_c1, ed_t1_col), pct_complete(df_c2, ed_t1_col), "-"),
  `IP Utilization (T1)` = c(pct_complete(df_c1, ip_t1_col), pct_complete(df_c2, ip_t1_col), "-"),

  `A1c Change (T2)` = c(pct_complete(df_c1, a1c_t2_col), "-", "-"),
  `ED Utilization (T2)` = c(pct_complete(df_c1, ed_t2_col), "-", "-"),
  `Hospital Utilization (T2)` = c(pct_complete(df_c1, hosp_t2_col), "-", "-"),

  # Cohort-level "complete chart review"
  `Chart review complete (per cohort rules)` = c(
    pct_complete_all_required(df_c1, c1_required),
    pct_complete_all_required(df_c2, c2_required),
    "-"
  )
)

chart_review_table

## # A tibble: 3 × 9
##   Cohort       N `A1c Change (T1)` `ED Utilization (T1)` `IP Utilization (T1)`
##   <chr>    <int> <chr>             <chr>                 <chr>                
## 1 Cohort 1    60 29/60 (48.3%)     60/60 (100%)          60/60 (100%)         
## 2 Cohort 2    41 23/41 (56.1%)     41/41 (100%)          41/41 (100%)         
## 3 Cohort 3    35 -                 -                     -                    
## # ℹ 4 more variables: `A1c Change (T2)` <chr>, `ED Utilization (T2)` <chr>,
## #   `Hospital Utilization (T2)` <chr>,
## #   `Chart review complete (per cohort rules)` <chr>

# -------------------------
# A1c Change (T1): Cohort 1 + Cohort 2
# -------------------------
a1c_t1 <- df_status %>%
  filter(cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
  transmute(
    delta = suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(.data[[a1c_t1_col]])), "[^0-9.\\-]", "")))
  ) %>%
  filter(!is.na(delta)) %>%
  mutate(
    Category = case_when(
      delta <= -1.0 ~ "Improved (≥1.0% ↓)",
      delta >=  1.0 ~ "Worsened (≥1.0% ↑)",
      TRUE          ~ "No meaningful change (<1.0%)"
    ),
    Category = factor(Category, levels = c("Improved (≥1.0% ↓)", "No meaningful change (<1.0%)", "Worsened (≥1.0% ↑)"))
  )

n_t1 <- nrow(a1c_t1)

a1c_t1_tab <- a1c_t1 %>%
  count(Category, name = "n") %>%
  mutate(
    pct = round(100 * n / sum(n), 1),
    label = paste0(pct, "%\n(n=", n, ")")
  )


# ---- small helper: parse numeric safely ----
parse_delta <- function(x) {
  suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(x)), "[^0-9.\\-]", "")))
}

# ---- style settings (you asked for color) ----
col_improved <- "#2E7D32"   # green
col_nochange <- "#757575"   # gray
col_worsened <- "#C62828"   # red

make_a1c_plots <- function(df_in, delta_col, title_text) {

  d <- df_in %>%
    transmute(delta = parse_delta(.data[[delta_col]])) %>%
    filter(!is.na(delta)) %>%
    mutate(
      Category = case_when(
        delta <= -1.0 ~ "Improved (≥1.0%)",
        delta >=  1.0 ~ "Worsened (≥1.0%)",
        TRUE          ~ "No meaningful change"
      ),
      Category = factor(Category, levels = c("Improved (≥1.0%)", "No meaningful change", "Worsened (≥1.0%)"))
    )

  n <- nrow(d)

  tab <- d %>%
    count(Category, name = "n") %>%
    mutate(
      pct = round(100 * n / sum(n), 1),
      label = paste0(pct, "%  (n=", n, ")")
    )

  # 1) Colored category bar chart
  p_bar <- ggplot(tab, aes(x = Category, y = pct, fill = Category)) +
    geom_col(width = 0.72) +
    geom_text(aes(label = label), vjust = -0.35, size = 4) +
    scale_fill_manual(values = c(
      "Improved (≥1.0%)" = col_improved,
      "No meaningful change" = col_nochange,
      "Worsened (≥1.0%)" = col_worsened
    )) +
    scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    labs(title = paste0(title_text, " (n = ", n, ")"), x = NULL, y = "% of cohort") +
    theme_minimal(base_size = 13) +
    theme(
      plot.title = element_text(face = "bold", size = 16),
      axis.text.x = element_text(size = 12),
      legend.position = "none"
    )

  # 2) Replace histogram with: jitter + boxplot + threshold bands/lines
  p_dist <- ggplot(d, aes(x = "", y = delta)) +
    annotate("rect", xmin = -Inf, xmax = Inf, ymin = -1, ymax = 1, alpha = 0.10, fill = col_nochange) +
    geom_hline(yintercept = c(-1, 1), linetype = "dashed", linewidth = 0.7, color = "black") +
    geom_boxplot(width = 0.22, outlier.shape = NA, alpha = 0.35) +
    geom_jitter(width = 0.12, height = 0, alpha = 0.55, size = 1.6) +
    coord_flip() +
    labs(
      title = "A1c change distribution (each dot = 1 patient)",
      x = NULL,
      y = "A1c change (percentage points)"
    ) +
    theme_minimal(base_size = 13) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    )

  list(tab = tab, bar = p_bar, dist = p_dist)
}

# =========================
# T1: Cohort 1 + Cohort 2
# =========================
df_a1c_t1 <- df_status %>% filter(cohort_final %in% c("Cohort 1", "Cohort 2"))
out_t1 <- make_a1c_plots(df_a1c_t1, a1c_t1_col, "A1C Change (T1): Cohort 1 + Cohort 2")
out_t1$tab

## # A tibble: 3 × 4
##   Category                 n   pct label        
##   <fct>                <int> <dbl> <chr>        
## 1 Improved (≥1.0%)        32  62.7 62.7%  (n=32)
## 2 No meaningful change    17  33.3 33.3%  (n=17)
## 3 Worsened (≥1.0%)         2   3.9 3.9%  (n=2)

out_t1$bar

# =========================
# T2: Cohort 1 only
# =========================
df_a1c_t2 <- df_status %>% filter(cohort_final == "Cohort 1")
out_t2 <- make_a1c_plots(df_a1c_t2, a1c_t2_col, "A1C Change (T2): Cohort 1")
out_t2$tab

## # A tibble: 3 × 4
##   Category                 n   pct label        
##   <fct>                <int> <dbl> <chr>        
## 1 Improved (≥1.0%)        19  63.3 63.3%  (n=19)
## 2 No meaningful change     9  30   30%  (n=9)   
## 3 Worsened (≥1.0%)         2   6.7 6.7%  (n=2)

out_t2$bar

# -------------------------
# Column names you specified
# -------------------------
col_a1c_all <- "hb_a1c_value_last_3_months"
col_a1c_t1  <- "hemoglobin_a1c_11_20_2_20_t1"
col_a1c_t2  <- "hemoglobin_a1c_t2"

need_cols <- c(col_a1c_all, col_a1c_t1, col_a1c_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ", paste(missing, collapse = "\n  - "),
       "\n\nCheck names(df_status) and fix the col_a1c_* variables above.")
}

# -------------------------
# Clean HbA1c values → numeric
# -------------------------
clean_a1c <- function(x) {
  x <- str_squish(as.character(x))
  x[x %in% c("", "NA", "N/A", "na", "n/a", "NULL", "null")] <- NA_character_
  x[x == "<4.0"]  <- "4.0"
  x[x == ">14.0"] <- "14.0"
  x[x == ">20.0"] <- "20.0"
  x <- gsub("%", "", x)
  suppressWarnings(as.numeric(x))
}

# -------------------------
# Build long dataset (3 panels)
# -------------------------
a1c_long <- df_status %>%
  transmute(
    id = row_number(),
    A1c_All = clean_a1c(.data[[col_a1c_all]]),
    A1c_T1  = clean_a1c(.data[[col_a1c_t1]]),
    A1c_T2  = clean_a1c(.data[[col_a1c_t2]])
  ) %>%
  pivot_longer(cols = c(A1c_All, A1c_T1, A1c_T2),
               names_to = "Timepoint",
               values_to = "HbA1c") %>%
  filter(!is.na(HbA1c)) %>%
  mutate(
    Timepoint = recode(Timepoint,
                       A1c_All = "All patients (chart review HbA1c)",
                       A1c_T1  = "T1 HbA1c",
                       A1c_T2  = "T2 HbA1c"),
    Timepoint = factor(Timepoint,
                       levels = c("All patients (chart review HbA1c)", "T1 HbA1c", "T2 HbA1c")),
    Hb_bin = floor(HbA1c),
    Hb_mid = Hb_bin + 0.5
  )

# -------------------------
# Summarize into 1.0%-wide bins and label percentages
# -------------------------
a1c_hist <- a1c_long %>%
  count(Timepoint, Hb_bin, Hb_mid, name = "n") %>%
  group_by(Timepoint) %>%
  mutate(
    pct = round(100 * n / sum(n), 1),
    label = ifelse(pct > 0, paste0(pct, "%"), ""),
    N = sum(n)
  ) %>%
  ungroup()

# For facet labels with n
facet_labs <- setNames(
  paste0(levels(a1c_hist$Timepoint), " (n = ", tapply(a1c_long$HbA1c, a1c_long$Timepoint, length), ")"),
  levels(a1c_hist$Timepoint)
)

# Determine x-range
min_bin <- min(a1c_hist$Hb_bin, na.rm = TRUE)
max_bin <- max(a1c_hist$Hb_bin, na.rm = TRUE)

# -------------------------
# Plot (3 stacked panels)
# -------------------------
ggplot(a1c_hist, aes(x = Hb_mid, y = pct)) +
  geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
  geom_text(aes(label = label), vjust = -0.35, size = 3.4) +
  facet_wrap(
    ~ Timepoint,
    ncol = 1,
    labeller = labeller(Timepoint = facet_labs)
  ) +
  scale_x_continuous(
    breaks = seq(min_bin, max_bin, by = 1),
    minor_breaks = NULL
  ) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  coord_cartesian(ylim = c(0, max(a1c_hist$pct, na.rm = TRUE) + 10)) +
  theme_bw() +
  labs(
    title = "HbA1c Distributions (All vs T1 vs T2)",
    x = "HbA1c (%)",
    y = "% within timepoint"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold", size = 12),
    axis.text  = element_text(size = 11)
  )

# -------------------------
# 1) COLUMN NAMES (EDIT THESE 5 to match names(df_status))
# -------------------------
col_ed_base  <- "ed_vis_last_90_days"
col_ed_t1    <- "ed_utilization_t1"
col_ed_t2    <- "ed_utilization_t2"

col_ip_base  <- "hosp_last_90_days"          # <-- replace with your true IP baseline column (NOT hosp)
col_ip_t1    <- "ip_utilization_t1"
col_ip_t2  <- "hospital_utilization_t2"


# -------------------------
# 2) Helpers
# -------------------------
parse_count <- function(x) {
  x <- as.character(x)
  x <- str_squish(x)
  x[x == ""] <- NA_character_
  x[str_to_upper(x) %in% c("NA","N/A","NULL")] <- NA_character_
  suppressWarnings(as.numeric(str_replace_all(x, "[^0-9\\-\\.]", "")))
}

make_bucket <- function(x, cap = 5) {
  bx <- ifelse(x >= cap, cap, as.integer(x))
  ifelse(bx == cap, paste0(cap, "+"), as.character(bx))
}

# -------------------------
# 3) Safety checks
# -------------------------
need_cols <- c(col_ed_base, col_ed_t1, col_ed_t2,
               col_ip_base, col_ip_t1, col_ip_t2,
               "enrolled_group")
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ", paste(missing, collapse = "\n  - "),
       "\n\nFix the col_* names above using names(df_status).")
}

# -------------------------
# 4) Colors (simple + clear)
# -------------------------
col_less <- "#2E7D32"   # green
col_same <- "#757575"   # gray
col_more <- "#C62828"   # red

col_zero <- "#546E7A"   # blue-gray
col_onep <- "#8E24AA"   # purple

make_dist_df <- function(df_status, base_col, t1_col, t2_col, cap = 5, domain_label = "ED") {

  d <- df_status %>%
    transmute(
      group = if_else(enrolled_group == "Enrolled", "Enrolled", "All patients"),
      base = parse_count(.data[[base_col]]),
      t1   = parse_count(.data[[t1_col]]),
      t2   = parse_count(.data[[t2_col]])
    ) %>%
    pivot_longer(cols = c(base, t1, t2), names_to = "timepoint", values_to = "count") %>%
    filter(!is.na(count)) %>%
    mutate(
      timepoint = recode(timepoint,
                         base = "Baseline (last 90d)",
                         t1   = "T1",
                         t2   = "T2"),
      timepoint = factor(timepoint, levels = c("Baseline (last 90d)", "T1", "T2")),
      bucket = make_bucket(count, cap = cap),
      bucket = factor(bucket, levels = c(as.character(0:(cap-1)), paste0(cap, "+"))),
      group = factor(group, levels = c("All patients", "Enrolled")),
      domain = domain_label
    )

  d %>%
    count(domain, group, timepoint, bucket, name = "n") %>%
    tidyr::complete(domain, group, timepoint, bucket, fill = list(n = 0)) %>%  # ensures missing buckets show as 0
    group_by(domain, group, timepoint) %>%
    mutate(
      denom = sum(n),
      pct = ifelse(denom > 0, round(100 * n / denom, 1), 0),
      label = ifelse(n > 0, paste0(pct, "%"), "")
    ) %>%
    ungroup()
}

plot_dist <- function(tab, title_text, xlab) {
  ggplot(tab, aes(x = bucket, y = pct)) +
    geom_col(width = 0.8, fill = "#2E6F9E") +
    geom_text(aes(label = label), vjust = -0.25, size = 3.6) +
    facet_grid(group ~ timepoint) +
    scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    theme_bw() +
    labs(title = title_text, x = xlab, y = "% of patients within group") +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      strip.background = element_rect(fill = "grey90"),
      strip.text = element_text(face = "bold")
    )
}

# -------------------------
# 5) ED distribution plot (cap at 5+)
# -------------------------
ed_dist <- make_dist_df(df_status, col_ed_base, col_ed_t1, col_ed_t2, cap = 5, domain_label = "ED")
plot_dist(ed_dist, "ED Utilization Distribution: All patients vs Enrolled", "ED visits")

# -------------------------
# 6) IP distribution plot (cap at 3+)
# -------------------------
ip_dist <- make_dist_df(df_status, col_ip_base, col_ip_t1, col_ip_t2, cap = 3, domain_label = "IP")
plot_dist(ip_dist, "Inpatient Utilization Distribution: All patients vs Enrolled", "Inpatient stays")

# =========================
# 7) CHANGE PLOTS: Baseline → T1 (Enrolled only)
#     X-axis: Less visits / No change / More visits
# =========================

# Build enrolled paired cohort (must have baseline + T1 for each domain)
df_util_pairs <- df_status %>%
  filter(enrolled_group == "Enrolled") %>%
  transmute(
    ED_base = parse_count(.data[[col_ed_base]]),
    ED_T1   = parse_count(.data[[col_ed_t1]]),
    IP_base = parse_count(.data[[col_ip_base]]),
    IP_T1   = parse_count(.data[[col_ip_t1]])
  ) %>%
  filter(!is.na(ED_base) & !is.na(ED_T1),
         !is.na(IP_base) & !is.na(IP_T1))

n_pairs <- nrow(df_util_pairs)

make_change_tab <- function(df, base_var, t1_var) {
  df %>%
    transmute(
      Change = case_when(
        .data[[t1_var]] < .data[[base_var]] ~ "Less visits",
        .data[[t1_var]] > .data[[base_var]] ~ "More visits",
        TRUE                                 ~ "No change"
      )
    ) %>%
    count(Change, name = "n") %>%
    mutate(
      pct = round(100 * n / sum(n), 1),
      label = paste0(pct, "%\n(n=", n, ")"),
      Change = factor(Change, levels = c("Less visits", "No change", "More visits"))
    )
}

plot_change_bar <- function(tab, title_text) {
  ggplot(tab, aes(x = Change, y = pct, fill = Change)) +
    geom_col(width = 0.72) +
    geom_text(aes(label = label), vjust = -0.35, size = 4) +
    scale_fill_manual(values = c(
      "Less visits" = col_less,
      "No change"   = col_same,
      "More visits" = col_more
    )) +
    scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    labs(
      title = title_text,
      x = NULL,
      y = "% of enrolled cohort"
    ) +
    theme_bw() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      legend.position = "none"
    )
}

# ED change
ed_change_tab <- make_change_tab(df_util_pairs, "ED_base", "ED_T1")
p_ed_change <- plot_change_bar(
  ed_change_tab,
  paste0("ED Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)

# IP change
ip_change_tab <- make_change_tab(df_util_pairs, "IP_base", "IP_T1")
p_ip_change <- plot_change_bar(
  ip_change_tab,
  paste0("IP Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)

# show tables if you want
ed_change_tab

## # A tibble: 3 × 4
##   Change          n   pct label          
##   <fct>       <int> <dbl> <chr>          
## 1 Less visits    58  57.4 "57.4%\n(n=58)"
## 2 More visits     9   8.9 "8.9%\n(n=9)"  
## 3 No change      34  33.7 "33.7%\n(n=34)"

ip_change_tab

## # A tibble: 3 × 4
##   Change          n   pct label          
##   <fct>       <int> <dbl> <chr>          
## 1 Less visits    62  61.4 "61.4%\n(n=62)"
## 2 More visits     6   5.9 "5.9%\n(n=6)"  
## 3 No change      33  32.7 "32.7%\n(n=33)"

# stacked like your screenshot
p_ed_change / p_ip_change

# =========================
# DEMOGRAPHIC TABLE
# Keeps original combined race table
# Also creates an expanded race table that splits multi-race responses
# =========================

race_col <- "race"
eth_col  <- "ethnicity"

stopifnot(race_col %in% names(df_status))
stopifnot(eth_col  %in% names(df_status))

canon_multiselect <- function(x) {
  x <- as.character(x)
  x <- str_trim(x)
  x[x == ""] <- NA_character_

  sapply(x, function(val) {
    if (is.na(val)) return(NA_character_)
    parts <- unlist(str_split(val, "\\r?\\n|\\s*[;,]\\s*"))
    parts <- str_trim(parts)
    parts <- parts[parts != ""]
    if (length(parts) == 0) return(NA_character_)
    paste(sort(unique(parts)), collapse = "; ")
  }, USE.NAMES = FALSE)
}

race_map <- c(
  "R1" = "American Indian / Alaska Native",
  "R2" = "Asian",
  "R3" = "Black or African-American",
  "R4" = "Native Hawaiian / Pacific Islander",
  "R5" = "White",
  "R9" = "Other",
  "S1" = "Patient Declined",
  "S2" = "Patient Unavailable",
  "S3" = "Not Applicable / Unknown"
)

clean_race_value <- function(x_vec) {
  x_vec <- canon_multiselect(x_vec)
  sapply(x_vec, function(s) {
    if (is.na(s)) return(NA_character_)
    items <- str_split(s, ";\\s*")[[1]]
    items <- str_trim(items)

    recoded <- vapply(items, function(it) {
      code <- str_extract(it, "^[A-Za-z]\\d+")
      if (!is.na(code) && code %in% names(race_map)) race_map[[code]] else it
    }, character(1))

    paste(sort(unique(recoded)), collapse = "; ")
  }, USE.NAMES = FALSE)
}

clean_eth_value <- function(x_vec) {
  x_vec <- canon_multiselect(x_vec)
  sapply(x_vec, function(s) {
    if (is.na(s)) return(NA_character_)
    items <- str_split(s, ";\\s*")[[1]]
    items <- str_trim(items)

    recoded <- dplyr::case_when(
      str_detect(items, "E1") ~ "Spanish / Hispanic / Latino",
      str_detect(items, "E2") ~ "Not Spanish / Hispanic / Latino",
      str_detect(items, "S1") ~ "Patient Declined",
      str_detect(items, "S2") ~ "Patient Unavailable",
      TRUE ~ NA_character_
    )
    recoded <- recoded[!is.na(recoded)]
    if (length(recoded) == 0) return(NA_character_)
    paste(sort(unique(recoded)), collapse = "; ")
  }, USE.NAMES = FALSE)
}


df_status <- df_status %>%
  mutate(
    race_clean      = clean_race_value(.data[[race_col]]),
    ethnicity_clean = clean_eth_value(.data[[eth_col]])
  )

fmt_np <- function(n, denom) {
  if (is.na(n) || denom == 0) return("-")
  paste0(n, " (", sprintf("%.1f", 100 * n / denom), "%)")
}

make_demo_table <- function(data, variable, label) {
  d <- data %>%
    mutate(Value = canon_multiselect(.data[[variable]])) %>%
    filter(!is.na(Value))

  denom_all <- nrow(d)
  denom_not <- nrow(filter(d, enrolled_group == "Not enrolled"))
  denom_enr <- nrow(filter(d, enrolled_group == "Enrolled"))
  denom_c1  <- nrow(filter(d, cohort_final == "Cohort 1"))
  denom_c2  <- nrow(filter(d, cohort_final == "Cohort 2"))
  denom_c3  <- nrow(filter(d, cohort_final == "Cohort 3"))

  all_ct <- d %>% count(Value, name = "n_all")
  not_ct <- d %>% filter(enrolled_group == "Not enrolled") %>% count(Value, name = "n_not")
  enr_ct <- d %>% filter(enrolled_group == "Enrolled") %>% count(Value, name = "n_enr")
  c1_ct  <- d %>% filter(cohort_final == "Cohort 1") %>% count(Value, name = "n_c1")
  c2_ct  <- d %>% filter(cohort_final == "Cohort 2") %>% count(Value, name = "n_c2")
  c3_ct  <- d %>% filter(cohort_final == "Cohort 3") %>% count(Value, name = "n_c3")

  all_ct %>%
    full_join(not_ct, by = "Value") %>%
    full_join(enr_ct, by = "Value") %>%
    full_join(c1_ct,  by = "Value") %>%
    full_join(c2_ct,  by = "Value") %>%
    full_join(c3_ct,  by = "Value") %>%
    mutate(
      Category = label,
      `All Patients` = mapply(fmt_np, n_all, denom_all),
      `Not enrolled` = mapply(fmt_np, n_not, denom_not),
      `Enrolled`     = mapply(fmt_np, n_enr, denom_enr),
      `Cohort 1`     = mapply(fmt_np, n_c1,  denom_c1),
      `Cohort 2`     = mapply(fmt_np, n_c2,  denom_c2),
      `Cohort 3`     = mapply(fmt_np, n_c3,  denom_c3)
    ) %>%
    select(Category, Value, `All Patients`, `Not enrolled`, `Enrolled`, `Cohort 1`, `Cohort 2`, `Cohort 3`) %>%
    arrange(Category, Value)
}

make_demo_table_expanded <- function(data, variable, label) {
  d <- data %>%
    filter(!is.na(.data[[variable]])) %>%
    mutate(
      Value = str_split(.data[[variable]], ";\\s*")
    ) %>%
    tidyr::unnest(Value) %>%
    mutate(
      Value = str_trim(as.character(Value)),
      Value = na_if(Value, "")
    ) %>%
    filter(!is.na(Value))

  denom_all <- nrow(filter(data, !is.na(.data[[variable]])))
  denom_not <- nrow(filter(data, !is.na(.data[[variable]]), enrolled_group == "Not enrolled"))
  denom_enr <- nrow(filter(data, !is.na(.data[[variable]]), enrolled_group == "Enrolled"))
  denom_c1  <- nrow(filter(data, !is.na(.data[[variable]]), cohort_final == "Cohort 1"))
  denom_c2  <- nrow(filter(data, !is.na(.data[[variable]]), cohort_final == "Cohort 2"))
  denom_c3  <- nrow(filter(data, !is.na(.data[[variable]]), cohort_final == "Cohort 3"))

  all_ct <- d %>% count(Value, name = "n_all")
  not_ct <- d %>% filter(enrolled_group == "Not enrolled") %>% count(Value, name = "n_not")
  enr_ct <- d %>% filter(enrolled_group == "Enrolled") %>% count(Value, name = "n_enr")
  c1_ct  <- d %>% filter(cohort_final == "Cohort 1") %>% count(Value, name = "n_c1")
  c2_ct  <- d %>% filter(cohort_final == "Cohort 2") %>% count(Value, name = "n_c2")
  c3_ct  <- d %>% filter(cohort_final == "Cohort 3") %>% count(Value, name = "n_c3")

  all_ct %>%
    full_join(not_ct, by = "Value") %>%
    full_join(enr_ct, by = "Value") %>%
    full_join(c1_ct,  by = "Value") %>%
    full_join(c2_ct,  by = "Value") %>%
    full_join(c3_ct,  by = "Value") %>%
    mutate(
      Category = label,
      `All Patients` = mapply(fmt_np, n_all, denom_all),
      `Not enrolled` = mapply(fmt_np, n_not, denom_not),
      `Enrolled`     = mapply(fmt_np, n_enr, denom_enr),
      `Cohort 1`     = mapply(fmt_np, n_c1,  denom_c1),
      `Cohort 2`     = mapply(fmt_np, n_c2,  denom_c2),
      `Cohort 3`     = mapply(fmt_np, n_c3,  denom_c3)
    ) %>%
    select(Category, Value, `All Patients`, `Not enrolled`, `Enrolled`, `Cohort 1`, `Cohort 2`, `Cohort 3`) %>%
    arrange(Category, Value)
}

race_table_combined <- make_demo_table(df_status, "race_clean", "Race")
race_table_expanded <- make_demo_table_expanded(df_status, "race_clean", "Race")
ethnicity_table     <- make_demo_table(df_status, "ethnicity_clean", "Ethnicity")

demographic_table_old <- bind_rows(race_table_combined, ethnicity_table)
demographic_table     <- bind_rows(race_table_expanded, ethnicity_table)

demographic_table

## # A tibble: 12 × 8
##    Category  Value  `All Patients` `Not enrolled` Enrolled `Cohort 1` `Cohort 2`
##    <chr>     <chr>  <chr>          <chr>          <chr>    <chr>      <chr>     
##  1 Race      Ameri… 4 (0.4%)       3 (0.4%)       1 (0.7%) 1 (1.7%)   -         
##  2 Race      Asian  31 (3.3%)      29 (3.5%)      2 (1.5%) -          -         
##  3 Race      Black… 252 (26.4%)    195 (23.9%)    57 (41.… 25 (41.7%) 15 (36.6%)
##  4 Race      Nativ… 3 (0.3%)       2 (0.2%)       1 (0.7%) 1 (1.7%)   -         
##  5 Race      Other  285 (29.9%)    233 (28.5%)    52 (38.… 24 (40.0%) 18 (43.9%)
##  6 Race      Patie… 38 (4.0%)      36 (4.4%)      2 (1.5%) -          2 (4.9%)  
##  7 Race      Patie… 2 (0.2%)       2 (0.2%)       -        -          -         
##  8 Race      White  404 (42.4%)    374 (45.8%)    30 (22.… 14 (23.3%) 9 (22.0%) 
##  9 Ethnicity Not S… 639 (67.2%)    556 (68.2%)    83 (61.… 36 (60.0%) 22 (53.7%)
## 10 Ethnicity Patie… 34 (3.6%)      33 (4.0%)      1 (0.7%) -          1 (2.4%)  
## 11 Ethnicity Patie… 2 (0.2%)       2 (0.2%)       -        -          -         
## 12 Ethnicity Spani… 276 (29.0%)    224 (27.5%)    52 (38.… 24 (40.0%) 18 (43.9%)
## # ℹ 1 more variable: `Cohort 3` <chr>

demographic_table_old

## # A tibble: 23 × 8
##    Category Value   `All Patients` `Not enrolled` Enrolled `Cohort 1` `Cohort 2`
##    <chr>    <chr>   <chr>          <chr>          <chr>    <chr>      <chr>     
##  1 Race     Americ… 4 (0.4%)       3 (0.4%)       1 (0.7%) 1 (1.7%)   -         
##  2 Race     Asian   29 (3.0%)      27 (3.3%)      2 (1.5%) -          -         
##  3 Race     Asian;… 1 (0.1%)       1 (0.1%)       -        -          -         
##  4 Race     Asian;… 1 (0.1%)       1 (0.1%)       -        -          -         
##  5 Race     Black … 234 (24.6%)    180 (22.0%)    54 (39.… 23 (38.3%) 14 (34.1%)
##  6 Race     Black … 1 (0.1%)       -              1 (0.7%) 1 (1.7%)   -         
##  7 Race     Black … 12 (1.3%)      11 (1.3%)      1 (0.7%) 1 (1.7%)   -         
##  8 Race     Black … 3 (0.3%)       2 (0.2%)       1 (0.7%) -          1 (2.4%)  
##  9 Race     Black … 2 (0.2%)       2 (0.2%)       -        -          -         
## 10 Race     Native… 2 (0.2%)       2 (0.2%)       -        -          -         
## # ℹ 13 more rows
## # ℹ 1 more variable: `Cohort 3` <chr>

# -------------------------
# 0) COLUMN NAMES (edit if needed)
# -------------------------
race_col <- "race"

sex_col  <- "legal_sex"
eth_col  <- "ethnicity"

needed_demo_cols <- c("enrolled_group", race_col, sex_col, eth_col)
missing_demo_cols <- setdiff(needed_demo_cols, names(df_status))
if (length(missing_demo_cols) > 0) {
  stop(
    "Missing demographic columns:\n  - ",
    paste(missing_demo_cols, collapse = "\n  - ")
  )
}

# -------------------------
# Helper for legend labels
# -------------------------
make_legend_labels <- function(df_long) {
  sizes <- df_long %>%
    distinct(id, Group) %>%
    count(Group, name = "N")

  setNames(
    paste0(sizes$Group, " (n = ", sizes$N, ")"),
    sizes$Group
  )
}

# =========================
# 1) RACE
# =========================

race_map <- c(
  "R1" = "American Indian / Alaska Native",
  "R2" = "Asian",
  "R3" = "Black or African American",
  "R4" = "Native Hawaiian / Pacific Islander",
  "R5" = "White",
  "R9" = "Other"
)

race_long <- df_status %>%
  transmute(
    id = row_number(),
    enrolled_group,
    Race_raw = str_squish(as.character(.data[[race_col]]))
  ) %>%
  filter(!is.na(Race_raw), Race_raw != "") %>%
  mutate(Race_raw = str_split(Race_raw, "\\r?\\n")) %>%
  unnest(Race_raw) %>%
  mutate(
    Race_code = str_extract(Race_raw, "^R\\d+"),
    Race_clean = unname(race_map[Race_code])
  ) %>%
  filter(!is.na(Race_clean))

race_tab <- bind_rows(
  race_long %>% mutate(Group = "All patients"),
  race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
  count(Group, Race_clean, name = "n") %>%
  group_by(Group) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup()

race_tab <- race_tab %>%
  mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))

race_legend_labels <- make_legend_labels(
  bind_rows(
    race_long %>% mutate(Group = "All patients"),
    race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
  )
)

ggplot(race_tab, aes(x = Race_clean, y = percent, fill = Group)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(percent, "%")),
            position = position_dodge(width = 0.8),
            hjust = -0.1, size = 3.6) +
  coord_flip(clip = "off") +
  scale_fill_manual(values = c("All patients" = "#CFE1F2",
                               "Enrolled" = "#1F4E79"),
                    labels = race_legend_labels) +
  expand_limits(y = max(race_tab$percent, na.rm = TRUE) + 8) +
  theme_bw() +
  labs(title = "Race Distribution: Enrolled vs All Patients",
       x = NULL,
       y = "% within group",
       fill = "") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "top")

# =========================
# 2) LEGAL SEX
# =========================

sex_long <- df_status %>%
  transmute(
    id = row_number(),
    enrolled_group,
    Sex = str_squish(as.character(.data[[sex_col]]))
  ) %>%
  filter(!is.na(Sex), Sex != "")

sex_tab <- bind_rows(
  sex_long %>% mutate(Group = "All patients"),
  sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
  count(Group, Sex, name = "n") %>%
  group_by(Group) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))

sex_legend_labels <- make_legend_labels(
  bind_rows(
    sex_long %>% mutate(Group = "All patients"),
    sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
  )
)

ggplot(sex_tab, aes(x = Sex, y = percent, fill = Group)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(percent, "%")),
            position = position_dodge(width = 0.8),
            hjust = -0.1, size = 3.6) +
  coord_flip(clip = "off") +
  scale_fill_manual(values = c("All patients" = "#CFE1F2",
                               "Enrolled" = "#1F4E79"),
                    labels = sex_legend_labels) +
  expand_limits(y = max(sex_tab$percent, na.rm = TRUE) + 8) +
  theme_bw() +
  labs(title = "Legal Sex: Enrolled vs All Patients",
       x = NULL,
       y = "% within group",
       fill = "") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "top")

# =========================
# 3) ETHNICITY
# =========================

eth_long <- df_status %>%
  transmute(
    id = row_number(),
    enrolled_group,
    Eth_raw = str_squish(as.character(.data[[eth_col]])),
    Eth_clean = case_when(
      str_detect(Eth_raw, "E1") ~ "Spanish/Hispanic/Latino",
      str_detect(Eth_raw, "E2") ~ "Not Spanish/Hispanic/Latino",
      str_detect(Eth_raw, "S1") ~ "Patient declined",
      str_detect(Eth_raw, "S2") ~ "Patient unavailable",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(Eth_clean), Eth_clean != "")

eth_tab <- bind_rows(
  eth_long %>% mutate(Group = "All patients"),
  eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
  count(Group, Eth_clean, name = "n") %>%
  group_by(Group) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))

eth_legend_labels <- make_legend_labels(
  bind_rows(
    eth_long %>% mutate(Group = "All patients"),
    eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
  )
)

ggplot(eth_tab, aes(x = Eth_clean, y = percent, fill = Group)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(percent, "%")),
            position = position_dodge(width = 0.8),
            hjust = -0.1, size = 3.6) +
  coord_flip(clip = "off") +
  scale_fill_manual(values = c("All patients" = "#CFE1F2",
                               "Enrolled" = "#1F4E79"),
                    labels = eth_legend_labels) +
  expand_limits(y = max(eth_tab$percent, na.rm = TRUE) + 8) +
  theme_bw() +
  labs(title = "Ethnicity: Enrolled vs All Patients",
       x = NULL,
       y = "% within group",
       fill = "") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "top")

# =========================
# T1 SELF-MANAGEMENT (PAST WEEK)
# =========================


col_t1_manage <- "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"

# ---- Safety check ----
if (!col_t1_manage %in% names(df_status)) {
  stop("Column not found:\n  ", col_t1_manage,
       "\n\nCheck names(df_status) and update col_t1_manage.")
}

# ---- 1) Clean + categorize responses ----
# ---- 1) Clean + categorize responses (FIXED: blanks/not surveyed -> NA) ----
missing_tokens <- c(
  "", "NA", "N/A", "NULL", "NONE",
  "NOT SURVEYED", "NOT ASKED", "NOT APPLICABLE",
  "SKIPPED", "MISSING", "UNKNOWN"
)

df_manage <- df_status %>%
  transmute(
    raw = .data[[col_t1_manage]],
    raw_chr = as.character(raw),
    raw_std = str_to_upper(str_squish(raw_chr)),
    category = case_when(
      # TRUE missing (actual NA)
      is.na(raw_chr) ~ NA_character_,

      # empty/whitespace after squish
      raw_std == "" ~ NA_character_,

      # common placeholders meaning "not surveyed" / missing
      raw_std %in% missing_tokens ~ NA_character_,

      # if your system uses "-" or "." as blank, include them:
      raw_std %in% c("-", ".", "--") ~ NA_character_,

      # actual responses
      raw_std %in% c("N", "NO") ~ "No",
      raw_std %in% c("Y", "YES") ~ "Yes (unspecified)",
      str_detect(raw_std, "^DAILY") ~ "Daily",
      str_detect(raw_std, "OCCASION") ~ "Occasionally",

      TRUE ~ "Other / unclear"
    )
  )

# ---- 2) Summary table ----
manage_tab <- df_manage %>%
  filter(!is.na(category)) %>%
  count(category, name = "n") %>%
  mutate(
    percent = round(100 * n / sum(n), 1)
  ) %>%
  arrange(desc(n))

manage_tab

## # A tibble: 4 × 3
##   category              n percent
##   <chr>             <int>   <dbl>
## 1 Daily                22    55  
## 2 Occasionally         12    30  
## 3 Yes (unspecified)     5    12.5
## 4 No                    1     2.5

# ---- 3) Bar chart ----
ggplot(manage_tab, aes(x = reorder(category, percent), y = percent)) +
  geom_col(width = 0.7, fill = "#4E79A7") +
  geom_text(
    aes(label = paste0(percent, "% (n=", n, ")")),
    hjust = -0.05,
    size = 3.6
  ) +
  coord_flip(clip = "off") +
  expand_limits(y = max(manage_tab$percent, na.rm = TRUE) + 8) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  theme_bw() +
  labs(
    title = "T1: Diabetes Self-Management Actions (Past Week)",
    x = "",
    y = "% of respondents"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 11)
  )

col_help_choices   <- "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
col_conf_food      <- "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
col_conf_labels    <- "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
col_conf_prepare   <- "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
col_help_control   <- "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"

# CLEAN + STANDARDIZE LIKERT RESPONSES
# =========================
# =========================
# CLEANERS
# =========================

# A) Confidence-style questions (maps numeric + misspellings + MAYBE)
clean_confidence <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,

    # numeric scales (support both 1-4 and 1-5)
    s %in% c("1") ~ "Not confident",
    s %in% c("2") ~ "Somewhat not confident",
    s %in% c("3") ~ "Confident",
    s %in% c("4") ~ "Very confident",
    s %in% c("5") ~ "Very confident",   # if a stray 5 appears, bucket to Very confident

    # explicit text + common typos
    str_detect(s, "NOT CONFIDENT") ~ "Not confident",
    str_detect(s, "SOMEWHAT NOT CONFIDENT") ~ "Somewhat not confident",
    str_detect(s, "\\bMAYBE\\b") ~ "Maybe",
    s %in% c("CONFIDENT", "COFIDENT", "CONFIDEMT", "CONFIDNT", "COFIDENT") ~ "Confident",
    str_detect(s, "VERY CONFIDENT") ~ "Very confident",
    str_detect(s, "SOMEWHAT CONFIDENT") ~ "Somewhat confident",

    TRUE ~ "Other / unclear"
  )
}

# B) Program-help / impact-style questions (A lot / Somewhat / A little, etc.)
clean_impact <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,

    # numeric (if any)
    s %in% c("5") ~ "A lot",
    s %in% c("4") ~ "Somewhat",
    s %in% c("3") ~ "A little",
    s %in% c("2") ~ "A little",
    s %in% c("1") ~ "Not at all",

    # text variants (case + spacing)
    str_detect(s, "^A LOT") ~ "A lot",
    str_detect(s, "^SOMEWHAT") ~ "Somewhat",
    str_detect(s, "A LITTLE") ~ "A little",
    str_detect(s, "NOT AT ALL") ~ "Not at all",

    # yes/no (if it sneaks in)
    s %in% c("YES", "Y") ~ "Yes",
    s %in% c("NO", "N") ~ "No",

    TRUE ~ "Other / unclear"
  )
}
plot_t1_question <- function(df, col_name, title_text, cleaner_fun) {

  if (!col_name %in% names(df)) stop("Column not found: ", col_name)

  df_plot <- df %>%
    transmute(response = cleaner_fun(.data[[col_name]])) %>%
    filter(!is.na(response)) %>%
    count(response, name = "n") %>%
    mutate(percent = round(100 * n / sum(n), 1)) %>%
    arrange(percent)

  ggplot(df_plot, aes(x = reorder(response, percent), y = percent)) +
    geom_col(fill = "#1F4E79", width = 0.7) +
    geom_text(aes(label = paste0(percent, "% (n=", n, ")")),
              hjust = -0.05, size = 3.6) +
    coord_flip(clip = "off") +
    expand_limits(y = max(df_plot$percent, na.rm = TRUE) + 8) +
    scale_y_continuous(labels = function(x) paste0(x, "%")) +
    theme_bw() +
    labs(title = title_text, x = "", y = "% of respondents") +
    theme(plot.title = element_text(face = "bold", size = 14),
          axis.text = element_text(size = 11))
}

p1 <- plot_t1_question(df_status, col_help_choices,
                       "T1: Nutrition Material Helped Make Healthier Food Choices",
                       cleaner_fun = clean_impact)

p2 <- plot_t1_question(df_status, col_conf_food,
                       "T1: Confidence in Making Food Choices to Control A1C",
                       cleaner_fun = clean_confidence)

p3 <- plot_t1_question(df_status, col_conf_labels,
                       "T1: Confidence Understanding Food Labels / Nutrition Info",
                       cleaner_fun = clean_confidence)

p4 <- plot_t1_question(df_status, col_conf_prepare,
                       "T1: Confidence Preparing Healthy Meals at Home",
                       cleaner_fun = clean_confidence)



print(p1); print(p2); print(p3); print(p4)

impact_tab <- df_status %>%
  transmute(response = clean_impact(.data[[col_help_control]])) %>%
  filter(!is.na(response)) %>%
  count(response, name = "n") %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label = paste0(response, "\n", percent, "% (n=", n, ")")
  )

ggplot(impact_tab, aes(x = "", y = n, fill = response)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(
    aes(label = paste0(percent, "%")),
    position = position_stack(vjust = 0.5),
    size = 4,
    color = "white"
  ) +
  scale_fill_manual(
    values = c(
      "A lot"    = "#1F4E79",
      "Somewhat" = "#2E86C1",
      "A little" = "#5DADE2",
      "Yes"      = "#A9CCE3"
    )
  ) +
  theme_void() +
  labs(
    title = "T1: Program Helped Patient Feel More in Control of Diabetes",
    fill = ""
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    legend.position = "right"
  )

# =========================
# MEALS MADE USING PROGRAM FOOD — T1 + T2


# ---- EDIT if needed to match names(df_status) ----
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"

# ---- safety check ----
need_cols <- c(col_meals_t1, col_meals_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ", paste(missing, collapse = "\n  - "),
       "\n\nFix col_meals_t1 / col_meals_t2 using names(df_status).")
}

# ---- robust cleaner (works for BOTH T1 + T2) ----
clean_meals <- function(x) {

  s <- str_to_lower(str_squish(as.character(x)))

  # convert text numbers to digits
  s <- str_replace_all(s, c(
    "zero"  = "0",
    "one"   = "1",
    "two"   = "2",
    "three" = "3",
    "four"  = "4",
    "five"  = "5",
    "six"   = "6",
    "seven" = "7",
    "eight" = "8"
  ))

  case_when(
    # missing
    is.na(s) ~ NA_character_,
    s %in% c("", "na", "n/a", "null", "unsure of #", "unsure") ~ NA_character_,

    # ran out = 0
    str_detect(s, "ran out") ~ "0",

    # explicit 0 / 1
    str_detect(s, "^0$") ~ "0",
    str_detect(s, "^1$") ~ "1",

    # 2–3 bucket
    str_detect(s, "2-3") | str_detect(s, "2–3") | str_detect(s, "2 to 3") |
      str_detect(s, "^2$") | str_detect(s, "^3$") |
      str_detect(s, "2 a day") | str_detect(s, "3 a day") ~ "2–3",

    # 4–6 bucket
    str_detect(s, "4-6") | str_detect(s, "4 to 6") |
      str_detect(s, "4 to 5") |
      str_detect(s, "3-4") | str_detect(s, "3 to 4") |
      str_detect(s, "^4$") | str_detect(s, "^5$") | str_detect(s, "^6$") |
      str_detect(s, "5-6") ~ "4–6",

    # 7+
    str_detect(s, "7") | str_detect(s, "8") ~ "7+",

    TRUE ~ NA_character_
  )
}

# ---- plotting helper ----

plot_meals <- function(df, col_name, title_text, y_max = 60) {

  tab <- df %>%
    transmute(meals = clean_meals(.data[[col_name]])) %>%
    filter(!is.na(meals)) %>%
    count(meals, name = "n") %>%
    mutate(
      meals = factor(meals, levels = c("0", "1", "2–3", "4–6", "7+")),
      percent = round(100 * n / sum(n), 1),
      label = paste0(percent, "% (n=", n, ")")
    ) %>%
    arrange(meals)

  ggplot(tab, aes(x = meals, y = percent)) +
    geom_col(fill = "#1F4E79", width = 0.7) +
    geom_text(aes(label = label), vjust = -0.35, size = 4) +
    scale_y_continuous(
      limits = c(0, y_max),      # <-- FIXED Y LIMIT
      labels = function(x) paste0(x, "%")
    ) +
    theme_bw() +
    labs(
      title = title_text,
      x = "Number of meals",
      y = "% of respondents"
    ) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      axis.text = element_text(size = 11)
    )
}
p_meals_t1 <- plot_meals(df_status, col_meals_t1,
                         "Meals Prepared Using Program Food (Past Week) — T1",
                         y_max = 60)

p_meals_t2 <- plot_meals(df_status, col_meals_t2,
                         "Meals Prepared Using Program Food (Past Week) — T2",
                         y_max = 60)

print(p_meals_t1)

print(p_meals_t2)

# =========================
# EASE OF USING PROGRAM FOOD IN MEALS (T1)
# =========================
# ---- EDIT to match names(df_status) if needed ----
col_ease_t1 <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2 <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"

if (!col_ease_t1 %in% names(df_status)) {
  stop("Column not found: ", col_ease_t1,
       "\n\nFind the right one with: names(df_status)[str_detect(names(df_status), 'easy')]")
}

# ---- cleaner ----
clean_ease <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,

    # numeric scale variants (adjust if your coding differs)
    s %in% c("1") ~ "Very easy",
    s %in% c("2") ~ "Easy",
    s %in% c("3") ~ "Neutral",

    # text variants
    str_detect(s, "VERY EASY") ~ "Very easy",
    s == "EASY" ~ "Easy",
    str_detect(s, "NEUTRAL") ~ "Neutral",

    TRUE ~ NA_character_
  )
}

ease_summary <- df_status %>%
  transmute(response = clean_ease(.data[[col_ease_t1]])) %>%
  filter(!is.na(response)) %>%
  count(response, name = "n") %>%
  mutate(
    response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE),
    percent = round(100 * n / sum(n), 1),
    label = paste0(percent, "% (n=", n, ")")
  ) %>%
  arrange(response)

ease_summary_t2 <- df_status %>%
  transmute(response = clean_ease(.data[[col_ease_t2]])) %>%
  filter(!is.na(response)) %>%
  count(response, name = "n") %>%
  mutate(
    response = factor(response,
                      levels = c("Neutral", "Easy", "Very easy"),
                      ordered = TRUE),
    percent = round(100 * n / sum(n), 1),
    label = paste0(percent, "% (n=", n, ")")
  ) %>%
  arrange(response)


ggplot(ease_summary, aes(x = response, y = percent)) +
  geom_col(fill = "#2E86C1", width = 0.7) +
  geom_text(aes(label = label), vjust = -0.35, size = 4) +
  scale_y_continuous(labels = function(x) paste0(x, "%"),
                     expand = expansion(mult = c(0, 0.12))) +
  theme_bw() +
  labs(
    title = "Ease of Using Program Food in Meals (T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    axis.text.x = element_text(size = 11),
    plot.title  = element_text(size = 14, face = "bold")
  )

ggplot(ease_summary_t2, aes(x = response, y = percent)) +
  geom_col(fill = "#2E86C1", width = 0.7) +
  geom_text(aes(label = label), vjust = -0.35, size = 4) +
  scale_y_continuous(
    limits = c(0, 60),   # adjust if needed to match T1
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Ease of Using Program Food in Meals (T2)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    axis.text.x = element_text(size = 11),
    plot.title  = element_text(size = 14, face = "bold")
  )

# 1. Identify the T1 waste column explicitly
col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"                                                       
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"  
# ---- Safety check ----
need_cols <- c(col_waste_t1, col_waste_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ",
       paste(missing, collapse = "\n  - "),
       "\n\nRun names(df_status) and correct the column names above.")
}

# ---- Clean Yes / No properly (ignore blanks & non-surveyed) ----
clean_yesno <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
    s %in% c("YES", "Y") ~ "Yes",
    s %in% c("NO", "N")  ~ "No",
    TRUE ~ NA_character_
  )
}

# ---- Function to build summary + plot ----
plot_waste <- function(df, col_name, title_text) {

  tab <- df %>%
    transmute(Response = clean_yesno(.data[[col_name]])) %>%
    filter(!is.na(Response)) %>%
    count(Response, name = "n") %>%
    mutate(
      Response = factor(Response, levels = c("No", "Yes")),
      percent = round(100 * n / sum(n), 1),
      label = paste0(percent, "%\n(n=", n, ")")
    )

  ggplot(tab, aes(x = Response, y = percent, fill = Response)) +
    geom_col(width = 0.6) +
    geom_text(aes(label = label), vjust = -0.3, size = 4) +
    scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
    scale_y_continuous(limits = c(0, 100),
                       labels = function(x) paste0(x, "%")) +
    theme_bw() +
    labs(
      title = title_text,
      x = "",
      y = "% of respondents"
    ) +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 14, face = "bold")
    )
}

# ---- Draw both graphs ----
p_waste_t1 <- plot_waste(df_status, col_waste_t1,
                         "Did Patients Throw Away Delivered Food? (T1)")

p_waste_t2 <- plot_waste(df_status, col_waste_t2,
                         "Did Patients Throw Away Delivered Food? (T2)")

print(p_waste_t1)

print(p_waste_t2)

# =========================
# SLIDE: SAMPLE SNAPSHOT
# =========================

sample_snapshot <- df_status %>%
  count(enrolled_group, cohort_final, name = "N") %>%
  arrange(enrolled_group, cohort_final)

sample_snapshot

## # A tibble: 4 × 3
##   enrolled_group cohort_final     N
##   <chr>          <chr>        <int>
## 1 Enrolled       Cohort 1        60
## 2 Enrolled       Cohort 2        41
## 3 Enrolled       Cohort 3        35
## 4 Not enrolled   Not enrolled   817

# =========================
# SLIDE: DENOMINATORS (T1/T2 response counts)
# Uses your existing is_nonblank()
# =========================

count_answered <- function(df, col) {
  tibble(
    column = col,
    answered_n = sum(is_nonblank(df[[col]])),
    total_n = nrow(df),
    answered_pct = round(100 * answered_n / total_n, 1)
  )
}

# list the columns you’re actually plotting in the deck
deck_cols <- c(
  # A1c change fields you already defined earlier
  a1c_t1_col, a1c_t2_col,

  # utilization
  col_ed_base, col_ed_t1, col_ed_t2,
  col_ip_base, col_ip_t1, col_ip_t2,

  # engagement + experience
  col_meals_t1, col_meals_t2,
  col_ease_t1,  col_ease_t2,
  col_t1_manage,

  # confidence/help
  col_help_choices, col_conf_food, col_conf_labels, col_conf_prepare, col_help_control
)

missing_deck_cols <- setdiff(deck_cols, names(df_status))
if (length(missing_deck_cols) > 0) {
  stop("Deck columns missing:\n  - ", paste(missing_deck_cols, collapse = "\n  - "))
}

denoms_all <- bind_rows(lapply(deck_cols, function(cc) count_answered(df_status, cc))) %>%
  arrange(desc(answered_n))

denoms_all

## # A tibble: 18 × 4
##    column                                        answered_n total_n answered_pct
##    <chr>                                              <int>   <int>        <dbl>
##  1 ed_vis_last_90_days                                  953     953        100  
##  2 hosp_last_90_days                                    953     953        100  
##  3 ed_utilization_t1                                    107     953         11.2
##  4 ip_utilization_t1                                    107     953         11.2
##  5 t1_in_the_past_week_how_many_meals_did_you_m…         66     953          6.9
##  6 t1_in_the_past_week_how_easy_was_it_to_use_t…         64     953          6.7
##  7 t1_how_confident_are_you_in_making_food_choi…         64     953          6.7
##  8 t1_how_confident_are_you_in_understanding_fo…         64     953          6.7
##  9 t1_how_much_did_this_program_help_you_feel_m…         64     953          6.7
## 10 t1_did_the_nutrition_material_you_got_from_w…         61     953          6.4
## 11 t1_how_confident_are_you_in_your_ability_to_…         61     953          6.4
## 12 hospital_utilization_t2                               60     953          6.3
## 13 ed_utilization_t2                                     59     953          6.2
## 14 a1c_change_t1                                         55     953          5.8
## 15 t2_in_the_past_week_how_many_meals_did_you_m…         43     953          4.5
## 16 t2_in_the_past_week_how_easy_was_it_to_use_t…         42     953          4.4
## 17 t1_have_you_done_anything_in_the_past_week_t…         40     953          4.2
## 18 a1c_change_t2                                         31     953          3.3

denoms_by_cohort <- function(df, cols) {
  bind_rows(lapply(cols, function(cc) {
    df %>%
      group_by(cohort_final) %>%
      summarise(
        column = cc,
        answered_n = sum(is_nonblank(.data[[cc]])),
        total_n = n(),
        answered_pct = round(100 * answered_n / total_n, 1),
        .groups = "drop"
      )
  }))
}

denoms_by_cohort(df_status, deck_cols)

## # A tibble: 17,154 × 5
##    cohort_final column                           answered_n total_n answered_pct
##    <chr>        <chr>                                 <int>   <int>        <dbl>
##  1 Cohort 1     Abdominal Pain; Vomiting; Diarr…         29      60         48.3
##  2 Cohort 1     Flank Pain; Difficulty Urinating         29      60         48.3
##  3 Cohort 1     <NA>                                     29      60         48.3
##  4 Cohort 1     Rash                                     29      60         48.3
##  5 Cohort 1     Blood in Urine                           29      60         48.3
##  6 Cohort 1     Fever; Post-op Problem                   29      60         48.3
##  7 Cohort 1     Irregular Heart Beat                     29      60         48.3
##  8 Cohort 1     Foot Swelling; Wound Check               29      60         48.3
##  9 Cohort 1     Chest Pain                               29      60         48.3
## 10 Cohort 1     Hyperglycemia; Nausea; Diarrhea          29      60         48.3
## # ℹ 17,144 more rows

# =========================
# SLIDE: A1c CHANGE SUMMARY TABLES
# Uses your existing parse_delta()
# =========================

a1c_change_summary <- function(df_in, delta_col, label) {
  d <- df_in %>%
    transmute(delta = parse_delta(.data[[delta_col]])) %>%
    filter(!is.na(delta)) %>%
    mutate(
      category = case_when(
        delta <= -1.0 ~ "Improved (≥1.0% ↓)",
        delta >=  1.0 ~ "Worsened (≥1.0% ↑)",
        TRUE          ~ "No meaningful change (<1.0%)"
      ),
      category = factor(category, levels = c(
        "Improved (≥1.0% ↓)",
        "No meaningful change (<1.0%)",
        "Worsened (≥1.0% ↑)"
      ))
    )

  tab <- d %>%
    count(category, name = "n") %>%
    mutate(
      pct = round(100 * n / sum(n), 1),
      N_total = sum(n),
      panel = label
    )

  tab
}

a1c_t1_sum <- a1c_change_summary(
  df_status %>% filter(cohort_final %in% c("Cohort 1", "Cohort 2")),
  a1c_t1_col,
  "T1: Cohort 1 + 2"
)

a1c_t2_sum <- a1c_change_summary(
  df_status %>% filter(cohort_final == "Cohort 1"),
  a1c_t2_col,
  "T2: Cohort 1"
)

bind_rows(a1c_t1_sum, a1c_t2_sum)

## # A tibble: 6 × 5
##   category                         n   pct N_total panel           
##   <fct>                        <int> <dbl>   <int> <chr>           
## 1 Improved (≥1.0% ↓)              32  62.7      51 T1: Cohort 1 + 2
## 2 No meaningful change (<1.0%)    17  33.3      51 T1: Cohort 1 + 2
## 3 Worsened (≥1.0% ↑)               2   3.9      51 T1: Cohort 1 + 2
## 4 Improved (≥1.0% ↓)              19  63.3      30 T2: Cohort 1    
## 5 No meaningful change (<1.0%)     9  30        30 T2: Cohort 1    
## 6 Worsened (≥1.0% ↑)               2   6.7      30 T2: Cohort 1

# =========================
# SLIDE: UTILIZATION CHANGE TABLE (Baseline -> T1), Enrolled
# Uses df_util_pairs you already create
# =========================

util_change_summary <- function(df, base_var, t1_var, label) {
  df %>%
    transmute(
      domain = label,
      change = case_when(
        .data[[t1_var]] < .data[[base_var]] ~ "Less",
        .data[[t1_var]] > .data[[base_var]] ~ "More",
        TRUE                                 ~ "No change"
      )
    ) %>%
    count(domain, change, name = "n") %>%
    group_by(domain) %>%
    mutate(
      pct = round(100 * n / sum(n), 1),
      N = sum(n)
    ) %>%
    ungroup() %>%
    mutate(change = factor(change, levels = c("Less", "No change", "More")))
}

ed_change_sum <- util_change_summary(df_util_pairs, "ED_base", "ED_T1", "ED visits")
ip_change_sum <- util_change_summary(df_util_pairs, "IP_base", "IP_T1", "IP stays")

bind_rows(ed_change_sum, ip_change_sum)

## # A tibble: 6 × 5
##   domain    change        n   pct     N
##   <chr>     <fct>     <int> <dbl> <int>
## 1 ED visits Less         58  57.4   101
## 2 ED visits More          9   8.9   101
## 3 ED visits No change    34  33.7   101
## 4 IP stays  Less         62  61.4   101
## 5 IP stays  More          6   5.9   101
## 6 IP stays  No change    33  32.7   101

# =========================
# COHORT 1 — HbA1c SLIDE-READY STATS TABLE
# Uses existing: df_c1, clean_a1c(), col_a1c_all, col_a1c_t1, col_a1c_t2
# =========================

stopifnot(exists("df_c1"))
need_cols <- c(col_a1c_all, col_a1c_t1, col_a1c_t2)
missing <- setdiff(need_cols, names(df_c1))
if (length(missing) > 0) {
  stop("Missing HbA1c columns in df_c1:\n  - ", paste(missing, collapse = "\n  - "))
}

# ---- Build Cohort 1 HbA1c numeric fields ----
a1c_c1 <- df_c1 %>%
  transmute(
    id = row_number(),
    a1c_base = clean_a1c(.data[[col_a1c_all]]),
    a1c_t1   = clean_a1c(.data[[col_a1c_t1]]),
    a1c_t2   = clean_a1c(.data[[col_a1c_t2]])
  )

# ---- helpers for clean formatting ----
fmt_mean_sd <- function(x, digits = 2) {
  x <- x[!is.na(x)]
  if (length(x) == 0) return("-")
  paste0(
    format(round(mean(x), digits), nsmall = digits),
    " ± ",
    format(round(sd(x), digits), nsmall = digits)
  )
}

fmt_num <- function(x, digits = 2) {
  if (is.na(x)) return(NA_character_)
  format(round(x, digits), nsmall = digits)
}

fmt_p <- function(p) {
  if (is.na(p)) return(NA_character_)
  if (p < 0.001) return("<0.001")
  format(round(p, 3), nsmall = 3)
}

# ---- one comparison -> one clean row ----
a1c_compare_row <- function(df, x_var, y_var, label, digits = 2) {

  d <- df %>%
    transmute(x = .data[[x_var]], y = .data[[y_var]]) %>%
    filter(!is.na(x) & !is.na(y)) %>%
    mutate(diff = y - x)

  n <- nrow(d)
  if (n < 3) {
    return(tibble(
      Comparison = label,
      `n (paired)` = n,
      `Baseline mean ± SD` = "-",
      `Follow-up mean ± SD` = "-",
      `Mean Δ (pp) [95% CI]` = "Not enough paired data",
      `Median Δ (pp)` = "-",
      `p (paired t)` = "-"
    ))
  }

  tt <- t.test(d$y, d$x, paired = TRUE)

  tibble(
    Comparison = label,
    `n (paired)` = n,
    `Baseline mean ± SD`  = fmt_mean_sd(d$x, digits),
    `Follow-up mean ± SD` = fmt_mean_sd(d$y, digits),
    `Mean Δ (pp) [95% CI]` = paste0(
      fmt_num(mean(d$diff), digits),
      " [",
      fmt_num(tt$conf.int[1], digits),
      ", ",
      fmt_num(tt$conf.int[2], digits),
      "]"
    ),
    `Median Δ (pp)` = fmt_num(median(d$diff), digits),
    `p (paired t)`  = fmt_p(tt$p.value)
  )
}

# ---- MAIN SLIDE TABLE (clean) ----
a1c_slide_table_c1 <- bind_rows(
  a1c_compare_row(a1c_c1, "a1c_base", "a1c_t1", "Cohort 1: Baseline → T1"),
  a1c_compare_row(a1c_c1, "a1c_base", "a1c_t2", "Cohort 1: Baseline → T2"),
  a1c_compare_row(a1c_c1, "a1c_t1",   "a1c_t2", "Cohort 1: T1 → T2")
)

a1c_slide_table_c1

## # A tibble: 3 × 7
##   Comparison             `n (paired)` `Baseline mean ± SD` `Follow-up mean ± SD`
##   <chr>                         <int> <chr>                <chr>                
## 1 Cohort 1: Baseline → …           29 10.24 ± 2.40         7.91 ± 1.81          
## 2 Cohort 1: Baseline → …           30 10.62 ± 2.03         8.57 ± 2.27          
## 3 Cohort 1: T1 → T2                18 7.82 ± 1.72          7.68 ± 1.94          
## # ℹ 3 more variables: `Mean Δ (pp) [95% CI]` <chr>, `Median Δ (pp)` <chr>,
## #   `p (paired t)` <chr>

# Optional: nice rendering in Rmd
# knitr::kable(a1c_slide_table_c1, caption = "Cohort 1 HbA1c change (paired)")

# ---- OPTIONAL APPENDIX TABLE (diagnostics + nonparametric) ----
a1c_diag_row <- function(df, x_var, y_var, label) {

  d <- df %>%
    transmute(x = .data[[x_var]], y = .data[[y_var]]) %>%
    filter(!is.na(x) & !is.na(y)) %>%
    mutate(diff = y - x)

  n <- nrow(d)
  if (n < 3) {
    return(tibble(
      Comparison = label,
      `n (paired)` = n,
      `Shapiro p (diff)` = "-",
      `p (Wilcoxon signed-rank)` = "-"
    ))
  }

  sh <- shapiro.test(d$diff)
  wx <- wilcox.test(d$y, d$x, paired = TRUE, exact = FALSE)

  tibble(
    Comparison = label,
    `n (paired)` = n,
    `Shapiro p (diff)` = fmt_p(sh$p.value),
    `p (Wilcoxon signed-rank)` = fmt_p(wx$p.value)
  )
}

a1c_appendix_diag_c1 <- bind_rows(
  a1c_diag_row(a1c_c1, "a1c_base", "a1c_t1", "Cohort 1: Baseline → T1"),
  a1c_diag_row(a1c_c1, "a1c_base", "a1c_t2", "Cohort 1: Baseline → T2"),
  a1c_diag_row(a1c_c1, "a1c_t1",   "a1c_t2", "Cohort 1: T1 → T2")
)

a1c_appendix_diag_c1

## # A tibble: 3 × 4
##   Comparison              `n (paired)` `Shapiro p (diff)` p (Wilcoxon signed-r…¹
##   <chr>                          <int> <chr>              <chr>                 
## 1 Cohort 1: Baseline → T1           29 0.663              <0.001                
## 2 Cohort 1: Baseline → T2           30 0.964              <0.001                
## 3 Cohort 1: T1 → T2                 18 0.017              0.541                 
## # ℹ abbreviated name: ¹`p (Wilcoxon signed-rank)`

# =========================
# COHORT 1 HbA1c — PAIRED CHANGE PLOT (spaghetti + mean)
# =========================

stopifnot(exists("df_c1"))

a1c_pairs_long <- df_c1 %>%
  transmute(
    id = row_number(),
    Baseline = clean_a1c(.data[[col_a1c_all]]),
    T1       = clean_a1c(.data[[col_a1c_t1]]),
    T2       = clean_a1c(.data[[col_a1c_t2]])
  ) %>%
  pivot_longer(cols = c(Baseline, T1, T2),
               names_to = "Timepoint",
               values_to = "HbA1c") %>%
  filter(!is.na(HbA1c)) %>%
  mutate(
    Timepoint = factor(Timepoint, levels = c("Baseline", "T1", "T2"))
  )

# Only keep patients who have baseline AND at least one follow-up
a1c_keep_ids <- a1c_pairs_long %>%
  group_by(id) %>%
  summarize(has_base = any(Timepoint == "Baseline"),
            has_fu   = any(Timepoint %in% c("T1","T2")),
            .groups = "drop") %>%
  filter(has_base & has_fu) %>%
  pull(id)

a1c_pairs_long <- a1c_pairs_long %>% filter(id %in% a1c_keep_ids)

# mean points per timepoint (computed from the plotted data)
a1c_means <- a1c_pairs_long %>%
  group_by(Timepoint) %>%
  summarize(
    mean_a1c = mean(HbA1c, na.rm = TRUE),
    n = n_distinct(id),
    .groups = "drop"
  )

ggplot(a1c_pairs_long, aes(x = Timepoint, y = HbA1c, group = id)) +
  geom_line(alpha = 0.25, linewidth = 0.6) +
  geom_point(alpha = 0.55, size = 1.6) +
  geom_point(data = a1c_means, aes(x = Timepoint, y = mean_a1c),
             inherit.aes = FALSE, size = 3.2) +
  geom_line(data = a1c_means, aes(x = Timepoint, y = mean_a1c, group = 1),
            inherit.aes = FALSE, linewidth = 1.1) +
  labs(
    title = "Cohort 1 HbA1c Over Time (Paired Patients)",
    subtitle = "Thin lines = individual patients; thick line/dots = mean among plotted patients",
    x = NULL,
    y = "HbA1c (%)"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text  = element_text(size = 11)
  )

# =========================
# COHORT 1 HbA1c — CHANGE DISTRIBUTION (Δ) with mean + 95% CI
# =========================

a1c_diff_long <- df_c1 %>%
  transmute(
    id = row_number(),
    a1c_base = clean_a1c(.data[[col_a1c_all]]),
    a1c_t1   = clean_a1c(.data[[col_a1c_t1]]),
    a1c_t2   = clean_a1c(.data[[col_a1c_t2]])
  )

# Build paired diffs (follow-up minus baseline)
diff_t1 <- a1c_diff_long %>%
  filter(!is.na(a1c_base) & !is.na(a1c_t1)) %>%
  transmute(Comparison = "Baseline → T1", diff = a1c_t1 - a1c_base)

diff_t2 <- a1c_diff_long %>%
  filter(!is.na(a1c_base) & !is.na(a1c_t2)) %>%
  transmute(Comparison = "Baseline → T2", diff = a1c_t2 - a1c_base)

diff_plot <- bind_rows(diff_t1, diff_t2) %>%
  mutate(Comparison = factor(Comparison, levels = c("Baseline → T1", "Baseline → T2")))

# Mean + 95% CI per comparison (paired diff CI)
diff_ci <- diff_plot %>%
  group_by(Comparison) %>%
  summarize(
    n = n(),
    mean_diff = mean(diff, na.rm = TRUE),
    tt = list(t.test(diff, mu = 0)),
    lo = tt[[1]]$conf.int[1],
    hi = tt[[1]]$conf.int[2],
    .groups = "drop"
  )

ggplot(diff_plot, aes(x = Comparison, y = diff)) +
  geom_hline(yintercept = 0, linetype = "dashed", linewidth = 0.7) +
  geom_boxplot(width = 0.35, outlier.shape = NA, alpha = 0.25) +
  geom_jitter(width = 0.12, height = 0, alpha = 0.55, size = 1.6) +
  geom_point(data = diff_ci, aes(x = Comparison, y = mean_diff),
             inherit.aes = FALSE, size = 3.0) +
  geom_errorbar(data = diff_ci, aes(x = Comparison, ymin = lo, ymax = hi),
                inherit.aes = FALSE, width = 0.10, linewidth = 0.9) +
  labs(
    title = "Cohort 1 HbA1c Change (Δ) with 95% CI",
    subtitle = "Δ = follow-up HbA1c − baseline HbA1c (negative = improvement)",
    x = NULL,
    y = "HbA1c change (percentage points)"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text  = element_text(size = 11)
  )

# =========================
# FIND DISCONTINUATION FIELDS (by column name)
# =========================

kw <- c(
  "discont", "discharg", "drop", "withdraw", "declin", "expire", "deceas",
  "status", "active", "inactive", "program", "enroll", "end", "stop", "reason", "closed"
)

cand_cols <- names(df_status)[
  Reduce(`|`, lapply(kw, function(k) str_detect(names(df_status), fixed(k))))
]

cand_cols

##  [1] "pre_enrollment_notes"                                                                                                
##  [2] "enrolled"                                                                                                            
##  [3] "discharge_disposition"                                                                                               
##  [4] "pt_portal_status"                                                                                                    
##  [5] "ps_in_the_past_12_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"
##  [6] "pre_screen_interested_in_program"                                                                                    
##  [7] "screen_for_enrollment_interested_in_program"                                                                         
##  [8] "t0_since_your_discharge_from_the_hospital_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"  
##  [9] "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"                                         
## [10] "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"                                         
## [11] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_58"          
## [12] "t1_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"                                       
## [13] "t2_in_the_past_6_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more" 
## [14] "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"                                         
## [15] "t2_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"                                         
## [16] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_81"          
## [17] "t2_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"                                       
## [18] "enrolled_clean"                                                                                                      
## [19] "enrolled_group"

# ---- Total counts table ----
total_counts <- df_status %>%
  summarise(
    Enrolled = sum(enrolled_group == "Enrolled", na.rm = TRUE),
    Cohort_1 = sum(cohort_final == "Cohort 1", na.rm = TRUE),
    Cohort_2 = sum(cohort_final == "Cohort 2", na.rm = TRUE),
    Cohort_3 = sum(cohort_final == "Cohort 3", na.rm = TRUE)
  )

total_counts

## # A tibble: 1 × 4
##   Enrolled Cohort_1 Cohort_2 Cohort_3
##      <int>    <int>    <int>    <int>
## 1      136       60       41       35

parse_delta <- function(x) {
  suppressWarnings(as.numeric(stringr::str_replace_all(
    stringr::str_squish(as.character(x)),
    "[^0-9.\\-]", ""
  )))
}

a1c_cards <- function(data, delta_col, cohort_filter, label) {

  d <- data %>%
    filter(!!rlang::parse_expr(cohort_filter)) %>%
    transmute(delta = parse_delta(.data[[delta_col]])) %>%
    filter(!is.na(delta)) %>%
    mutate(
      bucket = dplyr::case_when(
        delta <= -1.0 ~ "Improved",
        delta >=  1.0 ~ "Worsened",
        TRUE          ~ "No change"
      )
    )

  out <- d %>%
    count(bucket, name = "n") %>%
    mutate(
      N = sum(n),
      pct = round(100 * n / N, 1)
    ) %>%
    arrange(factor(bucket, levels = c("Improved","No change","Worsened"))) %>%
    mutate(Slide = label) %>%
    select(Slide, bucket, n, N, pct)

  out
}

# ---- VERIFY: A1C Change at T1: Cohort 1 + Cohort 2 (n should be 51 on slide) ----
a1c_t1_verify <- a1c_cards(
  df_status,
  delta_col = a1c_t1_col,
  cohort_filter = "cohort_final %in% c('Cohort 1','Cohort 2')",
  label = "A1C Change (T1): Cohort 1 + Cohort 2"
)
a1c_t1_verify

## # A tibble: 3 × 5
##   Slide                                bucket        n     N   pct
##   <chr>                                <chr>     <int> <int> <dbl>
## 1 A1C Change (T1): Cohort 1 + Cohort 2 Improved     32    51  62.7
## 2 A1C Change (T1): Cohort 1 + Cohort 2 No change    17    51  33.3
## 3 A1C Change (T1): Cohort 1 + Cohort 2 Worsened      2    51   3.9

# ---- VERIFY: A1C Change at T2: Cohort 1 only (n should be 30 on slide) ----
a1c_t2_verify <- a1c_cards(
  df_status,
  delta_col = a1c_t2_col,
  cohort_filter = "cohort_final == 'Cohort 1'",
  label = "A1C Change (T2): Cohort 1"
)
a1c_t2_verify

## # A tibble: 3 × 5
##   Slide                     bucket        n     N   pct
##   <chr>                     <chr>     <int> <int> <dbl>
## 1 A1C Change (T2): Cohort 1 Improved     19    30  63.3
## 2 A1C Change (T2): Cohort 1 No change     9    30  30  
## 3 A1C Change (T2): Cohort 1 Worsened      2    30   6.7

# =========================
# A1C (PDSA language, cohort-specific ENROLLED stats)
# Uses your existing make_a1c_plots() + a1c_change_* columns
# Does NOT modify df_status
# =========================

stopifnot(exists("df_status"))
stopifnot(exists("make_a1c_plots"))
stopifnot("enrolled_group" %in% names(df_status))
stopifnot("cohort_final" %in% names(df_status))

# ---- ENROLLED-only analysis frame (new object; df_status untouched) ----
df_enrolled <- df_status %>% dplyr::filter(enrolled_group == "Enrolled")

# ---- Map your existing delta columns to how YOU will speak (PDSA) ----
# IMPORTANT: we are NOT re-computing deltas here; we are just relabeling.
# Adjust labels if needed, but keep the columns.
cycle_map <- list(
  # You said: Cohort 2 uses "t0->t1" in the Nov→Feb window.
  # In your current dataset, that delta is stored in a1c_change_t1.
  PDSA2_C2 = list(
    cohort = "Cohort 2",
    delta_col = a1c_t1_col,
    title = "A1C Change — PDSA Cycle 2 (Nov→Feb): Cohort 2 (ENROLLED)"
  ),

  # Cohort 1: you want to discuss baseline→3mo (Aug→Oct) AND later (Nov→Feb).
  # With your current objects, you only have two delta fields: a1c_change_t1 and a1c_change_t2.
  # Use whichever your chart reviewer intended:
  # - a1c_change_t1 = earlier delta
  # - a1c_change_t2 = later / most recent delta
  PDSA1_C1 = list(
    cohort = "Cohort 1",
    delta_col = a1c_t1_col,
    title = "A1C Change — PDSA Cycle 1 (Aug→Oct): Cohort 1 (ENROLLED)"
  ),
  PDSA2_C1 = list(
    cohort = "Cohort 1",
    delta_col = a1c_t2_col,
    title = "A1C Change — PDSA Cycle 2 (Nov→Feb): Cohort 1 (ENROLLED)"
  )
)

# ---- Safety checks for the mapped columns ----
need_delta_cols <- unique(vapply(cycle_map, function(x) x$delta_col, character(1)))
missing_delta_cols <- setdiff(need_delta_cols, names(df_status))
if (length(missing_delta_cols) > 0) {
  stop(
    "Missing A1C delta columns in df_status:\n  - ",
    paste(missing_delta_cols, collapse = "\n  - "),
    "\n\nFix your a1c_t*_col variables or cycle_map."
  )
}

# ---- Run each cohort/cycle using your existing plot function ----
run_cycle <- function(df_in, cohort_name, delta_col, title_text) {
  df_cycle <- df_in %>% dplyr::filter(cohort_final == cohort_name)
  out <- make_a1c_plots(df_cycle, delta_col, title_text)
  list(tab = out$tab, bar = out$bar, dist = out$dist)
}

# Cohort 1 PDSA1 (uses a1c_change_t1 as stored)
res_pdsa1_c1 <- run_cycle(df_enrolled, cycle_map$PDSA1_C1$cohort,
                          cycle_map$PDSA1_C1$delta_col, cycle_map$PDSA1_C1$title)

# Cohort 1 PDSA2 (uses a1c_change_t2 as stored)
res_pdsa2_c1 <- run_cycle(df_enrolled, cycle_map$PDSA2_C1$cohort,
                          cycle_map$PDSA2_C1$delta_col, cycle_map$PDSA2_C1$title)

# Cohort 2 PDSA2 (uses a1c_change_t1 as stored)
res_pdsa2_c2 <- run_cycle(df_enrolled, cycle_map$PDSA2_C2$cohort,
                          cycle_map$PDSA2_C2$delta_col, cycle_map$PDSA2_C2$title)

# ---- Outputs (tables + slide-ready bar charts) ----
res_pdsa1_c1$tab

## # A tibble: 3 × 4
##   Category                 n   pct label        
##   <fct>                <int> <dbl> <chr>        
## 1 Improved (≥1.0%)        19  67.9 67.9%  (n=19)
## 2 No meaningful change     8  28.6 28.6%  (n=8) 
## 3 Worsened (≥1.0%)         1   3.6 3.6%  (n=1)

res_pdsa1_c1$bar

res_pdsa2_c1$tab

## # A tibble: 3 × 4
##   Category                 n   pct label        
##   <fct>                <int> <dbl> <chr>        
## 1 Improved (≥1.0%)        19  63.3 63.3%  (n=19)
## 2 No meaningful change     9  30   30%  (n=9)   
## 3 Worsened (≥1.0%)         2   6.7 6.7%  (n=2)

res_pdsa2_c1$bar

res_pdsa2_c2$tab

## # A tibble: 3 × 4
##   Category                 n   pct label        
##   <fct>                <int> <dbl> <chr>        
## 1 Improved (≥1.0%)        13  56.5 56.5%  (n=13)
## 2 No meaningful change     9  39.1 39.1%  (n=9) 
## 3 Worsened (≥1.0%)         1   4.3 4.3%  (n=1)

res_pdsa2_c2$bar

# ---- helpers (do NOT modify df_status) ----
is_nonblank <- function(x) {
  x_chr <- as.character(x)
  x_std <- stringr::str_to_upper(stringr::str_squish(x_chr))
  !(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}

pct <- function(n, d) ifelse(d == 0, NA_real_, round(100 * n / d, 1))

parse_num <- function(x) {
  suppressWarnings(as.numeric(stringr::str_replace_all(
    stringr::str_squish(as.character(x)),
    "[^0-9.\\-]", ""
  )))
}

parse_count <- function(x) {
  s <- stringr::str_squish(as.character(x))
  s[stringr::str_to_upper(s) %in% c("", "NA","N/A","NULL")] <- NA_character_
  suppressWarnings(as.numeric(stringr::str_replace_all(s, "[^0-9\\-\\.]", "")))
}

clean_yesno <- function(x) {
  s <- stringr::str_to_upper(stringr::str_squish(as.character(x)))
  dplyr::case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA","N/A","NULL") ~ NA_character_,
    s %in% c("Y","YES") ~ "Yes",
    s %in% c("N","NO")  ~ "No",
    TRUE ~ NA_character_
  )
}
race_col <- "race"
eth_col  <- "ethnicity"
sex_col  <- "legal_sex"

demo_ns <- df_status %>%
  dplyr::summarise(
    all_race_n = sum(is_nonblank(.data[[race_col]])),
    enr_race_n = sum(enrolled_group == "Enrolled" & is_nonblank(.data[[race_col]])),

    all_eth_n  = sum(is_nonblank(.data[[eth_col]])),
    enr_eth_n  = sum(enrolled_group == "Enrolled" & is_nonblank(.data[[eth_col]])),

    all_sex_n  = sum(is_nonblank(.data[[sex_col]])),
    enr_sex_n  = sum(enrolled_group == "Enrolled" & is_nonblank(.data[[sex_col]]))
  )

demo_ns

## # A tibble: 1 × 6
##   all_race_n enr_race_n all_eth_n enr_eth_n all_sex_n enr_sex_n
##        <int>      <int>     <int>     <int>     <int>     <int>
## 1        953        136       953       136       953       136

race_map <- c(
  "R1" = "American Indian / Alaska Native",
  "R2" = "Asian",
  "R3" = "Black or African American",
  "R4" = "Native Hawaiian / Pacific Islander",
  "R5" = "White",
  "R9" = "Other"
)

race_long <- df_status %>%
  dplyr::transmute(
    enrolled_group,
    Race_raw = stringr::str_squish(as.character(.data[[race_col]]))
  ) %>%
  dplyr::filter(is_nonblank(Race_raw)) %>%
  dplyr::mutate(Race_raw = stringr::str_split(Race_raw, "\\r?\\n")) %>%
  tidyr::unnest(Race_raw) %>%
  dplyr::mutate(
    Race_code  = stringr::str_extract(Race_raw, "^R\\d+"),
    Race_clean = unname(race_map[Race_code])
  ) %>%
  dplyr::filter(!is.na(Race_clean))

race_tab <- dplyr::bind_rows(
  race_long %>% dplyr::mutate(Group = "All Patients"),
  race_long %>% dplyr::filter(enrolled_group == "Enrolled") %>% dplyr::mutate(Group = "Enrolled")
) %>%
  dplyr::count(Group, Race_clean, name = "n") %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(
    N = sum(n),
    percent = pct(n, N)
  ) %>%
  dplyr::ungroup()

# If the slide only shows these 4:
race_slide <- race_tab %>%
  dplyr::filter(Race_clean %in% c("Black or African American","Other","White","Asian")) %>%
  dplyr::arrange(dplyr::desc(Group), dplyr::desc(percent))

race_slide

## # A tibble: 8 × 5
##   Group        Race_clean                    n     N percent
##   <chr>        <chr>                     <int> <int>   <dbl>
## 1 Enrolled     Black or African American    57   135    42.2
## 2 Enrolled     Other                        48   135    35.6
## 3 Enrolled     White                        27   135    20  
## 4 Enrolled     Asian                         2   135     1.5
## 5 All Patients White                       379   924    41  
## 6 All Patients Other                       261   924    28.2
## 7 All Patients Black or African American   248   924    26.8
## 8 All Patients Asian                        30   924     3.2

eth_tab_full <- df_status %>%
  dplyr::transmute(
    Group = dplyr::if_else(enrolled_group == "Enrolled", "Enrolled", "All Patients"),
    Eth_raw = stringr::str_to_upper(stringr::str_squish(as.character(.data[[eth_col]]))),
    Eth_clean = dplyr::case_when(
      stringr::str_detect(Eth_raw, "E1") ~ "Spanish/Hispanic/Latino",
      stringr::str_detect(Eth_raw, "E2") ~ "Not Hispanic/Latino",
      TRUE ~ "Missing/Declined/Unknown"
    )
  ) %>%
  dplyr::count(Group, Eth_clean, name = "n") %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(
    N = sum(n),
    percent = round(100 * n / N, 1)
  ) %>%
  dplyr::ungroup()

eth_tab_full

## # A tibble: 6 × 5
##   Group        Eth_clean                    n     N percent
##   <chr>        <chr>                    <int> <int>   <dbl>
## 1 All Patients Missing/Declined/Unknown    37   817     4.5
## 2 All Patients Not Hispanic/Latino        556   817    68.1
## 3 All Patients Spanish/Hispanic/Latino    224   817    27.4
## 4 Enrolled     Missing/Declined/Unknown     1   136     0.7
## 5 Enrolled     Not Hispanic/Latino         83   136    61  
## 6 Enrolled     Spanish/Hispanic/Latino     52   136    38.2

sex_base <- df_status %>%
  dplyr::transmute(
    enrolled_group,
    Sex = stringr::str_squish(as.character(.data[[sex_col]]))
  ) %>%
  dplyr::filter(is_nonblank(Sex))

sex_tab <- dplyr::bind_rows(
  sex_base %>% dplyr::mutate(Group = "All Patients"),
  sex_base %>% dplyr::filter(enrolled_group == "Enrolled") %>% 
    dplyr::mutate(Group = "Enrolled")
) %>%
  dplyr::count(Group, Sex, name = "n") %>%
  dplyr::group_by(Group) %>%
  dplyr::mutate(
    N = sum(n),
    percent = round(100 * n / N, 1)
  ) %>%
  dplyr::ungroup()

sex_tab

## # A tibble: 4 × 5
##   Group        Sex        n     N percent
##   <chr>        <chr>  <int> <int>   <dbl>
## 1 All Patients Female   414   953    43.4
## 2 All Patients Male     539   953    56.6
## 3 Enrolled     Female    74   136    54.4
## 4 Enrolled     Male      62   136    45.6

# ============================================================
# FOOD UTILIZATION + WASTE (T1 + T2)
# Uses existing df_status object (DO NOT redefine df_status)
# Focus: ENROLLED patients, stratified by cohort_final
# ============================================================

# ---- Dependencies assumed already loaded: dplyr, stringr, tidyr, ggplot2 ----

# -------------------------
# 0) Reusable helpers (define ONCE)
# -------------------------
is_nonblank <- function(x) {
  x_chr <- as.character(x)
  x_std <- stringr::str_to_upper(stringr::str_squish(x_chr))
  !(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}

clean_yesno <- function(x) {
  s <- stringr::str_to_upper(stringr::str_squish(as.character(x)))
  dplyr::case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
    s %in% c("YES", "Y") ~ "Yes",
    s %in% c("NO", "N")  ~ "No",
    TRUE ~ NA_character_
  )
}

clean_meals <- function(x) {
  s <- stringr::str_to_lower(stringr::str_squish(as.character(x)))

  # text -> digits
  s <- stringr::str_replace_all(s, c(
    "zero"  = "0",
    "one"   = "1",
    "two"   = "2",
    "three" = "3",
    "four"  = "4",
    "five"  = "5",
    "six"   = "6",
    "seven" = "7",
    "eight" = "8"
  ))

  dplyr::case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "na", "n/a", "null", "unsure of #", "unsure") ~ NA_character_,

    stringr::str_detect(s, "ran out") ~ "0",

    stringr::str_detect(s, "^0$") ~ "0",
    stringr::str_detect(s, "^1$") ~ "1",

    stringr::str_detect(s, "2-3") | stringr::str_detect(s, "2–3") | stringr::str_detect(s, "2 to 3") |
      stringr::str_detect(s, "^2$") | stringr::str_detect(s, "^3$") |
      stringr::str_detect(s, "2 a day") | stringr::str_detect(s, "3 a day") ~ "2–3",

    stringr::str_detect(s, "4-6") | stringr::str_detect(s, "4 to 6") |
      stringr::str_detect(s, "4 to 5") |
      stringr::str_detect(s, "3-4") | stringr::str_detect(s, "3 to 4") |
      stringr::str_detect(s, "^4$") | stringr::str_detect(s, "^5$") | stringr::str_detect(s, "^6$") |
      stringr::str_detect(s, "5-6") ~ "4–6",

    stringr::str_detect(s, "\\b7\\b") | stringr::str_detect(s, "\\b8\\b") ~ "7+",
    TRUE ~ NA_character_
  )
}

clean_ease <- function(x) {
  s <- stringr::str_to_upper(stringr::str_squish(as.character(x)))
  dplyr::case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,

    # numeric variants
    s %in% c("1") ~ "Very easy",
    s %in% c("2") ~ "Easy",
    s %in% c("3") ~ "Neutral",

    # text variants
    stringr::str_detect(s, "VERY EASY") ~ "Very easy",
    s == "EASY" ~ "Easy",
    stringr::str_detect(s, "NEUTRAL") ~ "Neutral",
    TRUE ~ NA_character_
  )
}

# -------------------------
# 1) Column names (EDIT ONLY IF NEEDED)
# -------------------------
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"

col_ease_t1  <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2  <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"

col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"

need_cols <- c(
  "enrolled_group", "cohort_final",
  col_meals_t1, col_meals_t2,
  col_ease_t1, col_ease_t2,
  col_waste_t1, col_waste_t2
)

missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) {
  stop("Missing columns:\n  - ", paste(missing_cols, collapse = "\n  - "))
}

# -------------------------
# 2) Base enrolled dataset (cohort-specific, enrolled only)
# -------------------------
df_enrolled <- df_status %>%
  dplyr::filter(enrolled_group == "Enrolled", cohort_final %in% c("Cohort 1", "Cohort 2", "Cohort 3")) %>%
  dplyr::mutate(cohort_final = factor(cohort_final, levels = c("Cohort 1", "Cohort 2", "Cohort 3")))

# ============================================================
# A) MEALS MADE USING PROGRAM FOOD (T1 + T2)
# ============================================================
make_meals_tab <- function(df, col_name, time_label) {
  df %>%
    dplyr::transmute(
      cohort_final,
      response = clean_meals(.data[[col_name]])
    ) %>%
    dplyr::filter(!is.na(response)) %>%
    dplyr::mutate(
      response = factor(response, levels = c("0", "1", "2–3", "4–6", "7+"), ordered = TRUE)
    ) %>%
    dplyr::count(cohort_final, response, name = "n") %>%
    dplyr::group_by(cohort_final) %>%
    dplyr::mutate(
      N = sum(n),
      percent = round(100 * n / N, 1),
      timepoint = time_label
    ) %>%
    dplyr::ungroup()
}

plot_meals_by_cohort <- function(tab, title_text, y_max = 70) {
  ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
    ggplot2::geom_col(width = 0.75, fill = "#1F4E79") +
    ggplot2::geom_text(
      ggplot2::aes(label = paste0(percent, "%\n(n=", n, ")")),
      vjust = -0.25, size = 3.5
    ) +
    ggplot2::facet_wrap(~ cohort_final, nrow = 1) +
    ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
    ggplot2::theme_bw() +
    ggplot2::labs(title = title_text, x = "Meals using program food (past week)", y = "% within cohort") +
    ggplot2::theme(
      plot.title = ggplot2::element_text(face = "bold", size = 14),
      axis.text.x = ggplot2::element_text(size = 10)
    )
}

meals_t1_tab <- make_meals_tab(df_enrolled, col_meals_t1, "T1")
meals_t2_tab <- make_meals_tab(df_enrolled, col_meals_t2, "T2")

meals_t1_tab

## # A tibble: 8 × 6
##   cohort_final response     n     N percent timepoint
##   <fct>        <ord>    <int> <int>   <dbl> <chr>    
## 1 Cohort 1     0            3    39     7.7 T1       
## 2 Cohort 1     1            1    39     2.6 T1       
## 3 Cohort 1     2–3         22    39    56.4 T1       
## 4 Cohort 1     4–6         12    39    30.8 T1       
## 5 Cohort 1     7+           1    39     2.6 T1       
## 6 Cohort 2     2–3         10    17    58.8 T1       
## 7 Cohort 2     4–6          4    17    23.5 T1       
## 8 Cohort 2     7+           3    17    17.6 T1

plot_meals_by_cohort(meals_t1_tab, "Meals Made Using Program Food — T1 (Enrolled, by Cohort)", y_max = 70)

meals_t2_tab

## # A tibble: 5 × 6
##   cohort_final response     n     N percent timepoint
##   <fct>        <ord>    <int> <int>   <dbl> <chr>    
## 1 Cohort 1     0            3    41     7.3 T2       
## 2 Cohort 1     1            1    41     2.4 T2       
## 3 Cohort 1     2–3         17    41    41.5 T2       
## 4 Cohort 1     4–6         15    41    36.6 T2       
## 5 Cohort 1     7+           5    41    12.2 T2

plot_meals_by_cohort(meals_t2_tab, "Meals Made Using Program Food — T2 (Enrolled, by Cohort)", y_max = 70)

# ============================================================
# B) EASE OF USING PROGRAM FOOD (T1 + T2)
# ============================================================
make_ease_tab <- function(df, col_name, time_label) {
  df %>%
    dplyr::transmute(
      cohort_final,
      response = clean_ease(.data[[col_name]])
    ) %>%
    dplyr::filter(!is.na(response)) %>%
    dplyr::mutate(
      response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE)
    ) %>%
    dplyr::count(cohort_final, response, name = "n") %>%
    dplyr::group_by(cohort_final) %>%
    dplyr::mutate(
      N = sum(n),
      percent = round(100 * n / N, 1),
      timepoint = time_label
    ) %>%
    dplyr::ungroup()
}

plot_ease_by_cohort <- function(tab, title_text, y_max = 70) {
  ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
    ggplot2::geom_col(width = 0.75, fill = "#2E86C1") +
    ggplot2::geom_text(
      ggplot2::aes(label = paste0(percent, "%\n(n=", n, ")")),
      vjust = -0.25, size = 3.5
    ) +
    ggplot2::facet_wrap(~ cohort_final, nrow = 1) +
    ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
    ggplot2::theme_bw() +
    ggplot2::labs(title = title_text, x = "", y = "% within cohort") +
    ggplot2::theme(plot.title = ggplot2::element_text(face = "bold", size = 14))
}

ease_t1_tab <- make_ease_tab(df_enrolled, col_ease_t1, "T1")
ease_t2_tab <- make_ease_tab(df_enrolled, col_ease_t2, "T2")

ease_t1_tab

## # A tibble: 6 × 6
##   cohort_final response      n     N percent timepoint
##   <fct>        <ord>     <int> <int>   <dbl> <chr>    
## 1 Cohort 1     Neutral       3    38     7.9 T1       
## 2 Cohort 1     Easy         14    38    36.8 T1       
## 3 Cohort 1     Very easy    21    38    55.3 T1       
## 4 Cohort 2     Neutral       2    24     8.3 T1       
## 5 Cohort 2     Easy         14    24    58.3 T1       
## 6 Cohort 2     Very easy     8    24    33.3 T1

plot_ease_by_cohort(ease_t1_tab, "Ease of Using Program Food in Meals — T1 (Enrolled, by Cohort)", y_max = 70)

ease_t2_tab

## # A tibble: 3 × 6
##   cohort_final response      n     N percent timepoint
##   <fct>        <ord>     <int> <int>   <dbl> <chr>    
## 1 Cohort 1     Neutral       2    40     5   T2       
## 2 Cohort 1     Easy         21    40    52.5 T2       
## 3 Cohort 1     Very easy    17    40    42.5 T2

plot_ease_by_cohort(ease_t2_tab, "Ease of Using Program Food in Meals — T2 (Enrolled, by Cohort)", y_max = 70)

# ============================================================
# C) WASTE: THREW AWAY ANY FOOD FROM MOST RECENT DELIVERY (T1 + T2)
# ============================================================
make_waste_tab <- function(df, col_name, time_label) {
  df %>%
    dplyr::transmute(
      cohort_final,
      response = clean_yesno(.data[[col_name]])
    ) %>%
    dplyr::filter(!is.na(response)) %>%
    dplyr::mutate(response = factor(response, levels = c("No", "Yes"))) %>%
    dplyr::count(cohort_final, response, name = "n") %>%
    dplyr::group_by(cohort_final) %>%
    dplyr::mutate(
      N = sum(n),
      percent = round(100 * n / N, 1),
      timepoint = time_label
    ) %>%
    dplyr::ungroup()
}

plot_waste_by_cohort <- function(tab, title_text) {
  ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent, fill = response)) +
    ggplot2::geom_col(width = 0.65) +
    ggplot2::geom_text(
      ggplot2::aes(label = paste0(percent, "%\n(n=", n, ")")),
      vjust = -0.25, size = 3.6
    ) +
    ggplot2::facet_wrap(~ cohort_final, nrow = 1) +
    ggplot2::scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
    ggplot2::scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    ggplot2::theme_bw() +
    ggplot2::labs(title = title_text, x = "", y = "% within cohort") +
    ggplot2::theme(
      legend.position = "none",
      plot.title = ggplot2::element_text(face = "bold", size = 14)
    )
}

waste_t1_tab <- make_waste_tab(df_enrolled, col_waste_t1, "T1")
waste_t2_tab <- make_waste_tab(df_enrolled, col_waste_t2, "T2")

waste_t1_tab

## # A tibble: 4 × 6
##   cohort_final response     n     N percent timepoint
##   <fct>        <fct>    <int> <int>   <dbl> <chr>    
## 1 Cohort 1     No          30    37    81.1 T1       
## 2 Cohort 1     Yes          7    37    18.9 T1       
## 3 Cohort 2     No          21    26    80.8 T1       
## 4 Cohort 2     Yes          5    26    19.2 T1

plot_waste_by_cohort(waste_t1_tab, "Threw Away Any Delivered Food? — T1 (Enrolled, by Cohort)")

waste_t2_tab

## # A tibble: 2 × 6
##   cohort_final response     n     N percent timepoint
##   <fct>        <fct>    <int> <int>   <dbl> <chr>    
## 1 Cohort 1     No          37    40    92.5 T2       
## 2 Cohort 1     Yes          3    40     7.5 T2

plot_waste_by_cohort(waste_t2_tab, "Threw Away Any Delivered Food? — T2 (Enrolled, by Cohort)")

# ============================================================
# FOOD UTILIZATION + WASTE (T1 + T2) — ENROLLED ONLY (NO COHORT)
# ============================================================

# ---- Column names (edit ONLY if needed) ----
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"

col_ease_t1  <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2  <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"

col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"

need_cols <- c("enrolled_group", col_meals_t1, col_meals_t2, col_ease_t1, col_ease_t2, col_waste_t1, col_waste_t2)
missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) stop("Missing columns:\n  - ", paste(missing_cols, collapse = "\n  - "))

# ---- Enrolled only (single object; df_status untouched) ----
df_enrolled <- df_status %>% dplyr::filter(enrolled_group == "Enrolled")

# =========================
# A) MEALS USING PROGRAM FOOD (T1 + T2)
# =========================
make_meals_tab <- function(df, col_name, time_label) {
  df %>%
    dplyr::transmute(response = clean_meals(.data[[col_name]])) %>%
    dplyr::filter(!is.na(response)) %>%
    dplyr::mutate(response = factor(response, levels = c("0", "1", "2–3", "4–6", "7+"), ordered = TRUE)) %>%
    dplyr::count(response, name = "n") %>%
    dplyr::mutate(
      N = sum(n),
      percent = round(100 * n / N, 1),
      timepoint = time_label,
      label = paste0(percent, "% (n=", n, ")")
    )
}

plot_meals <- function(tab, title_text, y_max = 70) {
  ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
    ggplot2::geom_col(width = 0.75, fill = "#1F4E79") +
    ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.25, size = 3.8) +
    ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
    ggplot2::theme_bw() +
    ggplot2::labs(title = title_text, x = "Number of meals", y = "% of enrolled respondents") +
    ggplot2::theme(plot.title = ggplot2::element_text(face = "bold", size = 14))
}

meals_t1_tab <- make_meals_tab(df_enrolled, col_meals_t1, "T1")
meals_t2_tab <- make_meals_tab(df_enrolled, col_meals_t2, "T2")

meals_t1_tab

## # A tibble: 5 × 6
##   response     n     N percent timepoint label       
##   <ord>    <int> <int>   <dbl> <chr>     <chr>       
## 1 0            3    56     5.4 T1        5.4% (n=3)  
## 2 1            1    56     1.8 T1        1.8% (n=1)  
## 3 2–3         32    56    57.1 T1        57.1% (n=32)
## 4 4–6         16    56    28.6 T1        28.6% (n=16)
## 5 7+           4    56     7.1 T1        7.1% (n=4)

plot_meals(meals_t1_tab, paste0("Meals Prepared Using Program Food — T1 (Enrolled, n=", unique(meals_t1_tab$N), ")"), y_max = 70)

meals_t2_tab

## # A tibble: 5 × 6
##   response     n     N percent timepoint label       
##   <ord>    <int> <int>   <dbl> <chr>     <chr>       
## 1 0            3    41     7.3 T2        7.3% (n=3)  
## 2 1            1    41     2.4 T2        2.4% (n=1)  
## 3 2–3         17    41    41.5 T2        41.5% (n=17)
## 4 4–6         15    41    36.6 T2        36.6% (n=15)
## 5 7+           5    41    12.2 T2        12.2% (n=5)

plot_meals(meals_t2_tab, paste0("Meals Prepared Using Program Food — T2 (Enrolled, n=", unique(meals_t2_tab$N), ")"), y_max = 70)

# =========================
# B) EASE OF USING PROGRAM FOOD (T1 + T2)
# =========================
make_ease_tab <- function(df, col_name, time_label) {
  df %>%
    dplyr::transmute(response = clean_ease(.data[[col_name]])) %>%
    dplyr::filter(!is.na(response)) %>%
    dplyr::mutate(response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE)) %>%
    dplyr::count(response, name = "n") %>%
    dplyr::mutate(
      N = sum(n),
      percent = round(100 * n / N, 1),
      timepoint = time_label,
      label = paste0(percent, "% (n=", n, ")")
    )
}

plot_ease <- function(tab, title_text, y_max = 70) {
  ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent)) +
    ggplot2::geom_col(width = 0.75, fill = "#2E86C1") +
    ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.25, size = 3.8) +
    ggplot2::scale_y_continuous(limits = c(0, y_max), labels = function(x) paste0(x, "%")) +
    ggplot2::theme_bw() +
    ggplot2::labs(title = title_text, x = "", y = "% of enrolled respondents") +
    ggplot2::theme(plot.title = ggplot2::element_text(face = "bold", size = 14))
}

ease_t1_tab <- make_ease_tab(df_enrolled, col_ease_t1, "T1")
ease_t2_tab <- make_ease_tab(df_enrolled, col_ease_t2, "T2")

ease_t1_tab

## # A tibble: 3 × 6
##   response      n     N percent timepoint label       
##   <ord>     <int> <int>   <dbl> <chr>     <chr>       
## 1 Neutral       5    62     8.1 T1        8.1% (n=5)  
## 2 Easy         28    62    45.2 T1        45.2% (n=28)
## 3 Very easy    29    62    46.8 T1        46.8% (n=29)

plot_ease(ease_t1_tab, paste0("Ease of Using Program Food in Meals — T1 (Enrolled, n=", unique(ease_t1_tab$N), ")"), y_max = 70)

ease_t2_tab

## # A tibble: 3 × 6
##   response      n     N percent timepoint label       
##   <ord>     <int> <int>   <dbl> <chr>     <chr>       
## 1 Neutral       2    40     5   T2        5% (n=2)    
## 2 Easy         21    40    52.5 T2        52.5% (n=21)
## 3 Very easy    17    40    42.5 T2        42.5% (n=17)

plot_ease(ease_t2_tab, paste0("Ease of Using Program Food in Meals — T2 (Enrolled, n=", unique(ease_t2_tab$N), ")"), y_max = 70)

# =========================
# C) WASTE (T1 + T2)
# =========================
make_waste_tab <- function(df, col_name, time_label) {
  df %>%
    dplyr::transmute(response = clean_yesno(.data[[col_name]])) %>%
    dplyr::filter(!is.na(response)) %>%
    dplyr::mutate(response = factor(response, levels = c("No", "Yes"))) %>%
    dplyr::count(response, name = "n") %>%
    dplyr::mutate(
      N = sum(n),
      percent = round(100 * n / N, 1),
      timepoint = time_label,
      label = paste0(percent, "%\n(n=", n, ")")
    )
}

plot_waste <- function(tab, title_text) {
  ggplot2::ggplot(tab, ggplot2::aes(x = response, y = percent, fill = response)) +
    ggplot2::geom_col(width = 0.65) +
    ggplot2::geom_text(ggplot2::aes(label = label), vjust = -0.25, size = 4) +
    ggplot2::scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
    ggplot2::scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    ggplot2::theme_bw() +
    ggplot2::labs(title = title_text, x = "", y = "% of enrolled respondents") +
    ggplot2::theme(
      legend.position = "none",
      plot.title = ggplot2::element_text(face = "bold", size = 14)
    )
}

waste_t1_tab <- make_waste_tab(df_enrolled, col_waste_t1, "T1")
waste_t2_tab <- make_waste_tab(df_enrolled, col_waste_t2, "T2")

waste_t1_tab

## # A tibble: 2 × 6
##   response     n     N percent timepoint label        
##   <fct>    <int> <int>   <dbl> <chr>     <chr>        
## 1 No          51    63      81 T1        "81%\n(n=51)"
## 2 Yes         12    63      19 T1        "19%\n(n=12)"

plot_waste(waste_t1_tab, paste0("Threw Away Any Delivered Food? — T1 (Enrolled, n=", unique(waste_t1_tab$N), ")"))

waste_t2_tab

## # A tibble: 2 × 6
##   response     n     N percent timepoint label          
##   <fct>    <int> <int>   <dbl> <chr>     <chr>          
## 1 No          37    40    92.5 T2        "92.5%\n(n=37)"
## 2 Yes          3    40     7.5 T2        "7.5%\n(n=3)"

plot_waste(waste_t2_tab, paste0("Threw Away Any Delivered Food? — T2 (Enrolled, n=", unique(waste_t2_tab$N), ")"))

# ============================================================
# EDUCATION DATA: SELF-MANAGEMENT SURVEY (T1) — ENROLLED ONLY
# Verifies the 4 headline % tiles + confidence table + control pie
# Uses existing objects: df_status, is_nonblank(), clean_confidence(), clean_impact()
# df_status is NOT modified.
# ============================================================

# ---- Enrolled only (new object; df_status untouched) ----
df_enrolled <- df_status %>% dplyr::filter(enrolled_group == "Enrolled")

# =========================
# 0) Column names (from your script)
# =========================
col_t1_manage      <- "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
col_help_choices   <- "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
col_conf_food      <- "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
col_conf_labels    <- "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
col_conf_prepare   <- "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
col_help_control   <- "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"

need_cols <- c(col_t1_manage, col_help_choices, col_conf_food, col_conf_labels, col_conf_prepare, col_help_control)
missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) stop("Missing columns:\n  - ", paste(missing_cols, collapse = "\n  - "))

# =========================
# 1) Tile helper: compute % for a condition among VALID responses
# =========================
tile_pct <- function(df, col, clean_fun, keep_levels = NULL, predicate_fun) {
  d <- df %>%
    dplyr::transmute(resp = clean_fun(.data[[col]])) %>%
    dplyr::filter(!is.na(resp))

  if (!is.null(keep_levels)) d <- d %>% dplyr::filter(resp %in% keep_levels)

  N <- nrow(d)
  if (N == 0) return(tibble::tibble(n = 0, N = 0, pct = NA_real_))

  n <- sum(predicate_fun(d$resp), na.rm = TRUE)
  tibble::tibble(n = n, N = N, pct = round(100 * n / N, 1))
}

# =========================
# 2) TILE 1 — "take daily self-mgmt actions"
# You categorized daily in your earlier chunk via strings like "DAILY..."
# We'll re-use that same approach with a light cleaner here.
# =========================
clean_manage <- function(x) {
  raw_chr <- as.character(x)
  raw_std <- stringr::str_to_upper(stringr::str_squish(raw_chr))

  missing_tokens <- c(
    "", "NA", "N/A", "NULL", "NONE",
    "NOT SURVEYED", "NOT ASKED", "NOT APPLICABLE",
    "SKIPPED", "MISSING", "UNKNOWN", "-", ".", "--"
  )

  dplyr::case_when(
    is.na(raw_chr) ~ NA_character_,
    raw_std %in% missing_tokens ~ NA_character_,
    stringr::str_detect(raw_std, "^DAILY") ~ "Daily",
    raw_std %in% c("Y", "YES") ~ "Yes (unspecified)",
    raw_std %in% c("N", "NO")  ~ "No",
    stringr::str_detect(raw_std, "OCCASION") ~ "Occasionally",
    TRUE ~ "Other / unclear"
  )
}

tile_daily <- tile_pct(
  df_enrolled,
  col_t1_manage,
  clean_manage,
  keep_levels = c("Daily","Yes (unspecified)","Occasionally","No","Other / unclear"),
  predicate_fun = function(r) r == "Daily"
)

# =========================
# 3) TILE 2 — "say nutrition material helped food choices"
# Cleaner: clean_impact() already maps to Yes/No for yes/no items.
# Count % == "Yes"
# =========================
tile_helped_choices <- tile_pct(
  df_enrolled,
  col_help_choices,
  clean_impact,
  keep_levels = c("Yes","No"),
  predicate_fun = function(r) r == "Yes"
)

# =========================
# 4) TILE 3 — "confident in food choices for A1C"
# Tile is "Very confident + Confident" (per your slide)
# =========================
tile_conf_food <- tile_pct(
  df_enrolled,
  col_conf_food,
  clean_confidence,
  keep_levels = c("Not confident","Somewhat not confident","Somewhat confident","Confident","Very confident","Maybe"),
  predicate_fun = function(r) r %in% c("Very confident","Confident")
)

# =========================
# 5) TILE 4 — "confident preparing healthy meals"
# "Very confident + Confident"
# =========================
tile_conf_prepare <- tile_pct(
  df_enrolled,
  col_conf_prepare,
  clean_confidence,
  keep_levels = c("Not confident","Somewhat not confident","Somewhat confident","Confident","Very confident","Maybe"),
  predicate_fun = function(r) r %in% c("Very confident","Confident")
)

# =========================
# 6) Print tile verification table (matches slide tiles)
# =========================
tile_verify <- dplyr::bind_rows(
  tile_daily %>% dplyr::mutate(Metric = "Daily self-mgmt actions"),
  tile_helped_choices %>% dplyr::mutate(Metric = "Nutrition material helped food choices (Yes)"),
  tile_conf_food %>% dplyr::mutate(Metric = "Confident in food choices for A1C (Very+Confident)"),
  tile_conf_prepare %>% dplyr::mutate(Metric = "Confident preparing healthy meals (Very+Confident)")
) %>%
  dplyr::select(Metric, n, N, pct) %>%
  dplyr::mutate(label = paste0(pct, "% (", n, "/", N, ")"))

tile_verify

## # A tibble: 4 × 5
##   Metric                                                 n     N   pct label    
##   <chr>                                              <int> <int> <dbl> <chr>    
## 1 Daily self-mgmt actions                               22    40  55   55% (22/…
## 2 Nutrition material helped food choices (Yes)          57    61  93.4 93.4% (5…
## 3 Confident in food choices for A1C (Very+Confident)    46    59  78   78% (46/…
## 4 Confident preparing healthy meals (Very+Confident)    51    56  91.1 91.1% (5…

# =========================
# 7) Confidence table (the one on the slide)
# Rows: Food choices / Labels / Preparing meals
# Columns: Very confident, Confident, Combined
# Denominator: respondents to that question (enrolled only)
# =========================
make_conf_row <- function(df, col, domain_label) {
  d <- df %>%
    dplyr::transmute(resp = clean_confidence(.data[[col]])) %>%
    dplyr::filter(!is.na(resp))

  N <- nrow(d)
  vc <- sum(d$resp == "Very confident")
  c  <- sum(d$resp == "Confident")
  tibble::tibble(
    Domain = domain_label,
    N = N,
    `Very Confident` = round(100 * vc / N, 1),
    `Confident`      = round(100 * c  / N, 1),
    `Combined`       = round(100 * (vc + c) / N, 1)
  )
}

conf_table <- dplyr::bind_rows(
  make_conf_row(df_enrolled, col_conf_food,    "Food choices to control A1C"),
  make_conf_row(df_enrolled, col_conf_labels,  "Understanding food labels"),
  make_conf_row(df_enrolled, col_conf_prepare, "Preparing healthy meals at home")
)

conf_table

## # A tibble: 3 × 5
##   Domain                              N `Very Confident` Confident Combined
##   <chr>                           <int>            <dbl>     <dbl>    <dbl>
## 1 Food choices to control A1C        64             56.2      15.6     71.9
## 2 Understanding food labels          64             57.8      17.2     75  
## 3 Preparing healthy meals at home    61             63.9      19.7     83.6

# =========================
# 8) Program helped feel more in control (impact distribution)
# Slide shows: % "A lot" | % "Somewhat" | % "A little"
# We'll compute distribution among valid responses (enrolled only)
# =========================
control_tab <- df_enrolled %>%
  dplyr::transmute(resp = clean_impact(.data[[col_help_control]])) %>%
  dplyr::filter(!is.na(resp), resp %in% c("A lot","Somewhat","A little","Not at all")) %>%
  dplyr::count(resp, name = "n") %>%
  dplyr::mutate(
    N = sum(n),
    pct = round(100 * n / N, 1)
  ) %>%
  dplyr::arrange(dplyr::desc(pct))

control_tab

## # A tibble: 3 × 4
##   resp         n     N   pct
##   <chr>    <int> <int> <dbl>
## 1 A lot       39    63  61.9
## 2 Somewhat    13    63  20.6
## 3 A little    11    63  17.5

# Optional: single-line string like your slide footer
control_footer <- control_tab %>%
  dplyr::filter(resp %in% c("A lot","Somewhat","A little")) %>%
  dplyr::mutate(txt = paste0(pct, "% \"", resp, "\"")) %>%
  dplyr::pull(txt)

paste(control_footer, collapse = " | ")

## [1] "61.9% \"A lot\" | 20.6% \"Somewhat\" | 17.5% \"A little\""

df_enrolled %>%
  filter(!is.na(.data[[col_conf_prepare]])) %>%
  summarise(N = n())

## # A tibble: 1 × 1
##       N
##   <int>
## 1    68

df_enrolled %>%
  count(.data[[col_conf_prepare]])

## # A tibble: 18 × 2
##    t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_wit…¹     n
##    <chr>                                                                   <int>
##  1 1                                                                           1
##  2 2                                                                           2
##  3 3                                                                           9
##  4 4                                                                          14
##  5 5                                                                           2
##  6 COFIDENT                                                                    1
##  7 CONFIDENT                                                                   2
##  8 N/A                                                                         3
##  9 NEUTRAL                                                                     1
## 10 Neutral                                                                     4
## 11 SOMEWHAT CONFIDENT                                                          1
## 12 Somewhat confident                                                          1
## 13 VERY CONFIDENT                                                              3
## 14 Very Confident                                                              1
## 15 Very confident                                                             17
## 16 n/a                                                                         4
## 17 very confident                                                              2
## 18 <NA>                                                                       68
## # ℹ abbreviated name:
## #   ¹t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home

# ============================================================
# PRIMARY OUTCOMES: A1C SLIDE — VERIFY ALL NUMBERS
# Uses your existing objects: df_status, enrolled_group, cohort_final,
# and your existing helpers: parse_delta(), is_nonblank()
# DOES NOT overwrite df_status
# ============================================================

# ---- Columns used on the slide (already in your script) ----
a1c_t1_col <- "a1c_change_t1"   # change for: Cohort 1 + Cohort 2 (PDSA2 window)
a1c_t2_col <- "a1c_change_t2"   # change for: Cohort 1 only (most recent window)

# ---- Settings that match the slide tiles ----
thr <- 1.0  # ≥1.0% change threshold

# ---- Safety checks ----
stopifnot("enrolled_group" %in% names(df_status))
stopifnot("cohort_final" %in% names(df_status))
stopifnot(a1c_t1_col %in% names(df_status))
stopifnot(a1c_t2_col %in% names(df_status))

# ============================================================
# 0) Helper: classify A1C change exactly like the slide
# (Improved = delta <= -1, Worsened = delta >= +1, else No change)
# ============================================================
classify_a1c <- function(delta, thr = 1.0) {
  dplyr::case_when(
    is.na(delta) ~ NA_character_,
    delta <= -thr ~ "Improved",
    delta >=  thr ~ "Worsened",
    TRUE ~ "No change"
  )
}

# ============================================================
# 1) A1C CHANGE @ T1 TILE
#   Slide: "A1C Change at T1: Cohort 1 + Cohort 2 (n=__)"
#   We use ENROLLED ONLY, cohort 1 or 2, and non-missing numeric deltas
# ============================================================
a1c_t1_df <- df_status %>%
  dplyr::filter(enrolled_group == "Enrolled",
                cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
  dplyr::transmute(
    cohort_final,
    delta = parse_delta(.data[[a1c_t1_col]]),
    bucket = classify_a1c(parse_delta(.data[[a1c_t1_col]]), thr)
  ) %>%
  dplyr::filter(!is.na(delta), !is.na(bucket))

a1c_t1_tile <- a1c_t1_df %>%
  dplyr::count(bucket, name = "n") %>%
  dplyr::mutate(
    N = sum(n),
    percent = round(100 * n / N, 1)
  ) %>%
  dplyr::arrange(factor(bucket, levels = c("Improved","No change","Worsened")))

a1c_t1_tile

## # A tibble: 3 × 4
##   bucket        n     N percent
##   <chr>     <int> <int>   <dbl>
## 1 Improved     32    51    62.7
## 2 No change    17    51    33.3
## 3 Worsened      2    51     3.9

# This table should match the 3 tile numbers on the slide:
# Improved %, No change %, Worsened %, and N (denominator)

# ============================================================
# 2) A1C CHANGE @ T2 TILE
#   Slide: "A1C Change at T2: Cohort 1 (n=__)"
#   ENROLLED ONLY, Cohort 1, and non-missing numeric deltas
# ============================================================
a1c_t2_df <- df_status %>%
  dplyr::filter(enrolled_group == "Enrolled",
                cohort_final == "Cohort 1") %>%
  dplyr::transmute(
    delta = parse_delta(.data[[a1c_t2_col]]),
    bucket = classify_a1c(parse_delta(.data[[a1c_t2_col]]), thr)
  ) %>%
  dplyr::filter(!is.na(delta), !is.na(bucket))

a1c_t2_tile <- a1c_t2_df %>%
  dplyr::count(bucket, name = "n") %>%
  dplyr::mutate(
    N = sum(n),
    percent = round(100 * n / N, 1)
  ) %>%
  dplyr::arrange(factor(bucket, levels = c("Improved","No change","Worsened")))

a1c_t2_tile

## # A tibble: 3 × 4
##   bucket        n     N percent
##   <chr>     <int> <int>   <dbl>
## 1 Improved     19    30    63.3
## 2 No change     9    30    30  
## 3 Worsened      2    30     6.7

# This table should match the Cohort 1 T2 tile numbers (e.g., 63.3 / 30.0 / 6.7 and N=30)

# ============================================================
# 3) COHORT-LEVEL SUMMARY TABLE (the big table on the slide)
#   Rows: Cohort 1 / Cohort 2 / Cohort 3
#   Columns used on slide:
#     N (total enrolled in cohort)
#     A1c Improved (T1): count/denom (%)
#     A1c Improved (T2): count/denom (%) [Cohort 1 only]
#     Chart review completion line at bottom:
#       Cohort 1: 19/60 (31.7%)   Cohort 2: 23/41 (56.1%)
#   IMPORTANT: The slide’s A1c Improved cells appear to be using
#     cohort N as denominator (e.g., 29/60), NOT the number with A1C deltas.
#   The code below computes BOTH denominators so you can see which matches.
# ============================================================

# ---- cohort N (enrolled total) ----
cohort_N <- df_status %>%
  dplyr::filter(enrolled_group == "Enrolled") %>%
  dplyr::count(cohort_final, name = "N_enrolled") %>%
  dplyr::filter(cohort_final %in% c("Cohort 1","Cohort 2","Cohort 3"))

# ---- A1C improved counts (numerator) + delta-available denominator ----
a1c_t1_by_cohort <- df_status %>%
  dplyr::filter(enrolled_group == "Enrolled",
                cohort_final %in% c("Cohort 1","Cohort 2","Cohort 3")) %>%
  dplyr::transmute(
    cohort_final,
    delta_t1 = parse_delta(.data[[a1c_t1_col]]),
    bucket_t1 = classify_a1c(parse_delta(.data[[a1c_t1_col]]), thr)
  ) %>%
  dplyr::filter(!is.na(delta_t1), !is.na(bucket_t1)) %>%
  dplyr::group_by(cohort_final) %>%
  dplyr::summarise(
    denom_delta_t1 = dplyr::n(),
    num_improved_t1 = sum(bucket_t1 == "Improved"),
    pct_improved_t1 = round(100 * num_improved_t1 / denom_delta_t1, 1),
    .groups = "drop"
  )

a1c_t2_by_cohort1 <- df_status %>%
  dplyr::filter(enrolled_group == "Enrolled", cohort_final == "Cohort 1") %>%
  dplyr::transmute(
    cohort_final,
    delta_t2 = parse_delta(.data[[a1c_t2_col]]),
    bucket_t2 = classify_a1c(parse_delta(.data[[a1c_t2_col]]), thr)
  ) %>%
  dplyr::filter(!is.na(delta_t2), !is.na(bucket_t2)) %>%
  dplyr::summarise(
    denom_delta_t2 = dplyr::n(),
    num_improved_t2 = sum(bucket_t2 == "Improved"),
    pct_improved_t2 = round(100 * num_improved_t2 / denom_delta_t2, 1)
  )

a1c_t2_by_cohort1

## # A tibble: 1 × 3
##   denom_delta_t2 num_improved_t2 pct_improved_t2
##            <int>           <int>           <dbl>
## 1             30              19            63.3

# ---- Combine into one cohort summary with BOTH possible denominators ----
cohort_a1c_summary <- cohort_N %>%
  dplyr::left_join(a1c_t1_by_cohort, by = "cohort_final") %>%
  dplyr::mutate(
    # if the slide is using cohort N as the denominator (e.g., 29/60)
    pct_improved_t1_over_cohortN = round(100 * num_improved_t1 / N_enrolled, 1),
    a1c_improved_t1_over_cohortN = paste0(num_improved_t1, "/", N_enrolled, " (", pct_improved_t1_over_cohortN, "%)"),

    # if the slide is using delta-available denom (more defensible)
    a1c_improved_t1_over_deltaN  = paste0(num_improved_t1, "/", denom_delta_t1, " (", pct_improved_t1, "%)")
  ) %>%
  dplyr::select(cohort_final, N_enrolled,
                num_improved_t1, denom_delta_t1,
                a1c_improved_t1_over_cohortN, a1c_improved_t1_over_deltaN)

cohort_a1c_summary

## # A tibble: 3 × 6
##   cohort_final N_enrolled num_improved_t1 denom_delta_t1 a1c_improved_t1_over_…¹
##   <chr>             <int>           <int>          <int> <chr>                  
## 1 Cohort 1             60              19             28 19/60 (31.7%)          
## 2 Cohort 2             41              13             23 13/41 (31.7%)          
## 3 Cohort 3             35              NA             NA NA/35 (NA%)            
## # ℹ abbreviated name: ¹a1c_improved_t1_over_cohortN
## # ℹ 1 more variable: a1c_improved_t1_over_deltaN <chr>

# Compare a1c_improved_t1_over_cohortN vs your slide cell values.
# If your slide shows 29/60, it’s using cohort N as denominator.

# ============================================================
# 4) CHART REVIEW COMPLETION (bottom line on slide)
#   Uses your existing chart-review rules:
#     Cohort 1 requires: a1c_t1, ed_t1, ip_t1, a1c_t2, ed_t2, hosp_t2
#     Cohort 2 requires: a1c_t1, ed_t1, ip_t1
#   We compute 19/60 and 23/41 style.
# ============================================================

# Use your existing *_col vars if already defined earlier:
ed_t1_col   <- "ed_utilization_t1"
ip_t1_col   <- "ip_utilization_t1"
ed_t2_col   <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"

stopifnot(all(c(ed_t1_col, ip_t1_col, ed_t2_col, hosp_t2_col) %in% names(df_status)))

c1_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
c2_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col)

chart_complete_flag <- function(df, req_cols) {
  # TRUE if all required fields are nonblank for that row
  apply(df[, req_cols, drop = FALSE], 1, function(row) all(is_nonblank(row)))
}

df_c1_enr <- df_status %>% dplyr::filter(enrolled_group == "Enrolled", cohort_final == "Cohort 1")
df_c2_enr <- df_status %>% dplyr::filter(enrolled_group == "Enrolled", cohort_final == "Cohort 2")

c1_complete_n <- sum(chart_complete_flag(df_c1_enr, c1_required))
c2_complete_n <- sum(chart_complete_flag(df_c2_enr, c2_required))

c1_total <- nrow(df_c1_enr)
c2_total <- nrow(df_c2_enr)

chart_review_line <- tibble::tibble(
  Cohort = c("Cohort 1","Cohort 2"),
  complete_n = c(c1_complete_n, c2_complete_n),
  total_n = c(c1_total, c2_total),
  percent = round(100 * complete_n / total_n, 1),
  label = paste0(complete_n, "/", total_n, " (", percent, "%)")
)

chart_review_line

## # A tibble: 2 × 5
##   Cohort   complete_n total_n percent label        
##   <chr>         <int>   <int>   <dbl> <chr>        
## 1 Cohort 1         19      60    31.7 19/60 (31.7%)
## 2 Cohort 2         23      41    56.1 23/41 (56.1%)

# This should reproduce the slide line:
# Cohort 1: 19/60 (31.7%) | Cohort 2: 23/41 (56.1%)

# ============================================================
# A1C CHANGE DISTRIBUTION (T1 and/or T2)
# Percent *within each cohort* out of patients with VALID answers (non-missing delta)
# Buckets: Reduction (≤ -1.0), No change (-1.0 < delta < +1.0), Increase (≥ +1.0)
# ============================================================

thr <- 1.0

# ---- helper (use your existing parse_delta) ----
classify_a1c <- function(delta, thr = 1.0) {
  dplyr::case_when(
    is.na(delta) ~ NA_character_,
    delta <= -thr ~ "Reduction (≥1.0% decrease)",
    delta >=  thr ~ "Increase (≥1.0% increase)",
    TRUE ~ "No change (<1.0% change)"
  )
}

# ------------------------------------------------------------
# A) A1C Change @ T1: Cohort 1 + Cohort 2 (cohort-specific %)
# ------------------------------------------------------------
a1c_t1_by_cohort <- df_status %>%
  dplyr::filter(
    enrolled_group == "Enrolled",
    cohort_final %in% c("Cohort 1", "Cohort 2")
  ) %>%
  dplyr::transmute(
    cohort_final,
    delta = parse_delta(.data[[a1c_t1_col]]),
    category = classify_a1c(parse_delta(.data[[a1c_t1_col]]), thr)
  ) %>%
  dplyr::filter(!is.na(delta), !is.na(category)) %>%
  dplyr::count(cohort_final, category, name = "n") %>%
  dplyr::group_by(cohort_final) %>%
  dplyr::mutate(
    N_valid = sum(n),
    percent = round(100 * n / N_valid, 1),
    label = paste0(n, "/", N_valid, " (", percent, "%)")
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(
    cohort_final,
    factor(category, levels = c("Reduction (≥1.0% decrease)", "No change (<1.0% change)", "Increase (≥1.0% increase)"))
  )

a1c_t1_by_cohort

## # A tibble: 6 × 6
##   cohort_final category                       n N_valid percent label        
##   <chr>        <chr>                      <int>   <int>   <dbl> <chr>        
## 1 Cohort 1     Reduction (≥1.0% decrease)    19      28    67.9 19/28 (67.9%)
## 2 Cohort 1     No change (<1.0% change)       8      28    28.6 8/28 (28.6%) 
## 3 Cohort 1     Increase (≥1.0% increase)      1      28     3.6 1/28 (3.6%)  
## 4 Cohort 2     Reduction (≥1.0% decrease)    13      23    56.5 13/23 (56.5%)
## 5 Cohort 2     No change (<1.0% change)       9      23    39.1 9/23 (39.1%) 
## 6 Cohort 2     Increase (≥1.0% increase)      1      23     4.3 1/23 (4.3%)

# Optional: wide format for slide table layout
a1c_t1_by_cohort_wide <- a1c_t1_by_cohort %>%
  dplyr::select(cohort_final, category, n, N_valid, percent, label) %>%
  tidyr::pivot_wider(
    names_from = category,
    values_from = c(n, percent, label),
    names_sep = " | "
  )

a1c_t1_by_cohort_wide

## # A tibble: 2 × 11
##   cohort_final N_valid `n | Reduction (≥1.0% decrease)` n | No change (<1.0% c…¹
##   <chr>          <int>                            <int>                    <int>
## 1 Cohort 1          28                               19                        8
## 2 Cohort 2          23                               13                        9
## # ℹ abbreviated name: ¹`n | No change (<1.0% change)`
## # ℹ 7 more variables: `n | Increase (≥1.0% increase)` <int>,
## #   `percent | Reduction (≥1.0% decrease)` <dbl>,
## #   `percent | No change (<1.0% change)` <dbl>,
## #   `percent | Increase (≥1.0% increase)` <dbl>,
## #   `label | Reduction (≥1.0% decrease)` <chr>,
## #   `label | No change (<1.0% change)` <chr>, …

# ------------------------------------------------------------
# B) A1C Change @ T2: Cohort 1 only (same logic, valid answers)
# (If you later have Cohort 2 T2, add it to cohort_final filter.)
# ------------------------------------------------------------
a1c_t2_by_cohort <- df_status %>%
  dplyr::filter(
    enrolled_group == "Enrolled",
    cohort_final %in% c("Cohort 1")   # add "Cohort 2" if/when you have T2 deltas
  ) %>%
  dplyr::transmute(
    cohort_final,
    delta = parse_delta(.data[[a1c_t2_col]]),
    category = classify_a1c(parse_delta(.data[[a1c_t2_col]]), thr)
  ) %>%
  dplyr::filter(!is.na(delta), !is.na(category)) %>%
  dplyr::count(cohort_final, category, name = "n") %>%
  dplyr::group_by(cohort_final) %>%
  dplyr::mutate(
    N_valid = sum(n),
    percent = round(100 * n / N_valid, 1),
    label = paste0(n, "/", N_valid, " (", percent, "%)")
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(
    cohort_final,
    factor(category, levels = c("Reduction (≥1.0% decrease)", "No change (<1.0% change)", "Increase (≥1.0% increase)"))
  )

a1c_t2_by_cohort

## # A tibble: 3 × 6
##   cohort_final category                       n N_valid percent label        
##   <chr>        <chr>                      <int>   <int>   <dbl> <chr>        
## 1 Cohort 1     Reduction (≥1.0% decrease)    19      30    63.3 19/30 (63.3%)
## 2 Cohort 1     No change (<1.0% change)       9      30    30   9/30 (30%)   
## 3 Cohort 1     Increase (≥1.0% increase)      2      30     6.7 2/30 (6.7%)

# ============================================================
# A1C CHANGE DISTRIBUTION TABLE
# - Cohort 1: T1 and T2
# - Cohort 2: T1 only
# - Denominator = valid (non-missing) delta within cohort+timepoint
# ============================================================

thr <- 1.0

classify_a1c <- function(delta, thr = 1.0) {
  dplyr::case_when(
    is.na(delta) ~ NA_character_,
    delta <= -thr ~ "Reduction (≥1.0% decrease)",
    delta >=  thr ~ "Increase (≥1.0% increase)",
    TRUE ~ "No change (<1.0% change)"
  )
}

# Build a "long" delta table with explicit timepoints
a1c_delta_long <- dplyr::bind_rows(
  # ---- T1: Cohort 1 + Cohort 2 ----
  df_status %>%
    dplyr::filter(enrolled_group == "Enrolled",
                  cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
    dplyr::transmute(
      cohort_final,
      timepoint = "T1",
      delta = parse_delta(.data[[a1c_t1_col]])
    ),

  # ---- T2: Cohort 1 only ----
  df_status %>%
    dplyr::filter(enrolled_group == "Enrolled",
                  cohort_final == "Cohort 1") %>%
    dplyr::transmute(
      cohort_final,
      timepoint = "T2",
      delta = parse_delta(.data[[a1c_t2_col]])
    )
) %>%
  dplyr::mutate(
    category = classify_a1c(delta, thr),
    category = factor(
      category,
      levels = c(
        "Reduction (≥1.0% decrease)",
        "No change (<1.0% change)",
        "Increase (≥1.0% increase)"
      )
    ),
    timepoint = factor(timepoint, levels = c("T1", "T2"))
  ) %>%
  dplyr::filter(!is.na(delta), !is.na(category))

# Final table (this is the one you want)
a1c_change_table <- a1c_delta_long %>%
  dplyr::count(cohort_final, timepoint, category, name = "n") %>%
  dplyr::group_by(cohort_final, timepoint) %>%
  dplyr::mutate(
    N_valid = sum(n),
    percent = round(100 * n / N_valid, 1),
    label = paste0(n, "/", N_valid, " (", percent, "%)")
  ) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(cohort_final, timepoint, category)

a1c_change_table

## # A tibble: 9 × 7
##   cohort_final timepoint category                       n N_valid percent label 
##   <chr>        <fct>     <fct>                      <int>   <int>   <dbl> <chr> 
## 1 Cohort 1     T1        Reduction (≥1.0% decrease)    19      28    67.9 19/28…
## 2 Cohort 1     T1        No change (<1.0% change)       8      28    28.6 8/28 …
## 3 Cohort 1     T1        Increase (≥1.0% increase)      1      28     3.6 1/28 …
## 4 Cohort 1     T2        Reduction (≥1.0% decrease)    19      30    63.3 19/30…
## 5 Cohort 1     T2        No change (<1.0% change)       9      30    30   9/30 …
## 6 Cohort 1     T2        Increase (≥1.0% increase)      2      30     6.7 2/30 …
## 7 Cohort 2     T1        Reduction (≥1.0% decrease)    13      23    56.5 13/23…
## 8 Cohort 2     T1        No change (<1.0% change)       9      23    39.1 9/23 …
## 9 Cohort 2     T1        Increase (≥1.0% increase)      1      23     4.3 1/23 …

# ============================================================
# CHART REVIEW COMPLETION (TABLES ONLY)
# Cohort 1 + Cohort 2, ENROLLED ONLY
# Split by timepoint (T1/T2) and metric (A1c / ED / IP-Hosp)
# Does NOT overwrite df_status
# ============================================================


stopifnot(exists("df_status"))
stopifnot(all(c("enrolled_group", "cohort_final") %in% names(df_status)))

# ---- Use your known column names (edit ONLY if you ever rename in Excel) ----
a1c_t1_col  <- "a1c_change_t1"
ed_t1_col   <- "ed_utilization_t1"
ip_t1_col   <- "ip_utilization_t1"

a1c_t2_col  <- "a1c_change_t2"
ed_t2_col   <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"

need_cols <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
missing_cols <- setdiff(need_cols, names(df_status))
if (length(missing_cols) > 0) stop("Missing chart-review columns:\n  - ", paste(missing_cols, collapse="\n  - "))

# ---- Nonblank logic consistent with your earlier code ----
is_nonblank <- function(x) {
  x_chr <- as.character(x)
  x_std <- str_to_upper(str_squish(x_chr))
  !(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}

# ---- Work on a subset df (no mutation of df_status) ----
df_enrolled_c12 <- df_status %>%
  filter(enrolled_group == "Enrolled", cohort_final %in% c("Cohort 1", "Cohort 2"))

# Denominators: enrolled patients in each cohort (matches slide logic)
denoms <- df_enrolled_c12 %>%
  count(cohort_final, name = "N_enrolled")

# Helper: completion for one cohort + one column
completion_one <- function(cohort_name, col_name, metric_label, time_label) {
  d <- df_enrolled_c12 %>% filter(cohort_final == cohort_name)
  N <- nrow(d)
  n_complete <- sum(is_nonblank(d[[col_name]]))

  tibble(
    cohort_final = cohort_name,
    timepoint = time_label,
    metric = metric_label,
    n_complete = n_complete,
    N_enrolled = N,
    percent = round(100 * n_complete / N, 1),
    label = paste0(n_complete, "/", N, " (", round(100 * n_complete / N, 1), "%)")
  )
}

# ============================================================
# A) FIELD-LEVEL COMPLETION (what % of cohort has each field present)
# ============================================================
chart_review_completion_fields <- bind_rows(
  # Cohort 1: T1 fields
  completion_one("Cohort 1", a1c_t1_col,  "A1c change",          "T1"),
  completion_one("Cohort 1", ed_t1_col,   "ED utilization",      "T1"),
  completion_one("Cohort 1", ip_t1_col,   "IP utilization",      "T1"),

  # Cohort 1: T2 fields
  completion_one("Cohort 1", a1c_t2_col,  "A1c change",          "T2"),
  completion_one("Cohort 1", ed_t2_col,   "ED utilization",      "T2"),
  completion_one("Cohort 1", hosp_t2_col, "Hospital utilization","T2"),

  # Cohort 2: T1 fields only (per your schema)
  completion_one("Cohort 2", a1c_t1_col,  "A1c change",          "T1"),
  completion_one("Cohort 2", ed_t1_col,   "ED utilization",      "T1"),
  completion_one("Cohort 2", ip_t1_col,   "IP utilization",      "T1")
) %>%
  arrange(cohort_final, timepoint, metric)

chart_review_completion_fields

## # A tibble: 9 × 7
##   cohort_final timepoint metric              n_complete N_enrolled percent label
##   <chr>        <chr>     <chr>                    <int>      <int>   <dbl> <chr>
## 1 Cohort 1     T1        A1c change                  29         60    48.3 29/6…
## 2 Cohort 1     T1        ED utilization              60         60   100   60/6…
## 3 Cohort 1     T1        IP utilization              60         60   100   60/6…
## 4 Cohort 1     T2        A1c change                  31         60    51.7 31/6…
## 5 Cohort 1     T2        ED utilization              58         60    96.7 58/6…
## 6 Cohort 1     T2        Hospital utilizati…         59         60    98.3 59/6…
## 7 Cohort 2     T1        A1c change                  23         41    56.1 23/4…
## 8 Cohort 2     T1        ED utilization              41         41   100   41/4…
## 9 Cohort 2     T1        IP utilization              41         41   100   41/4…

# ============================================================
# B) TIMEPOINT-LEVEL "CHART REVIEW COMPLETE" (all required fields present)
#   - Cohort 1 T1 requires: A1c T1 + ED T1 + IP T1
#   - Cohort 1 T2 requires: A1c T2 + ED T2 + Hospital T2
#   - Cohort 2 T1 requires: A1c T1 + ED T1 + IP T1
# ============================================================

complete_all_required <- function(d, cols_required) {
  if (nrow(d) == 0) return(0L)
  sum(apply(d[, cols_required, drop = FALSE], 1, function(row) all(is_nonblank(row))))
}

chart_review_complete_timepoint <- bind_rows(
  # Cohort 1 - T1 complete
  {
    d <- df_enrolled_c12 %>% filter(cohort_final == "Cohort 1")
    n_complete <- complete_all_required(d, c(a1c_t1_col, ed_t1_col, ip_t1_col))
    N <- nrow(d)
    tibble(cohort_final="Cohort 1", timepoint="T1", rule="All required fields present",
           n_complete=n_complete, N_enrolled=N,
           percent=round(100*n_complete/N,1),
           label=paste0(n_complete,"/",N," (",round(100*n_complete/N,1),"%)"))
  },
  # Cohort 1 - T2 complete
  {
    d <- df_enrolled_c12 %>% filter(cohort_final == "Cohort 1")
    n_complete <- complete_all_required(d, c(a1c_t2_col, ed_t2_col, hosp_t2_col))
    N <- nrow(d)
    tibble(cohort_final="Cohort 1", timepoint="T2", rule="All required fields present",
           n_complete=n_complete, N_enrolled=N,
           percent=round(100*n_complete/N,1),
           label=paste0(n_complete,"/",N," (",round(100*n_complete/N,1),"%)"))
  },
  # Cohort 2 - T1 complete
  {
    d <- df_enrolled_c12 %>% filter(cohort_final == "Cohort 2")
    n_complete <- complete_all_required(d, c(a1c_t1_col, ed_t1_col, ip_t1_col))
    N <- nrow(d)
    tibble(cohort_final="Cohort 2", timepoint="T1", rule="All required fields present",
           n_complete=n_complete, N_enrolled=N,
           percent=round(100*n_complete/N,1),
           label=paste0(n_complete,"/",N," (",round(100*n_complete/N,1),"%)"))
  }
) %>%
  arrange(cohort_final, timepoint)

chart_review_complete_timepoint

## # A tibble: 3 × 7
##   cohort_final timepoint rule                n_complete N_enrolled percent label
##   <chr>        <chr>     <chr>                    <int>      <int>   <dbl> <chr>
## 1 Cohort 1     T1        All required field…         29         60    48.3 29/6…
## 2 Cohort 1     T2        All required field…         31         60    51.7 31/6…
## 3 Cohort 2     T1        All required field…         23         41    56.1 23/4…

# ============================================================
# VERIFY ED + HOSP/IP UTILIZATION SLIDE NUMBERS (TABLES ONLY)
# Uses existing df_status (does NOT overwrite it)
# Cohort-specific, ENROLLED only
#   - Cohort 1: Baseline + T1 + T2 (ED + Hosp)
#   - Cohort 2: Baseline + T1 (ED + IP)
# Outputs:
#   1) Valid-N + % complete per cohort/timepoint/metric
#   2) Distribution tables (0,1,2,3,4,5+ or 3+ for IP/Hosp if you want)
#   3) Change-category tables (Less / No change / More) with n + %
# ============================================================

library(dplyr)
library(stringr)
library(tidyr)
library(tibble)

stopifnot(exists("df_status"))
stopifnot(all(c("enrolled_group","cohort_final") %in% names(df_status)))

# ---- Column names from your file ----
col_ed_base   <- "ed_vis_last_90_days"
col_hosp_base <- "hosp_last_90_days"

col_ed_t1     <- "ed_utilization_t1"
col_ip_t1     <- "ip_utilization_t1"          # cohort 2 uses IP at T1
col_ed_t2     <- "ed_utilization_t2"
col_hosp_t2   <- "hospital_utilization_t2"    # cohort 1 uses Hospital at T2

need_cols <- c(col_ed_base, col_hosp_base, col_ed_t1, col_ip_t1, col_ed_t2, col_hosp_t2)
miss <- setdiff(need_cols, names(df_status))
if (length(miss) > 0) stop("Missing columns:\n  - ", paste(miss, collapse = "\n  - "))

# ---- Reuse your parser if it exists; otherwise define it (safe) ----
if (!exists("parse_count")) {
  parse_count <- function(x) {
    s <- str_squish(as.character(x))
    s[s == ""] <- NA_character_
    s[str_to_upper(s) %in% c("NA","N/A","NULL")] <- NA_character_
    suppressWarnings(as.numeric(str_replace_all(s, "[^0-9\\-\\.]", "")))
  }
}

# ---- Nonblank (completion) ----
is_nonblank <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))
  !(is.na(s) | s == "" | s %in% c("NA","N/A","NULL"))
}

# ---- Enrolled + cohort filter (analysis df only; df_status untouched) ----
df_enr <- df_status %>% filter(enrolled_group == "Enrolled", cohort_final %in% c("Cohort 1","Cohort 2"))

# ============================================================
# 1) COMPLETION (valid nonblank) — EXACTLY what you need for slide footers
# ============================================================
completion_one <- function(cohort_name, timepoint, metric, col_name) {
  d <- df_enr %>% filter(cohort_final == cohort_name)
  N <- nrow(d)
  n_valid <- sum(is_nonblank(d[[col_name]]))
  tibble(
    cohort_final = cohort_name,
    timepoint = timepoint,
    metric = metric,
    n_valid = n_valid,
    N_enrolled = N,
    percent = round(100 * n_valid / N, 1),
    label = paste0(n_valid, "/", N, " (", round(100 * n_valid / N, 1), "%)")
  )
}

util_completion <- bind_rows(
  # Cohort 1
  completion_one("Cohort 1","Baseline","ED visits (last 90d)", col_ed_base),
  completion_one("Cohort 1","Baseline","Hospital stays (last 90d)", col_hosp_base),
  completion_one("Cohort 1","T1","ED utilization", col_ed_t1),
  completion_one("Cohort 1","T1","IP utilization", col_ip_t1),
  completion_one("Cohort 1","T2","ED utilization", col_ed_t2),
  completion_one("Cohort 1","T2","Hospital utilization", col_hosp_t2),

  # Cohort 2
  completion_one("Cohort 2","Baseline","ED visits (last 90d)", col_ed_base),
  completion_one("Cohort 2","Baseline","Hospital stays (last 90d)", col_hosp_base),
  completion_one("Cohort 2","T1","ED utilization", col_ed_t1),
  completion_one("Cohort 2","T1","IP utilization", col_ip_t1)
) %>% arrange(cohort_final, timepoint, metric)

util_completion

## # A tibble: 10 × 7
##    cohort_final timepoint metric                n_valid N_enrolled percent label
##    <chr>        <chr>     <chr>                   <int>      <int>   <dbl> <chr>
##  1 Cohort 1     Baseline  ED visits (last 90d)       60         60   100   60/6…
##  2 Cohort 1     Baseline  Hospital stays (last…      60         60   100   60/6…
##  3 Cohort 1     T1        ED utilization             60         60   100   60/6…
##  4 Cohort 1     T1        IP utilization             60         60   100   60/6…
##  5 Cohort 1     T2        ED utilization             58         60    96.7 58/6…
##  6 Cohort 1     T2        Hospital utilization       59         60    98.3 59/6…
##  7 Cohort 2     Baseline  ED visits (last 90d)       41         41   100   41/4…
##  8 Cohort 2     Baseline  Hospital stays (last…      41         41   100   41/4…
##  9 Cohort 2     T1        ED utilization             41         41   100   41/4…
## 10 Cohort 2     T1        IP utilization             41         41   100   41/4…

# ============================================================
# 2) DISTRIBUTIONS — bucketed counts/percents (for slide bar numbers)
#    ED: 0,1,2,3,4,5+
#    IP/Hosp: 0,1,2,3+
# ============================================================
bucket_n <- function(x, cap) {
  xi <- as.integer(x)
  xi[xi < 0] <- NA_integer_
  b <- ifelse(xi >= cap, cap, xi)
  ifelse(is.na(b), NA_character_, ifelse(b == cap, paste0(cap, "+"), as.character(b)))
}

make_dist_tab <- function(cohort_name, timepoint, metric, col_name, cap) {
  d <- df_enr %>% filter(cohort_final == cohort_name)
  tab <- d %>%
    transmute(val = parse_count(.data[[col_name]])) %>%
    filter(!is.na(val)) %>%
    transmute(bucket = bucket_n(val, cap)) %>%
    count(bucket, name = "n") %>%
    tidyr::complete(bucket = c(as.character(0:(cap-1)), paste0(cap, "+")), fill = list(n = 0)) %>%
    mutate(
      N_valid = sum(n),
      percent = ifelse(N_valid > 0, round(100 * n / N_valid, 1), 0),
      label = paste0(n, "/", N_valid, " (", percent, "%)"),
      cohort_final = cohort_name,
      timepoint = timepoint,
      metric = metric
    ) %>%
    select(cohort_final, timepoint, metric, bucket, n, N_valid, percent, label)

  tab
}

# --- ED distributions (cap 5) ---
ed_dist_tables <- bind_rows(
  # Cohort 1
  make_dist_tab("Cohort 1","Baseline","ED visits", col_ed_base, 5),
  make_dist_tab("Cohort 1","T1","ED utilization", col_ed_t1, 5),
  make_dist_tab("Cohort 1","T2","ED utilization", col_ed_t2, 5),
  # Cohort 2
  make_dist_tab("Cohort 2","Baseline","ED visits", col_ed_base, 5),
  make_dist_tab("Cohort 2","T1","ED utilization", col_ed_t1, 5)
) %>% arrange(cohort_final, timepoint, bucket)

ed_dist_tables

## # A tibble: 30 × 8
##    cohort_final timepoint metric         bucket     n N_valid percent label     
##    <chr>        <chr>     <chr>          <chr>  <int>   <int>   <dbl> <chr>     
##  1 Cohort 1     Baseline  ED visits      0         26      60    43.3 26/60 (43…
##  2 Cohort 1     Baseline  ED visits      1         18      60    30   18/60 (30…
##  3 Cohort 1     Baseline  ED visits      2         11      60    18.3 11/60 (18…
##  4 Cohort 1     Baseline  ED visits      3          4      60     6.7 4/60 (6.7…
##  5 Cohort 1     Baseline  ED visits      4          0      60     0   0/60 (0%) 
##  6 Cohort 1     Baseline  ED visits      5+         1      60     1.7 1/60 (1.7…
##  7 Cohort 1     T1        ED utilization 0         41      60    68.3 41/60 (68…
##  8 Cohort 1     T1        ED utilization 1         11      60    18.3 11/60 (18…
##  9 Cohort 1     T1        ED utilization 2          6      60    10   6/60 (10%)
## 10 Cohort 1     T1        ED utilization 3          1      60     1.7 1/60 (1.7…
## # ℹ 20 more rows

# --- IP/Hospital distributions (cap 3) ---
ip_hosp_dist_tables <- bind_rows(
  # Cohort 1: baseline hosp + T1 IP + T2 hosp
  make_dist_tab("Cohort 1","Baseline","Hospital stays", col_hosp_base, 3),
  make_dist_tab("Cohort 1","T1","IP utilization", col_ip_t1, 3),
  make_dist_tab("Cohort 1","T2","Hospital utilization", col_hosp_t2, 3),
  # Cohort 2: baseline hosp + T1 IP
  make_dist_tab("Cohort 2","Baseline","Hospital stays", col_hosp_base, 3),
  make_dist_tab("Cohort 2","T1","IP utilization", col_ip_t1, 3)
) %>% arrange(cohort_final, timepoint, bucket)

ip_hosp_dist_tables

## # A tibble: 20 × 8
##    cohort_final timepoint metric              bucket     n N_valid percent label
##    <chr>        <chr>     <chr>               <chr>  <int>   <int>   <dbl> <chr>
##  1 Cohort 1     Baseline  Hospital stays      0         31      60    51.7 31/6…
##  2 Cohort 1     Baseline  Hospital stays      1         19      60    31.7 19/6…
##  3 Cohort 1     Baseline  Hospital stays      2          9      60    15   9/60…
##  4 Cohort 1     Baseline  Hospital stays      3+         1      60     1.7 1/60…
##  5 Cohort 1     T1        IP utilization      0         46      60    76.7 46/6…
##  6 Cohort 1     T1        IP utilization      1         10      60    16.7 10/6…
##  7 Cohort 1     T1        IP utilization      2          4      60     6.7 4/60…
##  8 Cohort 1     T1        IP utilization      3+         0      60     0   0/60…
##  9 Cohort 1     T2        Hospital utilizati… 0         54      59    91.5 54/5…
## 10 Cohort 1     T2        Hospital utilizati… 1          5      59     8.5 5/59…
## 11 Cohort 1     T2        Hospital utilizati… 2          0      59     0   0/59…
## 12 Cohort 1     T2        Hospital utilizati… 3+         0      59     0   0/59…
## 13 Cohort 2     Baseline  Hospital stays      0          0      41     0   0/41…
## 14 Cohort 2     Baseline  Hospital stays      1         26      41    63.4 26/4…
## 15 Cohort 2     Baseline  Hospital stays      2         10      41    24.4 10/4…
## 16 Cohort 2     Baseline  Hospital stays      3+         5      41    12.2 5/41…
## 17 Cohort 2     T1        IP utilization      0         36      41    87.8 36/4…
## 18 Cohort 2     T1        IP utilization      1          5      41    12.2 5/41…
## 19 Cohort 2     T1        IP utilization      2          0      41     0   0/41…
## 20 Cohort 2     T1        IP utilization      3+         0      41     0   0/41…

# ============================================================
# 3) CHANGE CATEGORIES — Less / No change / More
#    ED: Baseline→T1 (C1 & C2), and T1→T2 (C1 only)
#    IP/Hosp: Baseline→T1 uses (hosp_base→ip_t1), and T1→T2 (ip_t1→hosp_t2) for C1
# ============================================================
make_change_tab <- function(d, before_col, after_col, cohort_name, metric, change_label) {
  dd <- d %>%
    transmute(
      before = parse_count(.data[[before_col]]),
      after  = parse_count(.data[[after_col]])
    ) %>%
    filter(!is.na(before) & !is.na(after)) %>%
    transmute(
      category = case_when(
        after < before ~ "Less",
        after > before ~ "More",
        TRUE ~ "No change"
      )
    ) %>%
    count(category, name = "n") %>%
    mutate(
      category = factor(category, levels = c("Less","No change","More")),
      N_valid = sum(n),
      percent = round(100 * n / N_valid, 1),
      label = paste0(n, "/", N_valid, " (", percent, "%)"),
      cohort_final = cohort_name,
      metric = metric,
      change = change_label
    ) %>%
    arrange(category)

  dd
}

d_c1 <- df_enr %>% filter(cohort_final == "Cohort 1")
d_c2 <- df_enr %>% filter(cohort_final == "Cohort 2")

util_change_tables <- bind_rows(
  # ED change
  make_change_tab(d_c1, col_ed_base, col_ed_t1, "Cohort 1", "ED", "Baseline → T1"),
  make_change_tab(d_c2, col_ed_base, col_ed_t1, "Cohort 2", "ED", "Baseline → T1"),
  make_change_tab(d_c1, col_ed_t1,  col_ed_t2, "Cohort 1", "ED", "T1 → T2"),

  # IP/Hosp change (note: baseline uses hosp_last_90_days; T1 uses ip_utilization_t1; T2 uses hospital_utilization_t2)
  make_change_tab(d_c1, col_hosp_base, col_ip_t1, "Cohort 1", "IP/Hosp", "Baseline → T1"),
  make_change_tab(d_c2, col_hosp_base, col_ip_t1, "Cohort 2", "IP/Hosp", "Baseline → T1"),
  make_change_tab(d_c1, col_ip_t1,    col_hosp_t2, "Cohort 1", "IP/Hosp", "T1 → T2")
) %>%
  select(cohort_final, metric, change, category, n, N_valid, percent, label)

util_change_tables

## # A tibble: 17 × 8
##    cohort_final metric  change        category      n N_valid percent label     
##    <chr>        <chr>   <chr>         <fct>     <int>   <int>   <dbl> <chr>     
##  1 Cohort 1     ED      Baseline → T1 Less         25      60    41.7 25/60 (41…
##  2 Cohort 1     ED      Baseline → T1 No change    27      60    45   27/60 (45…
##  3 Cohort 1     ED      Baseline → T1 More          8      60    13.3 8/60 (13.…
##  4 Cohort 2     ED      Baseline → T1 Less         33      41    80.5 33/41 (80…
##  5 Cohort 2     ED      Baseline → T1 No change     7      41    17.1 7/41 (17.…
##  6 Cohort 2     ED      Baseline → T1 More          1      41     2.4 1/41 (2.4…
##  7 Cohort 1     ED      T1 → T2       Less         15      58    25.9 15/58 (25…
##  8 Cohort 1     ED      T1 → T2       No change    30      58    51.7 30/58 (51…
##  9 Cohort 1     ED      T1 → T2       More         13      58    22.4 13/58 (22…
## 10 Cohort 1     IP/Hosp Baseline → T1 Less         24      60    40   24/60 (40…
## 11 Cohort 1     IP/Hosp Baseline → T1 No change    30      60    50   30/60 (50…
## 12 Cohort 1     IP/Hosp Baseline → T1 More          6      60    10   6/60 (10%)
## 13 Cohort 2     IP/Hosp Baseline → T1 Less         38      41    92.7 38/41 (92…
## 14 Cohort 2     IP/Hosp Baseline → T1 No change     3      41     7.3 3/41 (7.3…
## 15 Cohort 1     IP/Hosp T1 → T2       Less         12      59    20.3 12/59 (20…
## 16 Cohort 1     IP/Hosp T1 → T2       No change    44      59    74.6 44/59 (74…
## 17 Cohort 1     IP/Hosp T1 → T2       More          3      59     5.1 3/59 (5.1…

# ============================================================
# PRIMARY OUTCOMES: UTILIZATION (ENROLLED ONLY, NO COHORT SPLITS)
# Verifies the slide boxes:
#   ED Util Change: Baseline -> T1 (Less / No change / More) with n + %
#   IP Util Change: Baseline -> T1 (Less / No change / More) with n + %
# And summary line:
#   "T2 enrolled: X% had zero inpatient stays (vs Y% at T1)"
# ============================================================

library(dplyr)
library(stringr)
library(tidyr)

stopifnot(exists("df_status"))
stopifnot("enrolled_group" %in% names(df_status))

# ---- columns (from your setup) ----
col_ed_base   <- "ed_vis_last_90_days"
col_ed_t1     <- "ed_utilization_t1"

col_ip_base   <- "hosp_last_90_days"          # baseline inpatient proxy in your sheet
col_ip_t1     <- "ip_utilization_t1"
col_ip_t2     <- "hospital_utilization_t2"   # this is what your slide calls "T2 enrolled: zero inpatient stays"

need <- c(col_ed_base, col_ed_t1, col_ip_base, col_ip_t1, col_ip_t2)
miss <- setdiff(need, names(df_status))
if (length(miss) > 0) stop("Missing columns:\n  - ", paste(miss, collapse = "\n  - "))

# ---- reuse your parser if it exists; otherwise define once ----
if (!exists("parse_count")) {
  parse_count <- function(x) {
    s <- str_squish(as.character(x))
    s[s == ""] <- NA_character_
    s[str_to_upper(s) %in% c("NA","N/A","NULL")] <- NA_character_
    suppressWarnings(as.numeric(str_replace_all(s, "[^0-9\\-\\.]", "")))
  }
}

# ---- enrolled-only analysis df (does not touch df_status) ----
df_enrolled <- df_status %>% filter(enrolled_group == "Enrolled")

# ============================================================
# A) Change-table helper (Less / No change / More) + exact N
# ============================================================
make_change_boxes <- function(df, before_col, after_col) {
  dd <- df %>%
    transmute(
      before = parse_count(.data[[before_col]]),
      after  = parse_count(.data[[after_col]])
    ) %>%
    filter(!is.na(before) & !is.na(after)) %>%
    mutate(
      category = case_when(
        after < before ~ "Less",
        after > before ~ "More",
        TRUE           ~ "No change"
      )
    )

  N <- nrow(dd)

  out <- dd %>%
    count(category, name = "n") %>%
    mutate(
      N = N,
      percent = round(100 * n / N, 1),
      label = paste0(percent, "% (n=", n, ")"),
      category = factor(category, levels = c("Less","No change","More"))
    ) %>%
    arrange(category)

  list(N = N, tab = out)
}

# ---- ED boxes: Baseline -> T1 ----
ed_boxes <- make_change_boxes(df_enrolled, col_ed_base, col_ed_t1)
ed_boxes$N

## [1] 101

ed_boxes$tab

## # A tibble: 3 × 5
##   category      n     N percent label       
##   <fct>     <int> <int>   <dbl> <chr>       
## 1 Less         58   101    57.4 57.4% (n=58)
## 2 No change    34   101    33.7 33.7% (n=34)
## 3 More          9   101     8.9 8.9% (n=9)

# ---- IP boxes: Baseline -> T1 ----
ip_boxes <- make_change_boxes(df_enrolled, col_ip_base, col_ip_t1)
ip_boxes$N

## [1] 101

ip_boxes$tab

## # A tibble: 3 × 5
##   category      n     N percent label       
##   <fct>     <int> <int>   <dbl> <chr>       
## 1 Less         62   101    61.4 61.4% (n=62)
## 2 No change    33   101    32.7 32.7% (n=33)
## 3 More          6   101     5.9 5.9% (n=6)

# ============================================================
# B) "T2 enrolled: % had zero inpatient stays (vs % at T1)"
# Here, "inpatient stays at T1" = ip_utilization_t1
# and "inpatient stays at T2" = hospital_utilization_t2
# Denominators are "valid answers" (non-missing numeric parses).
# ============================================================
pct_zero <- function(df, col) {
  v <- parse_count(df[[col]])
  v <- v[!is.na(v)]
  N <- length(v)
  n0 <- sum(v == 0)
  tibble(N_valid = N, n_zero = n0, percent_zero = round(100 * n0 / N, 1),
         label = paste0(round(100 * n0 / N, 1), "% (", n0, "/", N, ")"))
}

zero_t1 <- pct_zero(df_enrolled, col_ip_t1)
zero_t2 <- pct_zero(df_enrolled, col_ip_t2)

zero_t1

## # A tibble: 1 × 4
##   N_valid n_zero percent_zero label         
##     <int>  <int>        <dbl> <chr>         
## 1     101     82         81.2 81.2% (82/101)

zero_t2

## # A tibble: 1 × 4
##   N_valid n_zero percent_zero label        
##     <int>  <int>        <dbl> <chr>        
## 1      59     54         91.5 91.5% (54/59)

# If you want the exact sentence values:
summary_line <- tibble(
  t1 = zero_t1$label,
  t2 = zero_t2$label
)
summary_line

## # A tibble: 1 × 2
##   t1             t2           
##   <chr>          <chr>        
## 1 81.2% (82/101) 91.5% (54/59)

Cohort_Validation

2026-03-09