Cohort_Validation

df <- read_excel("FIM Data 2-26-2026 (DI).xlsx",
                 col_types = "text") %>%
  clean_names()

names(df)

##   [1] "pre_enrollment_notes"                                                                                                       
##   [2] "cohort"                                                                                                                     
##   [3] "enrolled"                                                                                                                   
##   [4] "age"                                                                                                                        
##   [5] "preferred_language"                                                                                                         
##   [6] "disch_date_time"                                                                                                            
##   [7] "cc"                                                                                                                         
##   [8] "admission_diagnosis"                                                                                                        
##   [9] "admitting_provider"                                                                                                         
##  [10] "discharge_disposition"                                                                                                      
##  [11] "lace_readmission_score_score_column"                                                                                        
##  [12] "unplanned_readmission_score"                                                                                                
##  [13] "primary_cvg"                                                                                                                
##  [14] "hb_a1c_value_last_3_months"                                                                                                 
##  [15] "pt_portal_status"                                                                                                           
##  [16] "hosp_last_365_days"                                                                                                         
##  [17] "ed_last_365_days"                                                                                                           
##  [18] "ed_vis_last_90_days"                                                                                                        
##  [19] "hosp_last_90_days"                                                                                                          
##  [20] "pcp"                                                                                                                        
##  [21] "race"                                                                                                                       
##  [22] "ethnicity"                                                                                                                  
##  [23] "legal_sex"                                                                                                                  
##  [24] "sdoh_high_risk_domains"                                                                                                     
##  [25] "zip_code"                                                                                                                   
##  [26] "prescreen_contacted_y_n"                                                                                                    
##  [27] "ps_in_the_past_12_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"                    
##  [28] "ps_in_the_past_12_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"       
##  [29] "ps_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"                                   
##  [30] "pre_screen_interested_in_program"                                                                                           
##  [31] "t0_contact"                                                                                                                 
##  [32] "screen_for_enrollment_interested_in_program"                                                                                
##  [33] "t0_are_you_likely_to_eat_the_food_as_part_of_your_normal_daily_routine"                                                     
##  [34] "t0_in_the_past_week_how_often_did_you_check_your_blood_sugar"                                                               
##  [35] "t0_are_you_confident_in_making_food_choices_that_help_control_your_blood_sugar"                                             
##  [36] "t0_are_you_confident_in_understanding_how_to_read_food_labels_and_nutrition_information"                                    
##  [37] "t0_were_your_sugar_levels_usually_in_your_target_range_last_week"                                                           
##  [38] "t0_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"                                                     
##  [39] "t0_since_your_discharge_from_the_hospital_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"         
##  [40] "t1_contact_y_n"                                                                                                             
##  [41] "t1_notes"                                                                                                                   
##  [42] "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"                                                
##  [43] "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"                                                       
##  [44] "if_yes_why_and_how_much_food_was_thrown_away_44"                                                                            
##  [45] "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"                                                          
##  [46] "t1_in_the_past_week_how_often_did_you_check_your_blood_sugar"                                                               
##  [47] "t1_were_your_sugar_levels_usually_in_your_target_range_last_week"                                                           
##  [48] "t1_did_you_have_any_symptoms_of_high_or_low_blood_sugar_in_the_past_week"                                                   
##  [49] "t1_in_the_past_week_did_you_miss_any_doses_of_you_diabetes_medications"                                                     
##  [50] "do_you_have_access_to_your_medications_escalate_as_needed_50"                                                               
##  [51] "t1_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"                        
##  [52] "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"
##  [53] "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"                                                
##  [54] "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"                      
##  [55] "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"                                         
##  [56] "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"                                             
##  [57] "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"                 
##  [58] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_58"                 
##  [59] "t1_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"                                              
##  [60] "t2_contact_y_n"                                                                                                             
##  [61] "t2_notes"                                                                                                                   
##  [62] "t2_in_the_past_6_months_were_you_ever_worried_your_food_would_run_out_before_you_had_money_to_buy_more"                     
##  [63] "t2_in_the_past_6_months_did_the_food_you_could_afford_not_last_until_the_end_of_the_month_and_you_couldn_t_get_more"        
##  [64] "t2_in_the_last_month_did_anyone_in_your_household_have_to_skip_meals_due_to_lack_of_food"                                   
##  [65] "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"                                                
##  [66] "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"                                                       
##  [67] "if_yes_why_and_how_much_food_was_thrown_away_67"                                                                            
##  [68] "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"                                                          
##  [69] "t2_in_the_past_week_how_often_did_you_check_your_blood_sugar"                                                               
##  [70] "do_you_have_access_to_your_medications_escalate_as_needed_70"                                                               
##  [71] "t2_were_your_sugar_levels_usually_in_your_target_range_last_week"                                                           
##  [72] "t2_in_the_past_week_did_you_have_any_symptoms_of_high_or_low_blood_sugar"                                                   
##  [73] "t2_in_the_past_week_did_you_miss_any_doses_of_your_diabetes_medication"                                                     
##  [74] "t2_since_our_last_check_in_have_you_had_an_appointment_with_a_healthcare_provider_for_your_diabetes"                        
##  [75] "t2_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_drinking_more_water"   
##  [76] "t2_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"                                                
##  [77] "t2_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"                      
##  [78] "t2_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"                                         
##  [79] "t2_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"                                             
##  [80] "t2_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_foods_you_typically_have_at_home"                
##  [81] "in_one_to_two_sentences_can_you_tell_me_what_feedback_you_have_that_can_improve_this_program_for_others_81"                 
##  [82] "t2_do_you_have_any_suggestions_or_feedback_to_improve_this_program_for_others"                                              
##  [83] "hemoglobin_a1c_8_1_25_10_2_2025"                                                                                            
##  [84] "hospital_utilization_8_1_25_10_2_25"                                                                                        
##  [85] "a1c_change_8_1_25_10_2_25"                                                                                                  
##  [86] "a1c_under_8_8_1_25_10_2_25"                                                                                                 
##  [87] "hemoglobin_a1c_11_20_2_20_t1"                                                                                               
##  [88] "ed_utilization_t1"                                                                                                          
##  [89] "ip_utilization_t1"                                                                                                          
##  [90] "a1c_change_t1"                                                                                                              
##  [91] "a1c_under_8_t1"                                                                                                             
##  [92] "additional_feedback"                                                                                                        
##  [93] "hemoglobin_a1c_t2"                                                                                                          
##  [94] "a1c_date_t2"                                                                                                                
##  [95] "ed_utilization_t2"                                                                                                          
##  [96] "hospital_utilization_t2"                                                                                                    
##  [97] "a1c_change_t2"                                                                                                              
##  [98] "a1c_under_8_t2"                                                                                                             
##  [99] "housing"                                                                                                                    
## [100] "housing_2"                                                                                                                  
## [101] "food"                                                                                                                       
## [102] "utilities"                                                                                                                  
## [103] "transport"                                                                                                                  
## [104] "finance"                                                                                                                    
## [105] "caregiver"                                                                                                                  
## [106] "legal"                                                                                                                      
## [107] "violence"                                                                                                                   
## [108] "employment"                                                                                                                 
## [109] "education"                                                                                                                  
## [110] "completed"

view(df)

# Confirm required columns exist
stopifnot("enrolled" %in% names(df))
stopifnot("cohort" %in% names(df))

df_clean <- df %>%
  mutate(
    enrolled_clean = str_to_upper(str_squish(as.character(enrolled))),
    cohort_clean   = str_squish(as.character(cohort)),
    cohort_num     = suppressWarnings(as.integer(str_extract(cohort_clean, "\\d+")))
  )

df_status <- df_clean %>%
  mutate(
    enrolled_group = case_when(
      enrolled_clean == "Y" ~ "Enrolled",
      TRUE ~ "Not enrolled"
    ),
    cohort_final = case_when(
      enrolled_group == "Enrolled" & cohort_num == 1 ~ "Cohort 1",
      enrolled_group == "Enrolled" & cohort_num == 2 ~ "Cohort 2",
      enrolled_group == "Enrolled" & cohort_num == 3 ~ "Cohort 3",
      TRUE ~ "Not enrolled"
    )
  )

cohort_counts <- df_status %>%
  count(cohort_final, name = "N") %>%
  arrange(cohort_final)

cohort_counts

## # A tibble: 4 × 2
##   cohort_final     N
##   <chr>        <int>
## 1 Cohort 1        60
## 2 Cohort 2        41
## 3 Cohort 3        35
## 4 Not enrolled   817

a1c_t1_col  <- "a1c_change_t1"
ed_t1_col   <- "ed_utilization_t1"
ip_t1_col   <- "ip_utilization_t1"

a1c_t2_col  <- "a1c_change_t2"
ed_t2_col   <- "ed_utilization_t2"
hosp_t2_col <- "hospital_utilization_t2"

# ----------------------------
# 7) Safety checks: chart-review columns exist
# ----------------------------
chart_cols <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)

missing_cols <- setdiff(chart_cols, names(df_status))
if (length(missing_cols) > 0) {
  stop(
    "These chart-review columns are missing from df_status:\n  - ",
    paste(missing_cols, collapse = "\n  - "),
    "\n\nRun: names(df_status) and update the *_col variables to match."
  )
}

# ----------------------------
# 8) Cohort subsets
# ----------------------------
df_c1 <- df_status %>% filter(cohort_final == "Cohort 1")
df_c2 <- df_status %>% filter(cohort_final == "Cohort 2")
df_c3 <- df_status %>% filter(cohort_final == "Cohort 3")

# ----------------------------
# 9) Completion helpers
#   - "nonblank" means not NA and not "" and not "NA"/"N/A"/"NULL"
# ----------------------------
is_nonblank <- function(x) {
  x_chr <- as.character(x)
  x_std <- str_to_upper(str_squish(x_chr))
  !(is.na(x_chr) | x_std == "" | x_std %in% c("NA", "N/A", "NULL"))
}

pct_complete <- function(df, column) {
  total <- nrow(df)
  if (total == 0) return("-")

  complete_n <- sum(is_nonblank(df[[column]]))
  pct <- round(100 * complete_n / total, 1)
  paste0(complete_n, "/", total, " (", pct, "%)")
}

pct_complete_all_required <- function(df, cols_required) {
  total <- nrow(df)
  if (total == 0) return("-")

  complete_n <- sum(apply(df[, cols_required, drop = FALSE], 1, function(row) {
    all(is_nonblank(row))
  }))
  pct <- round(100 * complete_n / total, 1)
  paste0(complete_n, "/", total, " (", pct, "%)")
}

# ----------------------------
# 10) Basic cohort size table
# ----------------------------
cohort_size_table <- tibble(
  Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
  N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3))
)

cohort_size_table

## # A tibble: 3 × 2
##   Cohort       N
##   <chr>    <int>
## 1 Cohort 1    60
## 2 Cohort 2    41
## 3 Cohort 3    35

# ----------------------------
# CHART REVIEW
# ----------------------------
c1_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col, a1c_t2_col, ed_t2_col, hosp_t2_col)
c2_required <- c(a1c_t1_col, ed_t1_col, ip_t1_col)

chart_review_table <- tibble(
  Cohort = c("Cohort 1", "Cohort 2", "Cohort 3"),
  N = c(nrow(df_c1), nrow(df_c2), nrow(df_c3)),

  # Field-level completeness 
  `A1c Change (T1)` = c(pct_complete(df_c1, a1c_t1_col), pct_complete(df_c2, a1c_t1_col), "-"),
  `ED Utilization (T1)` = c(pct_complete(df_c1, ed_t1_col), pct_complete(df_c2, ed_t1_col), "-"),
  `IP Utilization (T1)` = c(pct_complete(df_c1, ip_t1_col), pct_complete(df_c2, ip_t1_col), "-"),

  `A1c Change (T2)` = c(pct_complete(df_c1, a1c_t2_col), "-", "-"),
  `ED Utilization (T2)` = c(pct_complete(df_c1, ed_t2_col), "-", "-"),
  `Hospital Utilization (T2)` = c(pct_complete(df_c1, hosp_t2_col), "-", "-"),

  # Cohort-level "complete chart review"
  `Chart review complete (per cohort rules)` = c(
    pct_complete_all_required(df_c1, c1_required),
    pct_complete_all_required(df_c2, c2_required),
    "-"
  )
)

chart_review_table

## # A tibble: 3 × 9
##   Cohort       N `A1c Change (T1)` `ED Utilization (T1)` `IP Utilization (T1)`
##   <chr>    <int> <chr>             <chr>                 <chr>                
## 1 Cohort 1    60 29/60 (48.3%)     60/60 (100%)          60/60 (100%)         
## 2 Cohort 2    41 23/41 (56.1%)     41/41 (100%)          41/41 (100%)         
## 3 Cohort 3    35 -                 -                     -                    
## # ℹ 4 more variables: `A1c Change (T2)` <chr>, `ED Utilization (T2)` <chr>,
## #   `Hospital Utilization (T2)` <chr>,
## #   `Chart review complete (per cohort rules)` <chr>

# -------------------------
# A1c Change (T1): Cohort 1 + Cohort 2
# -------------------------
a1c_t1 <- df_status %>%
  filter(cohort_final %in% c("Cohort 1", "Cohort 2")) %>%
  transmute(
    delta = suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(.data[[a1c_t1_col]])), "[^0-9.\\-]", "")))
  ) %>%
  filter(!is.na(delta)) %>%
  mutate(
    Category = case_when(
      delta <= -1.0 ~ "Improved (≥1.0% ↓)",
      delta >=  1.0 ~ "Worsened (≥1.0% ↑)",
      TRUE          ~ "No meaningful change (<1.0%)"
    ),
    Category = factor(Category, levels = c("Improved (≥1.0% ↓)", "No meaningful change (<1.0%)", "Worsened (≥1.0% ↑)"))
  )

n_t1 <- nrow(a1c_t1)

a1c_t1_tab <- a1c_t1 %>%
  count(Category, name = "n") %>%
  mutate(
    pct = round(100 * n / sum(n), 1),
    label = paste0(pct, "%\n(n=", n, ")")
  )


# ---- small helper: parse numeric safely ----
parse_delta <- function(x) {
  suppressWarnings(as.numeric(str_replace_all(str_squish(as.character(x)), "[^0-9.\\-]", "")))
}

# ---- style settings (you asked for color) ----
col_improved <- "#2E7D32"   # green
col_nochange <- "#757575"   # gray
col_worsened <- "#C62828"   # red

make_a1c_plots <- function(df_in, delta_col, title_text) {

  d <- df_in %>%
    transmute(delta = parse_delta(.data[[delta_col]])) %>%
    filter(!is.na(delta)) %>%
    mutate(
      Category = case_when(
        delta <= -1.0 ~ "Improved (≥1.0%)",
        delta >=  1.0 ~ "Worsened (≥1.0%)",
        TRUE          ~ "No meaningful change"
      ),
      Category = factor(Category, levels = c("Improved (≥1.0%)", "No meaningful change", "Worsened (≥1.0%)"))
    )

  n <- nrow(d)

  tab <- d %>%
    count(Category, name = "n") %>%
    mutate(
      pct = round(100 * n / sum(n), 1),
      label = paste0(pct, "%  (n=", n, ")")
    )

  # 1) Colored category bar chart
  p_bar <- ggplot(tab, aes(x = Category, y = pct, fill = Category)) +
    geom_col(width = 0.72) +
    geom_text(aes(label = label), vjust = -0.35, size = 4) +
    scale_fill_manual(values = c(
      "Improved (≥1.0%)" = col_improved,
      "No meaningful change" = col_nochange,
      "Worsened (≥1.0%)" = col_worsened
    )) +
    scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    labs(title = paste0(title_text, " (n = ", n, ")"), x = NULL, y = "% of cohort") +
    theme_minimal(base_size = 13) +
    theme(
      plot.title = element_text(face = "bold", size = 16),
      axis.text.x = element_text(size = 12),
      legend.position = "none"
    )

  # 2) Replace histogram with: jitter + boxplot + threshold bands/lines
  p_dist <- ggplot(d, aes(x = "", y = delta)) +
    annotate("rect", xmin = -Inf, xmax = Inf, ymin = -1, ymax = 1, alpha = 0.10, fill = col_nochange) +
    geom_hline(yintercept = c(-1, 1), linetype = "dashed", linewidth = 0.7, color = "black") +
    geom_boxplot(width = 0.22, outlier.shape = NA, alpha = 0.35) +
    geom_jitter(width = 0.12, height = 0, alpha = 0.55, size = 1.6) +
    coord_flip() +
    labs(
      title = "A1c change distribution (each dot = 1 patient)",
      x = NULL,
      y = "A1c change (percentage points)"
    ) +
    theme_minimal(base_size = 13) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank()
    )

  list(tab = tab, bar = p_bar, dist = p_dist)
}

# =========================
# T1: Cohort 1 + Cohort 2
# =========================
df_a1c_t1 <- df_status %>% filter(cohort_final %in% c("Cohort 1", "Cohort 2"))
out_t1 <- make_a1c_plots(df_a1c_t1, a1c_t1_col, "A1C Change (T1): Cohort 1 + Cohort 2")
out_t1$tab

## # A tibble: 3 × 4
##   Category                 n   pct label        
##   <fct>                <int> <dbl> <chr>        
## 1 Improved (≥1.0%)        32  62.7 62.7%  (n=32)
## 2 No meaningful change    17  33.3 33.3%  (n=17)
## 3 Worsened (≥1.0%)         2   3.9 3.9%  (n=2)

out_t1$bar

# =========================
# T2: Cohort 1 only
# =========================
df_a1c_t2 <- df_status %>% filter(cohort_final == "Cohort 1")
out_t2 <- make_a1c_plots(df_a1c_t2, a1c_t2_col, "A1C Change (T2): Cohort 1")
out_t2$tab

## # A tibble: 3 × 4
##   Category                 n   pct label        
##   <fct>                <int> <dbl> <chr>        
## 1 Improved (≥1.0%)        19  63.3 63.3%  (n=19)
## 2 No meaningful change     9  30   30%  (n=9)   
## 3 Worsened (≥1.0%)         2   6.7 6.7%  (n=2)

out_t2$bar

# -------------------------
# Column names you specified
# -------------------------
col_a1c_all <- "hb_a1c_value_last_3_months"
col_a1c_t1  <- "hemoglobin_a1c_11_20_2_20_t1"
col_a1c_t2  <- "hemoglobin_a1c_t2"

need_cols <- c(col_a1c_all, col_a1c_t1, col_a1c_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ", paste(missing, collapse = "\n  - "),
       "\n\nCheck names(df_status) and fix the col_a1c_* variables above.")
}

# -------------------------
# Clean HbA1c values → numeric
# -------------------------
clean_a1c <- function(x) {
  x <- str_squish(as.character(x))
  x[x %in% c("", "NA", "N/A", "na", "n/a", "NULL", "null")] <- NA_character_
  x[x == "<4.0"]  <- "4.0"
  x[x == ">14.0"] <- "14.0"
  x[x == ">20.0"] <- "20.0"
  x <- gsub("%", "", x)
  suppressWarnings(as.numeric(x))
}

# -------------------------
# Build long dataset (3 panels)
# -------------------------
a1c_long <- df_status %>%
  transmute(
    id = row_number(),
    A1c_All = clean_a1c(.data[[col_a1c_all]]),
    A1c_T1  = clean_a1c(.data[[col_a1c_t1]]),
    A1c_T2  = clean_a1c(.data[[col_a1c_t2]])
  ) %>%
  pivot_longer(cols = c(A1c_All, A1c_T1, A1c_T2),
               names_to = "Timepoint",
               values_to = "HbA1c") %>%
  filter(!is.na(HbA1c)) %>%
  mutate(
    Timepoint = recode(Timepoint,
                       A1c_All = "All patients (chart review HbA1c)",
                       A1c_T1  = "T1 HbA1c",
                       A1c_T2  = "T2 HbA1c"),
    Timepoint = factor(Timepoint,
                       levels = c("All patients (chart review HbA1c)", "T1 HbA1c", "T2 HbA1c")),
    Hb_bin = floor(HbA1c),
    Hb_mid = Hb_bin + 0.5
  )

# -------------------------
# Summarize into 1.0%-wide bins and label percentages
# -------------------------
a1c_hist <- a1c_long %>%
  count(Timepoint, Hb_bin, Hb_mid, name = "n") %>%
  group_by(Timepoint) %>%
  mutate(
    pct = round(100 * n / sum(n), 1),
    label = ifelse(pct > 0, paste0(pct, "%"), ""),
    N = sum(n)
  ) %>%
  ungroup()

# For facet labels with n
facet_labs <- setNames(
  paste0(levels(a1c_hist$Timepoint), " (n = ", tapply(a1c_long$HbA1c, a1c_long$Timepoint, length), ")"),
  levels(a1c_hist$Timepoint)
)

# Determine x-range
min_bin <- min(a1c_hist$Hb_bin, na.rm = TRUE)
max_bin <- max(a1c_hist$Hb_bin, na.rm = TRUE)

# -------------------------
# Plot (3 stacked panels)
# -------------------------
ggplot(a1c_hist, aes(x = Hb_mid, y = pct)) +
  geom_col(width = 0.9, fill = "#4E79A7", color = "white") +
  geom_text(aes(label = label), vjust = -0.35, size = 3.4) +
  facet_wrap(
    ~ Timepoint,
    ncol = 1,
    labeller = labeller(Timepoint = facet_labs)
  ) +
  scale_x_continuous(
    breaks = seq(min_bin, max_bin, by = 1),
    minor_breaks = NULL
  ) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  coord_cartesian(ylim = c(0, max(a1c_hist$pct, na.rm = TRUE) + 10)) +
  theme_bw() +
  labs(
    title = "HbA1c Distributions (All vs T1 vs T2)",
    x = "HbA1c (%)",
    y = "% within timepoint"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    strip.text = element_text(face = "bold", size = 12),
    axis.text  = element_text(size = 11)
  )

# -------------------------
# 1) COLUMN NAMES (EDIT THESE 5 to match names(df_status))
# -------------------------
col_ed_base  <- "ed_vis_last_90_days"
col_ed_t1    <- "ed_utilization_t1"
col_ed_t2    <- "ed_utilization_t2"

col_ip_base  <- "hosp_last_90_days"          # <-- replace with your true IP baseline column (NOT hosp)
col_ip_t1    <- "ip_utilization_t1"
col_ip_t2  <- "hospital_utilization_t2"


# -------------------------
# 2) Helpers
# -------------------------
parse_count <- function(x) {
  x <- as.character(x)
  x <- str_squish(x)
  x[x == ""] <- NA_character_
  x[str_to_upper(x) %in% c("NA","N/A","NULL")] <- NA_character_
  suppressWarnings(as.numeric(str_replace_all(x, "[^0-9\\-\\.]", "")))
}

make_bucket <- function(x, cap = 5) {
  bx <- ifelse(x >= cap, cap, as.integer(x))
  ifelse(bx == cap, paste0(cap, "+"), as.character(bx))
}

# -------------------------
# 3) Safety checks
# -------------------------
need_cols <- c(col_ed_base, col_ed_t1, col_ed_t2,
               col_ip_base, col_ip_t1, col_ip_t2,
               "enrolled_group")
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ", paste(missing, collapse = "\n  - "),
       "\n\nFix the col_* names above using names(df_status).")
}

# -------------------------
# 4) Colors (simple + clear)
# -------------------------
col_less <- "#2E7D32"   # green
col_same <- "#757575"   # gray
col_more <- "#C62828"   # red

col_zero <- "#546E7A"   # blue-gray
col_onep <- "#8E24AA"   # purple

make_dist_df <- function(df_status, base_col, t1_col, t2_col, cap = 5, domain_label = "ED") {

  d <- df_status %>%
    transmute(
      group = if_else(enrolled_group == "Enrolled", "Enrolled", "All patients"),
      base = parse_count(.data[[base_col]]),
      t1   = parse_count(.data[[t1_col]]),
      t2   = parse_count(.data[[t2_col]])
    ) %>%
    pivot_longer(cols = c(base, t1, t2), names_to = "timepoint", values_to = "count") %>%
    filter(!is.na(count)) %>%
    mutate(
      timepoint = recode(timepoint,
                         base = "Baseline (last 90d)",
                         t1   = "T1",
                         t2   = "T2"),
      timepoint = factor(timepoint, levels = c("Baseline (last 90d)", "T1", "T2")),
      bucket = make_bucket(count, cap = cap),
      bucket = factor(bucket, levels = c(as.character(0:(cap-1)), paste0(cap, "+"))),
      group = factor(group, levels = c("All patients", "Enrolled")),
      domain = domain_label
    )

  d %>%
    count(domain, group, timepoint, bucket, name = "n") %>%
    tidyr::complete(domain, group, timepoint, bucket, fill = list(n = 0)) %>%  # ensures missing buckets show as 0
    group_by(domain, group, timepoint) %>%
    mutate(
      denom = sum(n),
      pct = ifelse(denom > 0, round(100 * n / denom, 1), 0),
      label = ifelse(n > 0, paste0(pct, "%"), "")
    ) %>%
    ungroup()
}

plot_dist <- function(tab, title_text, xlab) {
  ggplot(tab, aes(x = bucket, y = pct)) +
    geom_col(width = 0.8, fill = "#2E6F9E") +
    geom_text(aes(label = label), vjust = -0.25, size = 3.6) +
    facet_grid(group ~ timepoint) +
    scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    theme_bw() +
    labs(title = title_text, x = xlab, y = "% of patients within group") +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      strip.background = element_rect(fill = "grey90"),
      strip.text = element_text(face = "bold")
    )
}

# -------------------------
# 5) ED distribution plot (cap at 5+)
# -------------------------
ed_dist <- make_dist_df(df_status, col_ed_base, col_ed_t1, col_ed_t2, cap = 5, domain_label = "ED")
plot_dist(ed_dist, "ED Utilization Distribution: All patients vs Enrolled", "ED visits")

# -------------------------
# 6) IP distribution plot (cap at 3+)
# -------------------------
ip_dist <- make_dist_df(df_status, col_ip_base, col_ip_t1, col_ip_t2, cap = 3, domain_label = "IP")
plot_dist(ip_dist, "Inpatient Utilization Distribution: All patients vs Enrolled", "Inpatient stays")

# =========================
# 7) CHANGE PLOTS: Baseline → T1 (Enrolled only)
#     X-axis: Less visits / No change / More visits
# =========================

# Build enrolled paired cohort (must have baseline + T1 for each domain)
df_util_pairs <- df_status %>%
  filter(enrolled_group == "Enrolled") %>%
  transmute(
    ED_base = parse_count(.data[[col_ed_base]]),
    ED_T1   = parse_count(.data[[col_ed_t1]]),
    IP_base = parse_count(.data[[col_ip_base]]),
    IP_T1   = parse_count(.data[[col_ip_t1]])
  ) %>%
  filter(!is.na(ED_base) & !is.na(ED_T1),
         !is.na(IP_base) & !is.na(IP_T1))

n_pairs <- nrow(df_util_pairs)

make_change_tab <- function(df, base_var, t1_var) {
  df %>%
    transmute(
      Change = case_when(
        .data[[t1_var]] < .data[[base_var]] ~ "Less visits",
        .data[[t1_var]] > .data[[base_var]] ~ "More visits",
        TRUE                                 ~ "No change"
      )
    ) %>%
    count(Change, name = "n") %>%
    mutate(
      pct = round(100 * n / sum(n), 1),
      label = paste0(pct, "%\n(n=", n, ")"),
      Change = factor(Change, levels = c("Less visits", "No change", "More visits"))
    )
}

plot_change_bar <- function(tab, title_text) {
  ggplot(tab, aes(x = Change, y = pct, fill = Change)) +
    geom_col(width = 0.72) +
    geom_text(aes(label = label), vjust = -0.35, size = 4) +
    scale_fill_manual(values = c(
      "Less visits" = col_less,
      "No change"   = col_same,
      "More visits" = col_more
    )) +
    scale_y_continuous(limits = c(0, 100), labels = function(x) paste0(x, "%")) +
    labs(
      title = title_text,
      x = NULL,
      y = "% of enrolled cohort"
    ) +
    theme_bw() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      legend.position = "none"
    )
}

# ED change
ed_change_tab <- make_change_tab(df_util_pairs, "ED_base", "ED_T1")
p_ed_change <- plot_change_bar(
  ed_change_tab,
  paste0("ED Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)

# IP change
ip_change_tab <- make_change_tab(df_util_pairs, "IP_base", "IP_T1")
p_ip_change <- plot_change_bar(
  ip_change_tab,
  paste0("IP Utilization Change: Baseline \u2192 T1 (n = ", n_pairs, ")")
)

# show tables if you want
ed_change_tab

## # A tibble: 3 × 4
##   Change          n   pct label          
##   <fct>       <int> <dbl> <chr>          
## 1 Less visits    58  57.4 "57.4%\n(n=58)"
## 2 More visits     9   8.9 "8.9%\n(n=9)"  
## 3 No change      34  33.7 "33.7%\n(n=34)"

ip_change_tab

## # A tibble: 3 × 4
##   Change          n   pct label          
##   <fct>       <int> <dbl> <chr>          
## 1 Less visits    62  61.4 "61.4%\n(n=62)"
## 2 More visits     6   5.9 "5.9%\n(n=6)"  
## 3 No change      33  32.7 "32.7%\n(n=33)"

# stacked like your screenshot
p_ed_change / p_ip_change

# =========================
# DEMOGRAPHIC TABLE
# Columns (left->right):
# Category, Value, All Patients, Not enrolled, Enrolled, Cohort 1, Cohort 2, Cohort 3
# Combines multi-select responses that appear in different order
# =========================

# ---- A) Identify (or set) the demographic source columns ----
# If your columns are named differently, change ONLY these two lines.
race_col <- "race"
eth_col  <- "ethnicity"

stopifnot(race_col %in% names(df_status))
stopifnot(eth_col  %in% names(df_status))

# ---- B) Canonicalize multi-select answers (newline/;/, order-insensitive) ----
canon_multiselect <- function(x) {
  x <- as.character(x)
  x <- str_trim(x)
  x[x == ""] <- NA_character_

  sapply(x, function(val) {
    if (is.na(val)) return(NA_character_)
    parts <- unlist(str_split(val, "\\r?\\n|\\s*[;,]\\s*"))
    parts <- str_trim(parts)
    parts <- parts[parts != ""]
    if (length(parts) == 0) return(NA_character_)
    paste(sort(unique(parts)), collapse = "; ")
  }, USE.NAMES = FALSE)
}

# ---- C) Map Race codes to labels (keeps multi-select combined) ----
race_map <- c(
  "R1" = "American Indian / Alaska Native",
  "R2" = "Asian",
  "R3" = "Black or African-American",
  "R4" = "Native Hawaiian / Pacific Islander",
  "R5" = "White",
  "R9" = "Other",
  "S1" = "Patient Declined",
  "S2" = "Patient Unavailable",
  "S3" = "Not Applicable / Unknown"
)

clean_race_value <- function(x_vec) {
  x_vec <- canon_multiselect(x_vec)
  sapply(x_vec, function(s) {
    if (is.na(s)) return(NA_character_)
    items <- str_split(s, ";\\s*")[[1]]
    items <- str_trim(items)

    recoded <- vapply(items, function(it) {
      code <- str_extract(it, "^[A-Za-z]\\d+")
      if (!is.na(code) && code %in% names(race_map)) race_map[[code]] else it
    }, character(1))

    paste(sort(unique(recoded)), collapse = "; ")
  }, USE.NAMES = FALSE)
}

# ---- D) Map Ethnicity codes to labels (keeps multi-select combined if present) ----
clean_eth_value <- function(x_vec) {
  x_vec <- canon_multiselect(x_vec)
  sapply(x_vec, function(s) {
    if (is.na(s)) return(NA_character_)
    items <- str_split(s, ";\\s*")[[1]]
    items <- str_trim(items)

    recoded <- dplyr::case_when(
      str_detect(items, "E1") ~ "Spanish / Hispanic / Latino",
      str_detect(items, "E2") ~ "Not Spanish / Hispanic / Latino",
      str_detect(items, "S1") ~ "Patient Declined",
      str_detect(items, "S2") ~ "Patient Unavailable",
      TRUE ~ NA_character_
    )
    recoded <- recoded[!is.na(recoded)]
    if (length(recoded) == 0) return(NA_character_)
    paste(sort(unique(recoded)), collapse = "; ")
  }, USE.NAMES = FALSE)
}

# ---- E) Add cleaned demographic fields to df_status ----
df_status <- df_status %>%
  mutate(
    race_clean      = clean_race_value(.data[[race_col]]),
    ethnicity_clean = clean_eth_value(.data[[eth_col]])
  )

# ---- F) Formatter: n (percent) with denominator within each column/group ----
fmt_np <- function(n, denom) {
  if (is.na(n) || denom == 0) return("-")
  paste0(n, " (", sprintf("%.1f", 100 * n / denom), "%)")
}

# ---- G) Generic demo table builder for one variable ----
make_demo_table <- function(data, variable, label) {

  d <- data %>%
    mutate(Value = canon_multiselect(.data[[variable]])) %>%   # canonicalize again (safe)
    filter(!is.na(Value))

  # denominators (only among rows that answered this variable)
  denom_all <- nrow(d)
  denom_not <- nrow(filter(d, enrolled_group == "Not enrolled"))
  denom_enr <- nrow(filter(d, enrolled_group == "Enrolled"))
  denom_c1  <- nrow(filter(d, cohort_final == "Cohort 1"))
  denom_c2  <- nrow(filter(d, cohort_final == "Cohort 2"))
  denom_c3  <- nrow(filter(d, cohort_final == "Cohort 3"))

  # counts
  all_ct <- d %>% count(Value, name = "n_all")
  not_ct <- d %>% filter(enrolled_group == "Not enrolled") %>% count(Value, name = "n_not")
  enr_ct <- d %>% filter(enrolled_group == "Enrolled") %>% count(Value, name = "n_enr")
  c1_ct  <- d %>% filter(cohort_final == "Cohort 1") %>% count(Value, name = "n_c1")
  c2_ct  <- d %>% filter(cohort_final == "Cohort 2") %>% count(Value, name = "n_c2")
  c3_ct  <- d %>% filter(cohort_final == "Cohort 3") %>% count(Value, name = "n_c3")

  out <- all_ct %>%
    full_join(not_ct, by = "Value") %>%
    full_join(enr_ct, by = "Value") %>%
    full_join(c1_ct,  by = "Value") %>%
    full_join(c2_ct,  by = "Value") %>%
    full_join(c3_ct,  by = "Value") %>%
    mutate(
      Category = label,
      `All Patients` = mapply(fmt_np, n_all, denom_all),
      `Not enrolled` = mapply(fmt_np, n_not, denom_not),
      `Enrolled`     = mapply(fmt_np, n_enr, denom_enr),
      `Cohort 1`     = mapply(fmt_np, n_c1,  denom_c1),
      `Cohort 2`     = mapply(fmt_np, n_c2,  denom_c2),
      `Cohort 3`     = mapply(fmt_np, n_c3,  denom_c3)
    ) %>%
    select(Category, Value, `All Patients`, `Not enrolled`, `Enrolled`, `Cohort 1`, `Cohort 2`, `Cohort 3`) %>%
    arrange(Category, Value)

  out
}

# ---- H) Build tables and combine ----
race_table      <- make_demo_table(df_status, "race_clean", "Race")
ethnicity_table <- make_demo_table(df_status, "ethnicity_clean", "Ethnicity")

demographic_table <- bind_rows(race_table, ethnicity_table)

demographic_table

## # A tibble: 23 × 8
##    Category Value   `All Patients` `Not enrolled` Enrolled `Cohort 1` `Cohort 2`
##    <chr>    <chr>   <chr>          <chr>          <chr>    <chr>      <chr>     
##  1 Race     Americ… 4 (0.4%)       3 (0.4%)       1 (0.7%) 1 (1.7%)   -         
##  2 Race     Asian   29 (3.0%)      27 (3.3%)      2 (1.5%) -          -         
##  3 Race     Asian;… 1 (0.1%)       1 (0.1%)       -        -          -         
##  4 Race     Asian;… 1 (0.1%)       1 (0.1%)       -        -          -         
##  5 Race     Black … 234 (24.6%)    180 (22.0%)    54 (39.… 23 (38.3%) 14 (34.1%)
##  6 Race     Black … 1 (0.1%)       -              1 (0.7%) 1 (1.7%)   -         
##  7 Race     Black … 12 (1.3%)      11 (1.3%)      1 (0.7%) 1 (1.7%)   -         
##  8 Race     Black … 3 (0.3%)       2 (0.2%)       1 (0.7%) -          1 (2.4%)  
##  9 Race     Black … 2 (0.2%)       2 (0.2%)       -        -          -         
## 10 Race     Native… 2 (0.2%)       2 (0.2%)       -        -          -         
## # ℹ 13 more rows
## # ℹ 1 more variable: `Cohort 3` <chr>

# -------------------------
# 0) COLUMN NAMES (edit if needed)
# -------------------------
race_col <- "race"

sex_col  <- "legal_sex"
eth_col  <- "ethnicity"

needed_demo_cols <- c("enrolled_group", race_col, sex_col, eth_col)
missing_demo_cols <- setdiff(needed_demo_cols, names(df_status))
if (length(missing_demo_cols) > 0) {
  stop(
    "Missing demographic columns:\n  - ",
    paste(missing_demo_cols, collapse = "\n  - ")
  )
}

# -------------------------
# Helper for legend labels
# -------------------------
make_legend_labels <- function(df_long) {
  sizes <- df_long %>%
    distinct(id, Group) %>%
    count(Group, name = "N")

  setNames(
    paste0(sizes$Group, " (n = ", sizes$N, ")"),
    sizes$Group
  )
}

# =========================
# 1) RACE
# =========================

race_map <- c(
  "R1" = "American Indian / Alaska Native",
  "R2" = "Asian",
  "R3" = "Black or African American",
  "R4" = "Native Hawaiian / Pacific Islander",
  "R5" = "White",
  "R9" = "Other"
)

race_long <- df_status %>%
  transmute(
    id = row_number(),
    enrolled_group,
    Race_raw = str_squish(as.character(.data[[race_col]]))
  ) %>%
  filter(!is.na(Race_raw), Race_raw != "") %>%
  mutate(Race_raw = str_split(Race_raw, "\\r?\\n")) %>%
  unnest(Race_raw) %>%
  mutate(
    Race_code = str_extract(Race_raw, "^R\\d+"),
    Race_clean = unname(race_map[Race_code])
  ) %>%
  filter(!is.na(Race_clean))

race_tab <- bind_rows(
  race_long %>% mutate(Group = "All patients"),
  race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
  count(Group, Race_clean, name = "n") %>%
  group_by(Group) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup()

race_tab <- race_tab %>%
  mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))

race_legend_labels <- make_legend_labels(
  bind_rows(
    race_long %>% mutate(Group = "All patients"),
    race_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
  )
)

ggplot(race_tab, aes(x = Race_clean, y = percent, fill = Group)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(percent, "%")),
            position = position_dodge(width = 0.8),
            hjust = -0.1, size = 3.6) +
  coord_flip(clip = "off") +
  scale_fill_manual(values = c("All patients" = "#CFE1F2",
                               "Enrolled" = "#1F4E79"),
                    labels = race_legend_labels) +
  expand_limits(y = max(race_tab$percent, na.rm = TRUE) + 8) +
  theme_bw() +
  labs(title = "Race Distribution: Enrolled vs All Patients",
       x = NULL,
       y = "% within group",
       fill = "") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "top")

# =========================
# 2) LEGAL SEX
# =========================

sex_long <- df_status %>%
  transmute(
    id = row_number(),
    enrolled_group,
    Sex = str_squish(as.character(.data[[sex_col]]))
  ) %>%
  filter(!is.na(Sex), Sex != "")

sex_tab <- bind_rows(
  sex_long %>% mutate(Group = "All patients"),
  sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
  count(Group, Sex, name = "n") %>%
  group_by(Group) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))

sex_legend_labels <- make_legend_labels(
  bind_rows(
    sex_long %>% mutate(Group = "All patients"),
    sex_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
  )
)

ggplot(sex_tab, aes(x = Sex, y = percent, fill = Group)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(percent, "%")),
            position = position_dodge(width = 0.8),
            hjust = -0.1, size = 3.6) +
  coord_flip(clip = "off") +
  scale_fill_manual(values = c("All patients" = "#CFE1F2",
                               "Enrolled" = "#1F4E79"),
                    labels = sex_legend_labels) +
  expand_limits(y = max(sex_tab$percent, na.rm = TRUE) + 8) +
  theme_bw() +
  labs(title = "Legal Sex: Enrolled vs All Patients",
       x = NULL,
       y = "% within group",
       fill = "") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "top")

# =========================
# 3) ETHNICITY
# =========================

eth_long <- df_status %>%
  transmute(
    id = row_number(),
    enrolled_group,
    Eth_raw = str_squish(as.character(.data[[eth_col]])),
    Eth_clean = case_when(
      str_detect(Eth_raw, "E1") ~ "Spanish/Hispanic/Latino",
      str_detect(Eth_raw, "E2") ~ "Not Spanish/Hispanic/Latino",
      str_detect(Eth_raw, "S1") ~ "Patient declined",
      str_detect(Eth_raw, "S2") ~ "Patient unavailable",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(Eth_clean), Eth_clean != "")

eth_tab <- bind_rows(
  eth_long %>% mutate(Group = "All patients"),
  eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
) %>%
  count(Group, Eth_clean, name = "n") %>%
  group_by(Group) %>%
  mutate(percent = round(100 * n / sum(n), 1)) %>%
  ungroup() %>%
  mutate(Group = factor(Group, levels = c("All patients", "Enrolled")))

eth_legend_labels <- make_legend_labels(
  bind_rows(
    eth_long %>% mutate(Group = "All patients"),
    eth_long %>% filter(enrolled_group == "Enrolled") %>% mutate(Group = "Enrolled")
  )
)

ggplot(eth_tab, aes(x = Eth_clean, y = percent, fill = Group)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(aes(label = paste0(percent, "%")),
            position = position_dodge(width = 0.8),
            hjust = -0.1, size = 3.6) +
  coord_flip(clip = "off") +
  scale_fill_manual(values = c("All patients" = "#CFE1F2",
                               "Enrolled" = "#1F4E79"),
                    labels = eth_legend_labels) +
  expand_limits(y = max(eth_tab$percent, na.rm = TRUE) + 8) +
  theme_bw() +
  labs(title = "Ethnicity: Enrolled vs All Patients",
       x = NULL,
       y = "% within group",
       fill = "") +
  theme(plot.title = element_text(face = "bold"),
        legend.position = "top")

# =========================
# T1 SELF-MANAGEMENT (PAST WEEK)
# =========================


col_t1_manage <- "t1_have_you_done_anything_in_the_past_week_to_help_manage_your_diabetes_like_walking_portion_control_or_drinking_more_water"

# ---- Safety check ----
if (!col_t1_manage %in% names(df_status)) {
  stop("Column not found:\n  ", col_t1_manage,
       "\n\nCheck names(df_status) and update col_t1_manage.")
}

# ---- 1) Clean + categorize responses ----
# ---- 1) Clean + categorize responses (FIXED: blanks/not surveyed -> NA) ----
missing_tokens <- c(
  "", "NA", "N/A", "NULL", "NONE",
  "NOT SURVEYED", "NOT ASKED", "NOT APPLICABLE",
  "SKIPPED", "MISSING", "UNKNOWN"
)

df_manage <- df_status %>%
  transmute(
    raw = .data[[col_t1_manage]],
    raw_chr = as.character(raw),
    raw_std = str_to_upper(str_squish(raw_chr)),
    category = case_when(
      # TRUE missing (actual NA)
      is.na(raw_chr) ~ NA_character_,

      # empty/whitespace after squish
      raw_std == "" ~ NA_character_,

      # common placeholders meaning "not surveyed" / missing
      raw_std %in% missing_tokens ~ NA_character_,

      # if your system uses "-" or "." as blank, include them:
      raw_std %in% c("-", ".", "--") ~ NA_character_,

      # actual responses
      raw_std %in% c("N", "NO") ~ "No",
      raw_std %in% c("Y", "YES") ~ "Yes (unspecified)",
      str_detect(raw_std, "^DAILY") ~ "Daily",
      str_detect(raw_std, "OCCASION") ~ "Occasionally",

      TRUE ~ "Other / unclear"
    )
  )

# ---- 2) Summary table ----
manage_tab <- df_manage %>%
  filter(!is.na(category)) %>%
  count(category, name = "n") %>%
  mutate(
    percent = round(100 * n / sum(n), 1)
  ) %>%
  arrange(desc(n))

manage_tab

## # A tibble: 4 × 3
##   category              n percent
##   <chr>             <int>   <dbl>
## 1 Daily                22    55  
## 2 Occasionally         12    30  
## 3 Yes (unspecified)     5    12.5
## 4 No                    1     2.5

# ---- 3) Bar chart ----
ggplot(manage_tab, aes(x = reorder(category, percent), y = percent)) +
  geom_col(width = 0.7, fill = "#4E79A7") +
  geom_text(
    aes(label = paste0(percent, "% (n=", n, ")")),
    hjust = -0.05,
    size = 3.6
  ) +
  coord_flip(clip = "off") +
  expand_limits(y = max(manage_tab$percent, na.rm = TRUE) + 8) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  theme_bw() +
  labs(
    title = "T1: Diabetes Self-Management Actions (Past Week)",
    x = "",
    y = "% of respondents"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 11)
  )

col_help_choices   <- "t1_did_the_nutrition_material_you_got_from_white_plains_hospital_help_you_make_healthier_food_choices"
col_conf_food      <- "t1_how_confident_are_you_in_making_food_choices_that_help_control_your_blood_sugar"
col_conf_labels    <- "t1_how_confident_are_you_in_understanding_food_labels_or_nutrition_information"
col_conf_prepare   <- "t1_how_confident_are_you_in_your_ability_to_prepare_healthy_meals_with_the_food_you_typically_have_at_home"
col_help_control   <- "t1_how_much_did_this_program_help_you_feel_more_in_control_of_your_diabetes"

# CLEAN + STANDARDIZE LIKERT RESPONSES
# =========================
# =========================
# CLEANERS
# =========================

# A) Confidence-style questions (maps numeric + misspellings + MAYBE)
clean_confidence <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,

    # numeric scales (support both 1-4 and 1-5)
    s %in% c("1") ~ "Not confident",
    s %in% c("2") ~ "Somewhat not confident",
    s %in% c("3") ~ "Confident",
    s %in% c("4") ~ "Very confident",
    s %in% c("5") ~ "Very confident",   # if a stray 5 appears, bucket to Very confident

    # explicit text + common typos
    str_detect(s, "NOT CONFIDENT") ~ "Not confident",
    str_detect(s, "SOMEWHAT NOT CONFIDENT") ~ "Somewhat not confident",
    str_detect(s, "\\bMAYBE\\b") ~ "Maybe",
    s %in% c("CONFIDENT", "COFIDENT", "CONFIDEMT", "CONFIDNT", "COFIDENT") ~ "Confident",
    str_detect(s, "VERY CONFIDENT") ~ "Very confident",
    str_detect(s, "SOMEWHAT CONFIDENT") ~ "Somewhat confident",

    TRUE ~ "Other / unclear"
  )
}

# B) Program-help / impact-style questions (A lot / Somewhat / A little, etc.)
clean_impact <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,

    # numeric (if any)
    s %in% c("5") ~ "A lot",
    s %in% c("4") ~ "Somewhat",
    s %in% c("3") ~ "A little",
    s %in% c("2") ~ "A little",
    s %in% c("1") ~ "Not at all",

    # text variants (case + spacing)
    str_detect(s, "^A LOT") ~ "A lot",
    str_detect(s, "^SOMEWHAT") ~ "Somewhat",
    str_detect(s, "A LITTLE") ~ "A little",
    str_detect(s, "NOT AT ALL") ~ "Not at all",

    # yes/no (if it sneaks in)
    s %in% c("YES", "Y") ~ "Yes",
    s %in% c("NO", "N") ~ "No",

    TRUE ~ "Other / unclear"
  )
}
plot_t1_question <- function(df, col_name, title_text, cleaner_fun) {

  if (!col_name %in% names(df)) stop("Column not found: ", col_name)

  df_plot <- df %>%
    transmute(response = cleaner_fun(.data[[col_name]])) %>%
    filter(!is.na(response)) %>%
    count(response, name = "n") %>%
    mutate(percent = round(100 * n / sum(n), 1)) %>%
    arrange(percent)

  ggplot(df_plot, aes(x = reorder(response, percent), y = percent)) +
    geom_col(fill = "#1F4E79", width = 0.7) +
    geom_text(aes(label = paste0(percent, "% (n=", n, ")")),
              hjust = -0.05, size = 3.6) +
    coord_flip(clip = "off") +
    expand_limits(y = max(df_plot$percent, na.rm = TRUE) + 8) +
    scale_y_continuous(labels = function(x) paste0(x, "%")) +
    theme_bw() +
    labs(title = title_text, x = "", y = "% of respondents") +
    theme(plot.title = element_text(face = "bold", size = 14),
          axis.text = element_text(size = 11))
}

p1 <- plot_t1_question(df_status, col_help_choices,
                       "T1: Nutrition Material Helped Make Healthier Food Choices",
                       cleaner_fun = clean_impact)

p2 <- plot_t1_question(df_status, col_conf_food,
                       "T1: Confidence in Making Food Choices to Control A1C",
                       cleaner_fun = clean_confidence)

p3 <- plot_t1_question(df_status, col_conf_labels,
                       "T1: Confidence Understanding Food Labels / Nutrition Info",
                       cleaner_fun = clean_confidence)

p4 <- plot_t1_question(df_status, col_conf_prepare,
                       "T1: Confidence Preparing Healthy Meals at Home",
                       cleaner_fun = clean_confidence)



print(p1); print(p2); print(p3); print(p4)

impact_tab <- df_status %>%
  transmute(response = clean_impact(.data[[col_help_control]])) %>%
  filter(!is.na(response)) %>%
  count(response, name = "n") %>%
  mutate(
    percent = round(100 * n / sum(n), 1),
    label = paste0(response, "\n", percent, "% (n=", n, ")")
  )

ggplot(impact_tab, aes(x = "", y = n, fill = response)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  geom_text(
    aes(label = paste0(percent, "%")),
    position = position_stack(vjust = 0.5),
    size = 4,
    color = "white"
  ) +
  scale_fill_manual(
    values = c(
      "A lot"    = "#1F4E79",
      "Somewhat" = "#2E86C1",
      "A little" = "#5DADE2",
      "Yes"      = "#A9CCE3"
    )
  ) +
  theme_void() +
  labs(
    title = "T1: Program Helped Patient Feel More in Control of Diabetes",
    fill = ""
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    legend.position = "right"
  )

# =========================
# MEALS MADE USING PROGRAM FOOD — T1 + T2


# ---- EDIT if needed to match names(df_status) ----
col_meals_t1 <- "t1_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"
col_meals_t2 <- "t2_in_the_past_week_how_many_meals_did_you_make_using_food_from_the_program"

# ---- safety check ----
need_cols <- c(col_meals_t1, col_meals_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ", paste(missing, collapse = "\n  - "),
       "\n\nFix col_meals_t1 / col_meals_t2 using names(df_status).")
}

# ---- robust cleaner (works for BOTH T1 + T2) ----
clean_meals <- function(x) {

  s <- str_to_lower(str_squish(as.character(x)))

  # convert text numbers to digits
  s <- str_replace_all(s, c(
    "zero"  = "0",
    "one"   = "1",
    "two"   = "2",
    "three" = "3",
    "four"  = "4",
    "five"  = "5",
    "six"   = "6",
    "seven" = "7",
    "eight" = "8"
  ))

  case_when(
    # missing
    is.na(s) ~ NA_character_,
    s %in% c("", "na", "n/a", "null", "unsure of #", "unsure") ~ NA_character_,

    # ran out = 0
    str_detect(s, "ran out") ~ "0",

    # explicit 0 / 1
    str_detect(s, "^0$") ~ "0",
    str_detect(s, "^1$") ~ "1",

    # 2–3 bucket
    str_detect(s, "2-3") | str_detect(s, "2–3") | str_detect(s, "2 to 3") |
      str_detect(s, "^2$") | str_detect(s, "^3$") |
      str_detect(s, "2 a day") | str_detect(s, "3 a day") ~ "2–3",

    # 4–6 bucket
    str_detect(s, "4-6") | str_detect(s, "4 to 6") |
      str_detect(s, "4 to 5") |
      str_detect(s, "3-4") | str_detect(s, "3 to 4") |
      str_detect(s, "^4$") | str_detect(s, "^5$") | str_detect(s, "^6$") |
      str_detect(s, "5-6") ~ "4–6",

    # 7+
    str_detect(s, "7") | str_detect(s, "8") ~ "7+",

    TRUE ~ NA_character_
  )
}

# ---- plotting helper ----

plot_meals <- function(df, col_name, title_text, y_max = 60) {

  tab <- df %>%
    transmute(meals = clean_meals(.data[[col_name]])) %>%
    filter(!is.na(meals)) %>%
    count(meals, name = "n") %>%
    mutate(
      meals = factor(meals, levels = c("0", "1", "2–3", "4–6", "7+")),
      percent = round(100 * n / sum(n), 1),
      label = paste0(percent, "% (n=", n, ")")
    ) %>%
    arrange(meals)

  ggplot(tab, aes(x = meals, y = percent)) +
    geom_col(fill = "#1F4E79", width = 0.7) +
    geom_text(aes(label = label), vjust = -0.35, size = 4) +
    scale_y_continuous(
      limits = c(0, y_max),      # <-- FIXED Y LIMIT
      labels = function(x) paste0(x, "%")
    ) +
    theme_bw() +
    labs(
      title = title_text,
      x = "Number of meals",
      y = "% of respondents"
    ) +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      axis.text = element_text(size = 11)
    )
}
p_meals_t1 <- plot_meals(df_status, col_meals_t1,
                         "Meals Prepared Using Program Food (Past Week) — T1",
                         y_max = 60)

p_meals_t2 <- plot_meals(df_status, col_meals_t2,
                         "Meals Prepared Using Program Food (Past Week) — T2",
                         y_max = 60)

print(p_meals_t1)

print(p_meals_t2)

# =========================
# EASE OF USING PROGRAM FOOD IN MEALS (T1)
# =========================
# ---- EDIT to match names(df_status) if needed ----
col_ease_t1 <- "t1_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"
col_ease_t2 <- "t2_in_the_past_week_how_easy_was_it_to_use_the_food_in_your_meals"

if (!col_ease_t1 %in% names(df_status)) {
  stop("Column not found: ", col_ease_t1,
       "\n\nFind the right one with: names(df_status)[str_detect(names(df_status), 'easy')]")
}

# ---- cleaner ----
clean_ease <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,

    # numeric scale variants (adjust if your coding differs)
    s %in% c("1") ~ "Very easy",
    s %in% c("2") ~ "Easy",
    s %in% c("3") ~ "Neutral",

    # text variants
    str_detect(s, "VERY EASY") ~ "Very easy",
    s == "EASY" ~ "Easy",
    str_detect(s, "NEUTRAL") ~ "Neutral",

    TRUE ~ NA_character_
  )
}

ease_summary <- df_status %>%
  transmute(response = clean_ease(.data[[col_ease_t1]])) %>%
  filter(!is.na(response)) %>%
  count(response, name = "n") %>%
  mutate(
    response = factor(response, levels = c("Neutral", "Easy", "Very easy"), ordered = TRUE),
    percent = round(100 * n / sum(n), 1),
    label = paste0(percent, "% (n=", n, ")")
  ) %>%
  arrange(response)

ease_summary_t2 <- df_status %>%
  transmute(response = clean_ease(.data[[col_ease_t2]])) %>%
  filter(!is.na(response)) %>%
  count(response, name = "n") %>%
  mutate(
    response = factor(response,
                      levels = c("Neutral", "Easy", "Very easy"),
                      ordered = TRUE),
    percent = round(100 * n / sum(n), 1),
    label = paste0(percent, "% (n=", n, ")")
  ) %>%
  arrange(response)


ggplot(ease_summary, aes(x = response, y = percent)) +
  geom_col(fill = "#2E86C1", width = 0.7) +
  geom_text(aes(label = label), vjust = -0.35, size = 4) +
  scale_y_continuous(labels = function(x) paste0(x, "%"),
                     expand = expansion(mult = c(0, 0.12))) +
  theme_bw() +
  labs(
    title = "Ease of Using Program Food in Meals (T1)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    axis.text.x = element_text(size = 11),
    plot.title  = element_text(size = 14, face = "bold")
  )

ggplot(ease_summary_t2, aes(x = response, y = percent)) +
  geom_col(fill = "#2E86C1", width = 0.7) +
  geom_text(aes(label = label), vjust = -0.35, size = 4) +
  scale_y_continuous(
    limits = c(0, 60),   # adjust if needed to match T1
    labels = function(x) paste0(x, "%")
  ) +
  theme_bw() +
  labs(
    title = "Ease of Using Program Food in Meals (T2)",
    x = "",
    y = "% of participants"
  ) +
  theme(
    axis.text.x = element_text(size = 11),
    plot.title  = element_text(size = 14, face = "bold")
  )

# 1. Identify the T1 waste column explicitly
col_waste_t1 <- "t1_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"                                                       
col_waste_t2 <- "t2_did_you_throw_away_any_of_the_food_from_your_most_recent_delivery"  
# ---- Safety check ----
need_cols <- c(col_waste_t1, col_waste_t2)
missing <- setdiff(need_cols, names(df_status))
if (length(missing) > 0) {
  stop("Missing columns:\n  - ",
       paste(missing, collapse = "\n  - "),
       "\n\nRun names(df_status) and correct the column names above.")
}

# ---- Clean Yes / No properly (ignore blanks & non-surveyed) ----
clean_yesno <- function(x) {
  s <- str_to_upper(str_squish(as.character(x)))

  case_when(
    is.na(s) ~ NA_character_,
    s %in% c("", "NA", "N/A", "NULL") ~ NA_character_,
    s %in% c("YES", "Y") ~ "Yes",
    s %in% c("NO", "N")  ~ "No",
    TRUE ~ NA_character_
  )
}

# ---- Function to build summary + plot ----
plot_waste <- function(df, col_name, title_text) {

  tab <- df %>%
    transmute(Response = clean_yesno(.data[[col_name]])) %>%
    filter(!is.na(Response)) %>%
    count(Response, name = "n") %>%
    mutate(
      Response = factor(Response, levels = c("No", "Yes")),
      percent = round(100 * n / sum(n), 1),
      label = paste0(percent, "%\n(n=", n, ")")
    )

  ggplot(tab, aes(x = Response, y = percent, fill = Response)) +
    geom_col(width = 0.6) +
    geom_text(aes(label = label), vjust = -0.3, size = 4) +
    scale_fill_manual(values = c("No" = "#1a9850", "Yes" = "#d73027")) +
    scale_y_continuous(limits = c(0, 100),
                       labels = function(x) paste0(x, "%")) +
    theme_bw() +
    labs(
      title = title_text,
      x = "",
      y = "% of respondents"
    ) +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 14, face = "bold")
    )
}

# ---- Draw both graphs ----
p_waste_t1 <- plot_waste(df_status, col_waste_t1,
                         "Did Patients Throw Away Delivered Food? (T1)")

p_waste_t2 <- plot_waste(df_status, col_waste_t2,
                         "Did Patients Throw Away Delivered Food? (T2)")

print(p_waste_t1)

print(p_waste_t2)

Cohort_Validation

2026-03-02